1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl < %s | FileCheck %s 3 4target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" 5target triple = "x86_64-unknown-unknown" 6 7; Stack reload folding tests. 8; 9; By including a nop call with sideeffects we can force a partial register spill of the 10; relevant registers and check that the reload is correctly folded into the instruction. 11 12define <8 x half> @stack_fold_fmadd123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 13; CHECK-LABEL: stack_fold_fmadd123ph: 14; CHECK: # %bb.0: 15; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 16; CHECK-NEXT: #APP 17; CHECK-NEXT: nop 18; CHECK-NEXT: #NO_APP 19; CHECK-NEXT: vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 20; CHECK-NEXT: retq 21 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 22 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) 23 ret <8 x half> %2 24} 25declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>) 26 27define <8 x half> @stack_fold_fmadd213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 28; CHECK-LABEL: stack_fold_fmadd213ph: 29; CHECK: # %bb.0: 30; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 31; CHECK-NEXT: #APP 32; CHECK-NEXT: nop 33; CHECK-NEXT: #NO_APP 34; CHECK-NEXT: vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 35; CHECK-NEXT: retq 36 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 37 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %a2) 38 ret <8 x half> %2 39} 40 41define <8 x half> @stack_fold_fmadd231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 42; CHECK-LABEL: stack_fold_fmadd231ph: 43; CHECK: # %bb.0: 44; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 45; CHECK-NEXT: #APP 46; CHECK-NEXT: nop 47; CHECK-NEXT: #NO_APP 48; CHECK-NEXT: vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 49; CHECK-NEXT: retq 50 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 51 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %a0) 52 ret <8 x half> %2 53} 54 55define <8 x half> @stack_fold_fmadd321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 56; CHECK-LABEL: stack_fold_fmadd321ph: 57; CHECK: # %bb.0: 58; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 59; CHECK-NEXT: #APP 60; CHECK-NEXT: nop 61; CHECK-NEXT: #NO_APP 62; CHECK-NEXT: vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 63; CHECK-NEXT: retq 64 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 65 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %a0) 66 ret <8 x half> %2 67} 68 69define <8 x half> @stack_fold_fmadd132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 70; CHECK-LABEL: stack_fold_fmadd132ph: 71; CHECK: # %bb.0: 72; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 73; CHECK-NEXT: #APP 74; CHECK-NEXT: nop 75; CHECK-NEXT: #NO_APP 76; CHECK-NEXT: vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 77; CHECK-NEXT: retq 78 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 79 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %a1) 80 ret <8 x half> %2 81} 82 83define <8 x half> @stack_fold_fmadd312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 84; CHECK-LABEL: stack_fold_fmadd312ph: 85; CHECK: # %bb.0: 86; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 87; CHECK-NEXT: #APP 88; CHECK-NEXT: nop 89; CHECK-NEXT: #NO_APP 90; CHECK-NEXT: vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 91; CHECK-NEXT: retq 92 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 93 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %a1) 94 ret <8 x half> %2 95} 96 97define <8 x half> @stack_fold_fmadd123ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 98; CHECK-LABEL: stack_fold_fmadd123ph_mask: 99; CHECK: # %bb.0: 100; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 101; CHECK-NEXT: #APP 102; CHECK-NEXT: nop 103; CHECK-NEXT: #NO_APP 104; CHECK-NEXT: vmovaps (%rdi), %xmm2 105; CHECK-NEXT: kmovd %esi, %k1 106; CHECK-NEXT: vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 107; CHECK-NEXT: vmovaps %xmm2, %xmm0 108; CHECK-NEXT: retq 109 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 110 %a0 = load <8 x half>, ptr %p 111 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) 112 %3 = bitcast i8 %mask to <8 x i1> 113 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 114 ret <8 x half> %4 115} 116 117define <8 x half> @stack_fold_fmadd213ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 118; CHECK-LABEL: stack_fold_fmadd213ph_mask: 119; CHECK: # %bb.0: 120; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 121; CHECK-NEXT: #APP 122; CHECK-NEXT: nop 123; CHECK-NEXT: #NO_APP 124; CHECK-NEXT: vmovaps (%rdi), %xmm2 125; CHECK-NEXT: kmovd %esi, %k1 126; CHECK-NEXT: vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 127; CHECK-NEXT: vmovaps %xmm2, %xmm0 128; CHECK-NEXT: retq 129 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 130 %a0 = load <8 x half>, ptr %p 131 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %a2) 132 %3 = bitcast i8 %mask to <8 x i1> 133 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 134 ret <8 x half> %4 135} 136 137define <8 x half> @stack_fold_fmadd231ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 138; CHECK-LABEL: stack_fold_fmadd231ph_mask: 139; CHECK: # %bb.0: 140; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 141; CHECK-NEXT: #APP 142; CHECK-NEXT: nop 143; CHECK-NEXT: #NO_APP 144; CHECK-NEXT: vmovaps (%rdi), %xmm2 145; CHECK-NEXT: kmovd %esi, %k1 146; CHECK-NEXT: vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 147; CHECK-NEXT: vmovaps %xmm2, %xmm0 148; CHECK-NEXT: retq 149 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 150 %a0 = load <8 x half>, ptr %p 151 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %a0) 152 %3 = bitcast i8 %mask to <8 x i1> 153 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 154 ret <8 x half> %4 155} 156 157define <8 x half> @stack_fold_fmadd321ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 158; CHECK-LABEL: stack_fold_fmadd321ph_mask: 159; CHECK: # %bb.0: 160; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 161; CHECK-NEXT: #APP 162; CHECK-NEXT: nop 163; CHECK-NEXT: #NO_APP 164; CHECK-NEXT: vmovaps (%rdi), %xmm2 165; CHECK-NEXT: kmovd %esi, %k1 166; CHECK-NEXT: vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 167; CHECK-NEXT: vmovaps %xmm2, %xmm0 168; CHECK-NEXT: retq 169 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 170 %a0 = load <8 x half>, ptr %p 171 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %a0) 172 %3 = bitcast i8 %mask to <8 x i1> 173 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 174 ret <8 x half> %4 175} 176 177define <8 x half> @stack_fold_fmadd132ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 178; CHECK-LABEL: stack_fold_fmadd132ph_mask: 179; CHECK: # %bb.0: 180; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 181; CHECK-NEXT: #APP 182; CHECK-NEXT: nop 183; CHECK-NEXT: #NO_APP 184; CHECK-NEXT: vmovaps (%rdi), %xmm2 185; CHECK-NEXT: kmovd %esi, %k1 186; CHECK-NEXT: vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 187; CHECK-NEXT: vmovaps %xmm2, %xmm0 188; CHECK-NEXT: retq 189 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 190 %a0 = load <8 x half>, ptr %p 191 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %a1) 192 %3 = bitcast i8 %mask to <8 x i1> 193 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 194 ret <8 x half> %4 195} 196 197define <8 x half> @stack_fold_fmadd312ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 198; CHECK-LABEL: stack_fold_fmadd312ph_mask: 199; CHECK: # %bb.0: 200; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 201; CHECK-NEXT: #APP 202; CHECK-NEXT: nop 203; CHECK-NEXT: #NO_APP 204; CHECK-NEXT: vmovaps (%rdi), %xmm2 205; CHECK-NEXT: kmovd %esi, %k1 206; CHECK-NEXT: vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 207; CHECK-NEXT: vmovaps %xmm2, %xmm0 208; CHECK-NEXT: retq 209 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 210 %a0 = load <8 x half>, ptr %p 211 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %a1) 212 %3 = bitcast i8 %mask to <8 x i1> 213 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 214 ret <8 x half> %4 215} 216 217define <8 x half> @stack_fold_fmadd123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 218; CHECK-LABEL: stack_fold_fmadd123ph_maskz: 219; CHECK: # %bb.0: 220; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 221; CHECK-NEXT: #APP 222; CHECK-NEXT: nop 223; CHECK-NEXT: #NO_APP 224; CHECK-NEXT: kmovb (%rdi), %k1 225; CHECK-NEXT: vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 226; CHECK-NEXT: retq 227 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 228 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) 229 %3 = load i8, ptr %mask 230 %4 = bitcast i8 %3 to <8 x i1> 231 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 232 ret <8 x half> %5 233} 234 235define <8 x half> @stack_fold_fmadd213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 236; CHECK-LABEL: stack_fold_fmadd213ph_maskz: 237; CHECK: # %bb.0: 238; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 239; CHECK-NEXT: #APP 240; CHECK-NEXT: nop 241; CHECK-NEXT: #NO_APP 242; CHECK-NEXT: kmovb (%rdi), %k1 243; CHECK-NEXT: vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 244; CHECK-NEXT: retq 245 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 246 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %a2) 247 %3 = load i8, ptr %mask 248 %4 = bitcast i8 %3 to <8 x i1> 249 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 250 ret <8 x half> %5 251} 252 253define <8 x half> @stack_fold_fmadd231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 254; CHECK-LABEL: stack_fold_fmadd231ph_maskz: 255; CHECK: # %bb.0: 256; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 257; CHECK-NEXT: #APP 258; CHECK-NEXT: nop 259; CHECK-NEXT: #NO_APP 260; CHECK-NEXT: kmovb (%rdi), %k1 261; CHECK-NEXT: vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 262; CHECK-NEXT: retq 263 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 264 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %a0) 265 %3 = load i8, ptr %mask 266 %4 = bitcast i8 %3 to <8 x i1> 267 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 268 ret <8 x half> %5 269} 270 271define <8 x half> @stack_fold_fmadd321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 272; CHECK-LABEL: stack_fold_fmadd321ph_maskz: 273; CHECK: # %bb.0: 274; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 275; CHECK-NEXT: #APP 276; CHECK-NEXT: nop 277; CHECK-NEXT: #NO_APP 278; CHECK-NEXT: kmovb (%rdi), %k1 279; CHECK-NEXT: vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 280; CHECK-NEXT: retq 281 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 282 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %a0) 283 %3 = load i8, ptr %mask 284 %4 = bitcast i8 %3 to <8 x i1> 285 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 286 ret <8 x half> %5 287} 288 289define <8 x half> @stack_fold_fmadd132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 290; CHECK-LABEL: stack_fold_fmadd132ph_maskz: 291; CHECK: # %bb.0: 292; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 293; CHECK-NEXT: #APP 294; CHECK-NEXT: nop 295; CHECK-NEXT: #NO_APP 296; CHECK-NEXT: kmovb (%rdi), %k1 297; CHECK-NEXT: vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 298; CHECK-NEXT: retq 299 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 300 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %a1) 301 %3 = load i8, ptr %mask 302 %4 = bitcast i8 %3 to <8 x i1> 303 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 304 ret <8 x half> %5 305} 306 307define <8 x half> @stack_fold_fmadd312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 308; CHECK-LABEL: stack_fold_fmadd312ph_maskz: 309; CHECK: # %bb.0: 310; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 311; CHECK-NEXT: #APP 312; CHECK-NEXT: nop 313; CHECK-NEXT: #NO_APP 314; CHECK-NEXT: kmovb (%rdi), %k1 315; CHECK-NEXT: vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 316; CHECK-NEXT: retq 317 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 318 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %a1) 319 %3 = load i8, ptr %mask 320 %4 = bitcast i8 %3 to <8 x i1> 321 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 322 ret <8 x half> %5 323} 324 325define <8 x half> @stack_fold_fmsub123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 326; CHECK-LABEL: stack_fold_fmsub123ph: 327; CHECK: # %bb.0: 328; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 329; CHECK-NEXT: #APP 330; CHECK-NEXT: nop 331; CHECK-NEXT: #NO_APP 332; CHECK-NEXT: vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 333; CHECK-NEXT: retq 334 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 335 %2 = fneg <8 x half> %a2 336 %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %2) 337 ret <8 x half> %3 338} 339 340define <8 x half> @stack_fold_fmsub213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 341; CHECK-LABEL: stack_fold_fmsub213ph: 342; CHECK: # %bb.0: 343; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 344; CHECK-NEXT: #APP 345; CHECK-NEXT: nop 346; CHECK-NEXT: #NO_APP 347; CHECK-NEXT: vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 348; CHECK-NEXT: retq 349 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 350 %2 = fneg <8 x half> %a2 351 %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %2) 352 ret <8 x half> %3 353} 354 355define <8 x half> @stack_fold_fmsub231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 356; CHECK-LABEL: stack_fold_fmsub231ph: 357; CHECK: # %bb.0: 358; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 359; CHECK-NEXT: #APP 360; CHECK-NEXT: nop 361; CHECK-NEXT: #NO_APP 362; CHECK-NEXT: vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 363; CHECK-NEXT: retq 364 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 365 %2 = fneg <8 x half> %a0 366 %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %2) 367 ret <8 x half> %3 368} 369 370define <8 x half> @stack_fold_fmsub321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 371; CHECK-LABEL: stack_fold_fmsub321ph: 372; CHECK: # %bb.0: 373; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 374; CHECK-NEXT: #APP 375; CHECK-NEXT: nop 376; CHECK-NEXT: #NO_APP 377; CHECK-NEXT: vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 378; CHECK-NEXT: retq 379 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 380 %2 = fneg <8 x half> %a0 381 %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %2) 382 ret <8 x half> %3 383} 384 385define <8 x half> @stack_fold_fmsub132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 386; CHECK-LABEL: stack_fold_fmsub132ph: 387; CHECK: # %bb.0: 388; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 389; CHECK-NEXT: #APP 390; CHECK-NEXT: nop 391; CHECK-NEXT: #NO_APP 392; CHECK-NEXT: vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 393; CHECK-NEXT: retq 394 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 395 %2 = fneg <8 x half> %a1 396 %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %2) 397 ret <8 x half> %3 398} 399 400define <8 x half> @stack_fold_fmsub312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 401; CHECK-LABEL: stack_fold_fmsub312ph: 402; CHECK: # %bb.0: 403; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 404; CHECK-NEXT: #APP 405; CHECK-NEXT: nop 406; CHECK-NEXT: #NO_APP 407; CHECK-NEXT: vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 408; CHECK-NEXT: retq 409 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 410 %2 = fneg <8 x half> %a1 411 %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %2) 412 ret <8 x half> %3 413} 414 415define <8 x half> @stack_fold_fmsub123ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 416; CHECK-LABEL: stack_fold_fmsub123ph_mask: 417; CHECK: # %bb.0: 418; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 419; CHECK-NEXT: #APP 420; CHECK-NEXT: nop 421; CHECK-NEXT: #NO_APP 422; CHECK-NEXT: vmovaps (%rdi), %xmm2 423; CHECK-NEXT: kmovd %esi, %k1 424; CHECK-NEXT: vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 425; CHECK-NEXT: vmovaps %xmm2, %xmm0 426; CHECK-NEXT: retq 427 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 428 %a0 = load <8 x half>, ptr %p 429 %neg = fneg <8 x half> %a2 430 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %neg) 431 %3 = bitcast i8 %mask to <8 x i1> 432 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 433 ret <8 x half> %4 434} 435 436define <8 x half> @stack_fold_fmsub213ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 437; CHECK-LABEL: stack_fold_fmsub213ph_mask: 438; CHECK: # %bb.0: 439; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 440; CHECK-NEXT: #APP 441; CHECK-NEXT: nop 442; CHECK-NEXT: #NO_APP 443; CHECK-NEXT: vmovaps (%rdi), %xmm2 444; CHECK-NEXT: kmovd %esi, %k1 445; CHECK-NEXT: vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 446; CHECK-NEXT: vmovaps %xmm2, %xmm0 447; CHECK-NEXT: retq 448 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 449 %a0 = load <8 x half>, ptr %p 450 %neg = fneg <8 x half> %a2 451 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %neg) 452 %3 = bitcast i8 %mask to <8 x i1> 453 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 454 ret <8 x half> %4 455} 456 457define <8 x half> @stack_fold_fmsub231ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 458; CHECK-LABEL: stack_fold_fmsub231ph_mask: 459; CHECK: # %bb.0: 460; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 461; CHECK-NEXT: #APP 462; CHECK-NEXT: nop 463; CHECK-NEXT: #NO_APP 464; CHECK-NEXT: vmovaps (%rdi), %xmm2 465; CHECK-NEXT: kmovd %esi, %k1 466; CHECK-NEXT: vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 467; CHECK-NEXT: vmovaps %xmm2, %xmm0 468; CHECK-NEXT: retq 469 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 470 %a0 = load <8 x half>, ptr %p 471 %neg = fneg <8 x half> %a0 472 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %neg) 473 %3 = bitcast i8 %mask to <8 x i1> 474 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 475 ret <8 x half> %4 476} 477 478define <8 x half> @stack_fold_fmsub321ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 479; CHECK-LABEL: stack_fold_fmsub321ph_mask: 480; CHECK: # %bb.0: 481; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 482; CHECK-NEXT: #APP 483; CHECK-NEXT: nop 484; CHECK-NEXT: #NO_APP 485; CHECK-NEXT: vmovaps (%rdi), %xmm2 486; CHECK-NEXT: kmovd %esi, %k1 487; CHECK-NEXT: vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 488; CHECK-NEXT: vmovaps %xmm2, %xmm0 489; CHECK-NEXT: retq 490 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 491 %a0 = load <8 x half>, ptr %p 492 %neg = fneg <8 x half> %a0 493 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %neg) 494 %3 = bitcast i8 %mask to <8 x i1> 495 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 496 ret <8 x half> %4 497} 498 499define <8 x half> @stack_fold_fmsub132ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 500; CHECK-LABEL: stack_fold_fmsub132ph_mask: 501; CHECK: # %bb.0: 502; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 503; CHECK-NEXT: #APP 504; CHECK-NEXT: nop 505; CHECK-NEXT: #NO_APP 506; CHECK-NEXT: vmovaps (%rdi), %xmm2 507; CHECK-NEXT: kmovd %esi, %k1 508; CHECK-NEXT: vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 509; CHECK-NEXT: vmovaps %xmm2, %xmm0 510; CHECK-NEXT: retq 511 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 512 %a0 = load <8 x half>, ptr %p 513 %neg = fneg <8 x half> %a1 514 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %neg) 515 %3 = bitcast i8 %mask to <8 x i1> 516 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 517 ret <8 x half> %4 518} 519 520define <8 x half> @stack_fold_fmsub312ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 521; CHECK-LABEL: stack_fold_fmsub312ph_mask: 522; CHECK: # %bb.0: 523; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 524; CHECK-NEXT: #APP 525; CHECK-NEXT: nop 526; CHECK-NEXT: #NO_APP 527; CHECK-NEXT: vmovaps (%rdi), %xmm2 528; CHECK-NEXT: kmovd %esi, %k1 529; CHECK-NEXT: vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 530; CHECK-NEXT: vmovaps %xmm2, %xmm0 531; CHECK-NEXT: retq 532 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 533 %a0 = load <8 x half>, ptr %p 534 %neg = fneg <8 x half> %a1 535 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %neg) 536 %3 = bitcast i8 %mask to <8 x i1> 537 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 538 ret <8 x half> %4 539} 540 541define <8 x half> @stack_fold_fmsub123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 542; CHECK-LABEL: stack_fold_fmsub123ph_maskz: 543; CHECK: # %bb.0: 544; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 545; CHECK-NEXT: #APP 546; CHECK-NEXT: nop 547; CHECK-NEXT: #NO_APP 548; CHECK-NEXT: kmovb (%rdi), %k1 549; CHECK-NEXT: vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 550; CHECK-NEXT: retq 551 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 552 %neg = fneg <8 x half> %a2 553 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %neg) 554 %3 = load i8, ptr %mask 555 %4 = bitcast i8 %3 to <8 x i1> 556 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 557 ret <8 x half> %5 558} 559 560define <8 x half> @stack_fold_fmsub213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 561; CHECK-LABEL: stack_fold_fmsub213ph_maskz: 562; CHECK: # %bb.0: 563; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 564; CHECK-NEXT: #APP 565; CHECK-NEXT: nop 566; CHECK-NEXT: #NO_APP 567; CHECK-NEXT: kmovb (%rdi), %k1 568; CHECK-NEXT: vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 569; CHECK-NEXT: retq 570 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 571 %neg = fneg <8 x half> %a2 572 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %neg) 573 %3 = load i8, ptr %mask 574 %4 = bitcast i8 %3 to <8 x i1> 575 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 576 ret <8 x half> %5 577} 578 579define <8 x half> @stack_fold_fmsub231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 580; CHECK-LABEL: stack_fold_fmsub231ph_maskz: 581; CHECK: # %bb.0: 582; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 583; CHECK-NEXT: #APP 584; CHECK-NEXT: nop 585; CHECK-NEXT: #NO_APP 586; CHECK-NEXT: kmovb (%rdi), %k1 587; CHECK-NEXT: vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 588; CHECK-NEXT: retq 589 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 590 %neg = fneg <8 x half> %a0 591 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %neg) 592 %3 = load i8, ptr %mask 593 %4 = bitcast i8 %3 to <8 x i1> 594 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 595 ret <8 x half> %5 596} 597 598define <8 x half> @stack_fold_fmsub321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 599; CHECK-LABEL: stack_fold_fmsub321ph_maskz: 600; CHECK: # %bb.0: 601; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 602; CHECK-NEXT: #APP 603; CHECK-NEXT: nop 604; CHECK-NEXT: #NO_APP 605; CHECK-NEXT: kmovb (%rdi), %k1 606; CHECK-NEXT: vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 607; CHECK-NEXT: retq 608 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 609 %neg = fneg <8 x half> %a0 610 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %neg) 611 %3 = load i8, ptr %mask 612 %4 = bitcast i8 %3 to <8 x i1> 613 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 614 ret <8 x half> %5 615} 616 617define <8 x half> @stack_fold_fmsub132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 618; CHECK-LABEL: stack_fold_fmsub132ph_maskz: 619; CHECK: # %bb.0: 620; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 621; CHECK-NEXT: #APP 622; CHECK-NEXT: nop 623; CHECK-NEXT: #NO_APP 624; CHECK-NEXT: kmovb (%rdi), %k1 625; CHECK-NEXT: vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 626; CHECK-NEXT: retq 627 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 628 %neg = fneg <8 x half> %a1 629 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %neg) 630 %3 = load i8, ptr %mask 631 %4 = bitcast i8 %3 to <8 x i1> 632 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 633 ret <8 x half> %5 634} 635 636define <8 x half> @stack_fold_fmsub312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 637; CHECK-LABEL: stack_fold_fmsub312ph_maskz: 638; CHECK: # %bb.0: 639; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 640; CHECK-NEXT: #APP 641; CHECK-NEXT: nop 642; CHECK-NEXT: #NO_APP 643; CHECK-NEXT: kmovb (%rdi), %k1 644; CHECK-NEXT: vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 645; CHECK-NEXT: retq 646 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 647 %neg = fneg <8 x half> %a1 648 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %neg) 649 %3 = load i8, ptr %mask 650 %4 = bitcast i8 %3 to <8 x i1> 651 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 652 ret <8 x half> %5 653} 654 655define <8 x half> @stack_fold_fnmadd123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 656; CHECK-LABEL: stack_fold_fnmadd123ph: 657; CHECK: # %bb.0: 658; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 659; CHECK-NEXT: #APP 660; CHECK-NEXT: nop 661; CHECK-NEXT: #NO_APP 662; CHECK-NEXT: vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 663; CHECK-NEXT: retq 664 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 665 %2 = fneg <8 x half> %a0 666 %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a1, <8 x half> %a2) 667 ret <8 x half> %3 668} 669 670define <8 x half> @stack_fold_fnmadd213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 671; CHECK-LABEL: stack_fold_fnmadd213ph: 672; CHECK: # %bb.0: 673; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 674; CHECK-NEXT: #APP 675; CHECK-NEXT: nop 676; CHECK-NEXT: #NO_APP 677; CHECK-NEXT: vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 678; CHECK-NEXT: retq 679 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 680 %2 = fneg <8 x half> %a1 681 %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a0, <8 x half> %a2) 682 ret <8 x half> %3 683} 684 685define <8 x half> @stack_fold_fnmadd231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 686; CHECK-LABEL: stack_fold_fnmadd231ph: 687; CHECK: # %bb.0: 688; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 689; CHECK-NEXT: #APP 690; CHECK-NEXT: nop 691; CHECK-NEXT: #NO_APP 692; CHECK-NEXT: vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 693; CHECK-NEXT: retq 694 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 695 %2 = fneg <8 x half> %a1 696 %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a2, <8 x half> %a0) 697 ret <8 x half> %3 698} 699 700define <8 x half> @stack_fold_fnmadd321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 701; CHECK-LABEL: stack_fold_fnmadd321ph: 702; CHECK: # %bb.0: 703; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 704; CHECK-NEXT: #APP 705; CHECK-NEXT: nop 706; CHECK-NEXT: #NO_APP 707; CHECK-NEXT: vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 708; CHECK-NEXT: retq 709 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 710 %2 = fneg <8 x half> %a2 711 %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a1, <8 x half> %a0) 712 ret <8 x half> %3 713} 714 715define <8 x half> @stack_fold_fnmadd132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 716; CHECK-LABEL: stack_fold_fnmadd132ph: 717; CHECK: # %bb.0: 718; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 719; CHECK-NEXT: #APP 720; CHECK-NEXT: nop 721; CHECK-NEXT: #NO_APP 722; CHECK-NEXT: vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 723; CHECK-NEXT: retq 724 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 725 %2 = fneg <8 x half> %a0 726 %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a2, <8 x half> %a1) 727 ret <8 x half> %3 728} 729 730define <8 x half> @stack_fold_fnmadd312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 731; CHECK-LABEL: stack_fold_fnmadd312ph: 732; CHECK: # %bb.0: 733; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 734; CHECK-NEXT: #APP 735; CHECK-NEXT: nop 736; CHECK-NEXT: #NO_APP 737; CHECK-NEXT: vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 738; CHECK-NEXT: retq 739 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 740 %2 = fneg <8 x half> %a2 741 %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a0, <8 x half> %a1) 742 ret <8 x half> %3 743} 744 745define <8 x half> @stack_fold_fnmadd123ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 746; CHECK-LABEL: stack_fold_fnmadd123ph_mask: 747; CHECK: # %bb.0: 748; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 749; CHECK-NEXT: #APP 750; CHECK-NEXT: nop 751; CHECK-NEXT: #NO_APP 752; CHECK-NEXT: vmovaps (%rdi), %xmm2 753; CHECK-NEXT: kmovd %esi, %k1 754; CHECK-NEXT: vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 755; CHECK-NEXT: vmovaps %xmm2, %xmm0 756; CHECK-NEXT: retq 757 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 758 %a0 = load <8 x half>, ptr %p 759 %neg = fneg <8 x half> %a0 760 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a1, <8 x half> %a2) 761 %3 = bitcast i8 %mask to <8 x i1> 762 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 763 ret <8 x half> %4 764} 765 766define <8 x half> @stack_fold_fnmadd213ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 767; CHECK-LABEL: stack_fold_fnmadd213ph_mask: 768; CHECK: # %bb.0: 769; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 770; CHECK-NEXT: #APP 771; CHECK-NEXT: nop 772; CHECK-NEXT: #NO_APP 773; CHECK-NEXT: vmovaps (%rdi), %xmm2 774; CHECK-NEXT: kmovd %esi, %k1 775; CHECK-NEXT: vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 776; CHECK-NEXT: vmovaps %xmm2, %xmm0 777; CHECK-NEXT: retq 778 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 779 %a0 = load <8 x half>, ptr %p 780 %neg = fneg <8 x half> %a1 781 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a0, <8 x half> %a2) 782 %3 = bitcast i8 %mask to <8 x i1> 783 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 784 ret <8 x half> %4 785} 786 787define <8 x half> @stack_fold_fnmadd231ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 788; CHECK-LABEL: stack_fold_fnmadd231ph_mask: 789; CHECK: # %bb.0: 790; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 791; CHECK-NEXT: #APP 792; CHECK-NEXT: nop 793; CHECK-NEXT: #NO_APP 794; CHECK-NEXT: vmovaps (%rdi), %xmm2 795; CHECK-NEXT: kmovd %esi, %k1 796; CHECK-NEXT: vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 797; CHECK-NEXT: vmovaps %xmm2, %xmm0 798; CHECK-NEXT: retq 799 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 800 %a0 = load <8 x half>, ptr %p 801 %neg = fneg <8 x half> %a1 802 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a2, <8 x half> %a0) 803 %3 = bitcast i8 %mask to <8 x i1> 804 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 805 ret <8 x half> %4 806} 807 808define <8 x half> @stack_fold_fnmadd321ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 809; CHECK-LABEL: stack_fold_fnmadd321ph_mask: 810; CHECK: # %bb.0: 811; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 812; CHECK-NEXT: #APP 813; CHECK-NEXT: nop 814; CHECK-NEXT: #NO_APP 815; CHECK-NEXT: vmovaps (%rdi), %xmm2 816; CHECK-NEXT: kmovd %esi, %k1 817; CHECK-NEXT: vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 818; CHECK-NEXT: vmovaps %xmm2, %xmm0 819; CHECK-NEXT: retq 820 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 821 %a0 = load <8 x half>, ptr %p 822 %neg = fneg <8 x half> %a2 823 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a1, <8 x half> %a0) 824 %3 = bitcast i8 %mask to <8 x i1> 825 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 826 ret <8 x half> %4 827} 828 829define <8 x half> @stack_fold_fnmadd132ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 830; CHECK-LABEL: stack_fold_fnmadd132ph_mask: 831; CHECK: # %bb.0: 832; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 833; CHECK-NEXT: #APP 834; CHECK-NEXT: nop 835; CHECK-NEXT: #NO_APP 836; CHECK-NEXT: vmovaps (%rdi), %xmm2 837; CHECK-NEXT: kmovd %esi, %k1 838; CHECK-NEXT: vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 839; CHECK-NEXT: vmovaps %xmm2, %xmm0 840; CHECK-NEXT: retq 841 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 842 %a0 = load <8 x half>, ptr %p 843 %neg = fneg <8 x half> %a0 844 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a2, <8 x half> %a1) 845 %3 = bitcast i8 %mask to <8 x i1> 846 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 847 ret <8 x half> %4 848} 849 850define <8 x half> @stack_fold_fnmadd312ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 851; CHECK-LABEL: stack_fold_fnmadd312ph_mask: 852; CHECK: # %bb.0: 853; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 854; CHECK-NEXT: #APP 855; CHECK-NEXT: nop 856; CHECK-NEXT: #NO_APP 857; CHECK-NEXT: vmovaps (%rdi), %xmm2 858; CHECK-NEXT: kmovd %esi, %k1 859; CHECK-NEXT: vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 860; CHECK-NEXT: vmovaps %xmm2, %xmm0 861; CHECK-NEXT: retq 862 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 863 %a0 = load <8 x half>, ptr %p 864 %neg = fneg <8 x half> %a2 865 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a0, <8 x half> %a1) 866 %3 = bitcast i8 %mask to <8 x i1> 867 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 868 ret <8 x half> %4 869} 870 871define <8 x half> @stack_fold_fnmadd123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 872; CHECK-LABEL: stack_fold_fnmadd123ph_maskz: 873; CHECK: # %bb.0: 874; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 875; CHECK-NEXT: #APP 876; CHECK-NEXT: nop 877; CHECK-NEXT: #NO_APP 878; CHECK-NEXT: kmovb (%rdi), %k1 879; CHECK-NEXT: vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 880; CHECK-NEXT: retq 881 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 882 %neg = fneg <8 x half> %a0 883 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a1, <8 x half> %a2) 884 %3 = load i8, ptr %mask 885 %4 = bitcast i8 %3 to <8 x i1> 886 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 887 ret <8 x half> %5 888} 889 890define <8 x half> @stack_fold_fnmadd213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 891; CHECK-LABEL: stack_fold_fnmadd213ph_maskz: 892; CHECK: # %bb.0: 893; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 894; CHECK-NEXT: #APP 895; CHECK-NEXT: nop 896; CHECK-NEXT: #NO_APP 897; CHECK-NEXT: kmovb (%rdi), %k1 898; CHECK-NEXT: vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 899; CHECK-NEXT: retq 900 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 901 %neg = fneg <8 x half> %a1 902 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a0, <8 x half> %a2) 903 %3 = load i8, ptr %mask 904 %4 = bitcast i8 %3 to <8 x i1> 905 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 906 ret <8 x half> %5 907} 908 909define <8 x half> @stack_fold_fnmadd231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 910; CHECK-LABEL: stack_fold_fnmadd231ph_maskz: 911; CHECK: # %bb.0: 912; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 913; CHECK-NEXT: #APP 914; CHECK-NEXT: nop 915; CHECK-NEXT: #NO_APP 916; CHECK-NEXT: kmovb (%rdi), %k1 917; CHECK-NEXT: vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 918; CHECK-NEXT: retq 919 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 920 %neg = fneg <8 x half> %a1 921 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a2, <8 x half> %a0) 922 %3 = load i8, ptr %mask 923 %4 = bitcast i8 %3 to <8 x i1> 924 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 925 ret <8 x half> %5 926} 927 928define <8 x half> @stack_fold_fnmadd321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 929; CHECK-LABEL: stack_fold_fnmadd321ph_maskz: 930; CHECK: # %bb.0: 931; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 932; CHECK-NEXT: #APP 933; CHECK-NEXT: nop 934; CHECK-NEXT: #NO_APP 935; CHECK-NEXT: kmovb (%rdi), %k1 936; CHECK-NEXT: vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 937; CHECK-NEXT: retq 938 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 939 %neg = fneg <8 x half> %a2 940 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a1, <8 x half> %a0) 941 %3 = load i8, ptr %mask 942 %4 = bitcast i8 %3 to <8 x i1> 943 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 944 ret <8 x half> %5 945} 946 947define <8 x half> @stack_fold_fnmadd132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 948; CHECK-LABEL: stack_fold_fnmadd132ph_maskz: 949; CHECK: # %bb.0: 950; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 951; CHECK-NEXT: #APP 952; CHECK-NEXT: nop 953; CHECK-NEXT: #NO_APP 954; CHECK-NEXT: kmovb (%rdi), %k1 955; CHECK-NEXT: vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 956; CHECK-NEXT: retq 957 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 958 %neg = fneg <8 x half> %a0 959 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a2, <8 x half> %a1) 960 %3 = load i8, ptr %mask 961 %4 = bitcast i8 %3 to <8 x i1> 962 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 963 ret <8 x half> %5 964} 965 966define <8 x half> @stack_fold_fnmadd312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 967; CHECK-LABEL: stack_fold_fnmadd312ph_maskz: 968; CHECK: # %bb.0: 969; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 970; CHECK-NEXT: #APP 971; CHECK-NEXT: nop 972; CHECK-NEXT: #NO_APP 973; CHECK-NEXT: kmovb (%rdi), %k1 974; CHECK-NEXT: vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 975; CHECK-NEXT: retq 976 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 977 %neg = fneg <8 x half> %a2 978 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a0, <8 x half> %a1) 979 %3 = load i8, ptr %mask 980 %4 = bitcast i8 %3 to <8 x i1> 981 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 982 ret <8 x half> %5 983} 984 985define <8 x half> @stack_fold_fnmsub123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 986; CHECK-LABEL: stack_fold_fnmsub123ph: 987; CHECK: # %bb.0: 988; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 989; CHECK-NEXT: #APP 990; CHECK-NEXT: nop 991; CHECK-NEXT: #NO_APP 992; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 993; CHECK-NEXT: retq 994 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 995 %2 = fneg <8 x half> %a0 996 %3 = fneg <8 x half> %a2 997 %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a1, <8 x half> %3) 998 ret <8 x half> %4 999} 1000 1001define <8 x half> @stack_fold_fnmsub213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 1002; CHECK-LABEL: stack_fold_fnmsub213ph: 1003; CHECK: # %bb.0: 1004; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1005; CHECK-NEXT: #APP 1006; CHECK-NEXT: nop 1007; CHECK-NEXT: #NO_APP 1008; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1009; CHECK-NEXT: retq 1010 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1011 %2 = fneg <8 x half> %a1 1012 %3 = fneg <8 x half> %a2 1013 %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a0, <8 x half> %3) 1014 ret <8 x half> %4 1015} 1016 1017define <8 x half> @stack_fold_fnmsub231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 1018; CHECK-LABEL: stack_fold_fnmsub231ph: 1019; CHECK: # %bb.0: 1020; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1021; CHECK-NEXT: #APP 1022; CHECK-NEXT: nop 1023; CHECK-NEXT: #NO_APP 1024; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1025; CHECK-NEXT: retq 1026 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1027 %2 = fneg <8 x half> %a1 1028 %3 = fneg <8 x half> %a0 1029 %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a2, <8 x half> %3) 1030 ret <8 x half> %4 1031} 1032 1033define <8 x half> @stack_fold_fnmsub321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 1034; CHECK-LABEL: stack_fold_fnmsub321ph: 1035; CHECK: # %bb.0: 1036; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1037; CHECK-NEXT: #APP 1038; CHECK-NEXT: nop 1039; CHECK-NEXT: #NO_APP 1040; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1041; CHECK-NEXT: retq 1042 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1043 %2 = fneg <8 x half> %a2 1044 %3 = fneg <8 x half> %a0 1045 %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a1, <8 x half> %3) 1046 ret <8 x half> %4 1047} 1048 1049define <8 x half> @stack_fold_fnmsub132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 1050; CHECK-LABEL: stack_fold_fnmsub132ph: 1051; CHECK: # %bb.0: 1052; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1053; CHECK-NEXT: #APP 1054; CHECK-NEXT: nop 1055; CHECK-NEXT: #NO_APP 1056; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1057; CHECK-NEXT: retq 1058 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1059 %2 = fneg <8 x half> %a0 1060 %3 = fneg <8 x half> %a1 1061 %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a2, <8 x half> %3) 1062 ret <8 x half> %4 1063} 1064 1065define <8 x half> @stack_fold_fnmsub312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { 1066; CHECK-LABEL: stack_fold_fnmsub312ph: 1067; CHECK: # %bb.0: 1068; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1069; CHECK-NEXT: #APP 1070; CHECK-NEXT: nop 1071; CHECK-NEXT: #NO_APP 1072; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1073; CHECK-NEXT: retq 1074 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1075 %2 = fneg <8 x half> %a2 1076 %3 = fneg <8 x half> %a1 1077 %4 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a0, <8 x half> %3) 1078 ret <8 x half> %4 1079} 1080 1081define <8 x half> @stack_fold_fnmsub123ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 1082; CHECK-LABEL: stack_fold_fnmsub123ph_mask: 1083; CHECK: # %bb.0: 1084; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1085; CHECK-NEXT: #APP 1086; CHECK-NEXT: nop 1087; CHECK-NEXT: #NO_APP 1088; CHECK-NEXT: vmovaps (%rdi), %xmm2 1089; CHECK-NEXT: kmovd %esi, %k1 1090; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 1091; CHECK-NEXT: vmovaps %xmm2, %xmm0 1092; CHECK-NEXT: retq 1093 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1094 %a0 = load <8 x half>, ptr %p 1095 %neg = fneg <8 x half> %a2 1096 %neg1 = fneg <8 x half> %a0 1097 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a1, <8 x half> %neg) 1098 %3 = bitcast i8 %mask to <8 x i1> 1099 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 1100 ret <8 x half> %4 1101} 1102 1103define <8 x half> @stack_fold_fnmsub213ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 1104; CHECK-LABEL: stack_fold_fnmsub213ph_mask: 1105; CHECK: # %bb.0: 1106; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1107; CHECK-NEXT: #APP 1108; CHECK-NEXT: nop 1109; CHECK-NEXT: #NO_APP 1110; CHECK-NEXT: vmovaps (%rdi), %xmm2 1111; CHECK-NEXT: kmovd %esi, %k1 1112; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 1113; CHECK-NEXT: vmovaps %xmm2, %xmm0 1114; CHECK-NEXT: retq 1115 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1116 %a0 = load <8 x half>, ptr %p 1117 %neg = fneg <8 x half> %a2 1118 %neg1 = fneg <8 x half> %a1 1119 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a0, <8 x half> %neg) 1120 %3 = bitcast i8 %mask to <8 x i1> 1121 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 1122 ret <8 x half> %4 1123} 1124 1125define <8 x half> @stack_fold_fnmsub231ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 1126; CHECK-LABEL: stack_fold_fnmsub231ph_mask: 1127; CHECK: # %bb.0: 1128; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1129; CHECK-NEXT: #APP 1130; CHECK-NEXT: nop 1131; CHECK-NEXT: #NO_APP 1132; CHECK-NEXT: vmovaps (%rdi), %xmm2 1133; CHECK-NEXT: kmovd %esi, %k1 1134; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 1135; CHECK-NEXT: vmovaps %xmm2, %xmm0 1136; CHECK-NEXT: retq 1137 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1138 %a0 = load <8 x half>, ptr %p 1139 %neg = fneg <8 x half> %a0 1140 %neg1 = fneg <8 x half> %a1 1141 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a2, <8 x half> %neg) 1142 %3 = bitcast i8 %mask to <8 x i1> 1143 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 1144 ret <8 x half> %4 1145} 1146 1147define <8 x half> @stack_fold_fnmsub321ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 1148; CHECK-LABEL: stack_fold_fnmsub321ph_mask: 1149; CHECK: # %bb.0: 1150; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1151; CHECK-NEXT: #APP 1152; CHECK-NEXT: nop 1153; CHECK-NEXT: #NO_APP 1154; CHECK-NEXT: vmovaps (%rdi), %xmm2 1155; CHECK-NEXT: kmovd %esi, %k1 1156; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 1157; CHECK-NEXT: vmovaps %xmm2, %xmm0 1158; CHECK-NEXT: retq 1159 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1160 %a0 = load <8 x half>, ptr %p 1161 %neg = fneg <8 x half> %a0 1162 %neg1 = fneg <8 x half> %a2 1163 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a1, <8 x half> %neg) 1164 %3 = bitcast i8 %mask to <8 x i1> 1165 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 1166 ret <8 x half> %4 1167} 1168 1169define <8 x half> @stack_fold_fnmsub132ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 1170; CHECK-LABEL: stack_fold_fnmsub132ph_mask: 1171; CHECK: # %bb.0: 1172; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1173; CHECK-NEXT: #APP 1174; CHECK-NEXT: nop 1175; CHECK-NEXT: #NO_APP 1176; CHECK-NEXT: vmovaps (%rdi), %xmm2 1177; CHECK-NEXT: kmovd %esi, %k1 1178; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 1179; CHECK-NEXT: vmovaps %xmm2, %xmm0 1180; CHECK-NEXT: retq 1181 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1182 %a0 = load <8 x half>, ptr %p 1183 %neg = fneg <8 x half> %a1 1184 %neg1 = fneg <8 x half> %a0 1185 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a2, <8 x half> %neg) 1186 %3 = bitcast i8 %mask to <8 x i1> 1187 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 1188 ret <8 x half> %4 1189} 1190 1191define <8 x half> @stack_fold_fnmsub312ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) { 1192; CHECK-LABEL: stack_fold_fnmsub312ph_mask: 1193; CHECK: # %bb.0: 1194; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1195; CHECK-NEXT: #APP 1196; CHECK-NEXT: nop 1197; CHECK-NEXT: #NO_APP 1198; CHECK-NEXT: vmovaps (%rdi), %xmm2 1199; CHECK-NEXT: kmovd %esi, %k1 1200; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 1201; CHECK-NEXT: vmovaps %xmm2, %xmm0 1202; CHECK-NEXT: retq 1203 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1204 %a0 = load <8 x half>, ptr %p 1205 %neg = fneg <8 x half> %a1 1206 %neg1 = fneg <8 x half> %a2 1207 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a0, <8 x half> %neg) 1208 %3 = bitcast i8 %mask to <8 x i1> 1209 %4 = select <8 x i1> %3, <8 x half> %2, <8 x half> %a0 1210 ret <8 x half> %4 1211} 1212 1213define <8 x half> @stack_fold_fnmsub123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 1214; CHECK-LABEL: stack_fold_fnmsub123ph_maskz: 1215; CHECK: # %bb.0: 1216; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1217; CHECK-NEXT: #APP 1218; CHECK-NEXT: nop 1219; CHECK-NEXT: #NO_APP 1220; CHECK-NEXT: kmovb (%rdi), %k1 1221; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 1222; CHECK-NEXT: retq 1223 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1224 %neg = fneg <8 x half> %a2 1225 %neg1 = fneg <8 x half> %a0 1226 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a1, <8 x half> %neg) 1227 %3 = load i8, ptr %mask 1228 %4 = bitcast i8 %3 to <8 x i1> 1229 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 1230 ret <8 x half> %5 1231} 1232 1233define <8 x half> @stack_fold_fnmsub213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 1234; CHECK-LABEL: stack_fold_fnmsub213ph_maskz: 1235; CHECK: # %bb.0: 1236; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1237; CHECK-NEXT: #APP 1238; CHECK-NEXT: nop 1239; CHECK-NEXT: #NO_APP 1240; CHECK-NEXT: kmovb (%rdi), %k1 1241; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 1242; CHECK-NEXT: retq 1243 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1244 %neg = fneg <8 x half> %a2 1245 %neg1 = fneg <8 x half> %a1 1246 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a0, <8 x half> %neg) 1247 %3 = load i8, ptr %mask 1248 %4 = bitcast i8 %3 to <8 x i1> 1249 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 1250 ret <8 x half> %5 1251} 1252 1253define <8 x half> @stack_fold_fnmsub231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 1254; CHECK-LABEL: stack_fold_fnmsub231ph_maskz: 1255; CHECK: # %bb.0: 1256; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1257; CHECK-NEXT: #APP 1258; CHECK-NEXT: nop 1259; CHECK-NEXT: #NO_APP 1260; CHECK-NEXT: kmovb (%rdi), %k1 1261; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 1262; CHECK-NEXT: retq 1263 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1264 %neg = fneg <8 x half> %a0 1265 %neg1 = fneg <8 x half> %a1 1266 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a2, <8 x half> %neg) 1267 %3 = load i8, ptr %mask 1268 %4 = bitcast i8 %3 to <8 x i1> 1269 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 1270 ret <8 x half> %5 1271} 1272 1273define <8 x half> @stack_fold_fnmsub321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 1274; CHECK-LABEL: stack_fold_fnmsub321ph_maskz: 1275; CHECK: # %bb.0: 1276; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1277; CHECK-NEXT: #APP 1278; CHECK-NEXT: nop 1279; CHECK-NEXT: #NO_APP 1280; CHECK-NEXT: kmovb (%rdi), %k1 1281; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 1282; CHECK-NEXT: retq 1283 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1284 %neg = fneg <8 x half> %a0 1285 %neg1 = fneg <8 x half> %a2 1286 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a1, <8 x half> %neg) 1287 %3 = load i8, ptr %mask 1288 %4 = bitcast i8 %3 to <8 x i1> 1289 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 1290 ret <8 x half> %5 1291} 1292 1293define <8 x half> @stack_fold_fnmsub132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 1294; CHECK-LABEL: stack_fold_fnmsub132ph_maskz: 1295; CHECK: # %bb.0: 1296; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1297; CHECK-NEXT: #APP 1298; CHECK-NEXT: nop 1299; CHECK-NEXT: #NO_APP 1300; CHECK-NEXT: kmovb (%rdi), %k1 1301; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 1302; CHECK-NEXT: retq 1303 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1304 %neg = fneg <8 x half> %a1 1305 %neg1 = fneg <8 x half> %a0 1306 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a2, <8 x half> %neg) 1307 %3 = load i8, ptr %mask 1308 %4 = bitcast i8 %3 to <8 x i1> 1309 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 1310 ret <8 x half> %5 1311} 1312 1313define <8 x half> @stack_fold_fnmsub312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) { 1314; CHECK-LABEL: stack_fold_fnmsub312ph_maskz: 1315; CHECK: # %bb.0: 1316; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1317; CHECK-NEXT: #APP 1318; CHECK-NEXT: nop 1319; CHECK-NEXT: #NO_APP 1320; CHECK-NEXT: kmovb (%rdi), %k1 1321; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 1322; CHECK-NEXT: retq 1323 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1324 %neg = fneg <8 x half> %a1 1325 %neg1 = fneg <8 x half> %a2 1326 %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg1, <8 x half> %a0, <8 x half> %neg) 1327 %3 = load i8, ptr %mask 1328 %4 = bitcast i8 %3 to <8 x i1> 1329 %5 = select <8 x i1> %4, <8 x half> %2, <8 x half> zeroinitializer 1330 ret <8 x half> %5 1331} 1332 1333define <16 x half> @stack_fold_fmadd123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 1334; CHECK-LABEL: stack_fold_fmadd123ph_ymm: 1335; CHECK: # %bb.0: 1336; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1337; CHECK-NEXT: #APP 1338; CHECK-NEXT: nop 1339; CHECK-NEXT: #NO_APP 1340; CHECK-NEXT: vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1341; CHECK-NEXT: retq 1342 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1343 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) 1344 ret <16 x half> %2 1345} 1346declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>) 1347 1348define <16 x half> @stack_fold_fmadd213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 1349; CHECK-LABEL: stack_fold_fmadd213ph_ymm: 1350; CHECK: # %bb.0: 1351; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1352; CHECK-NEXT: #APP 1353; CHECK-NEXT: nop 1354; CHECK-NEXT: #NO_APP 1355; CHECK-NEXT: vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1356; CHECK-NEXT: retq 1357 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1358 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %a2) 1359 ret <16 x half> %2 1360} 1361 1362define <16 x half> @stack_fold_fmadd231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 1363; CHECK-LABEL: stack_fold_fmadd231ph_ymm: 1364; CHECK: # %bb.0: 1365; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1366; CHECK-NEXT: #APP 1367; CHECK-NEXT: nop 1368; CHECK-NEXT: #NO_APP 1369; CHECK-NEXT: vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1370; CHECK-NEXT: retq 1371 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1372 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %a0) 1373 ret <16 x half> %2 1374} 1375 1376define <16 x half> @stack_fold_fmadd321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 1377; CHECK-LABEL: stack_fold_fmadd321ph_ymm: 1378; CHECK: # %bb.0: 1379; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1380; CHECK-NEXT: #APP 1381; CHECK-NEXT: nop 1382; CHECK-NEXT: #NO_APP 1383; CHECK-NEXT: vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1384; CHECK-NEXT: retq 1385 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1386 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %a0) 1387 ret <16 x half> %2 1388} 1389 1390define <16 x half> @stack_fold_fmadd132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 1391; CHECK-LABEL: stack_fold_fmadd132ph_ymm: 1392; CHECK: # %bb.0: 1393; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1394; CHECK-NEXT: #APP 1395; CHECK-NEXT: nop 1396; CHECK-NEXT: #NO_APP 1397; CHECK-NEXT: vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1398; CHECK-NEXT: retq 1399 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1400 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %a1) 1401 ret <16 x half> %2 1402} 1403 1404define <16 x half> @stack_fold_fmadd312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 1405; CHECK-LABEL: stack_fold_fmadd312ph_ymm: 1406; CHECK: # %bb.0: 1407; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1408; CHECK-NEXT: #APP 1409; CHECK-NEXT: nop 1410; CHECK-NEXT: #NO_APP 1411; CHECK-NEXT: vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1412; CHECK-NEXT: retq 1413 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1414 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %a1) 1415 ret <16 x half> %2 1416} 1417 1418define <16 x half> @stack_fold_fmadd123ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 1419; CHECK-LABEL: stack_fold_fmadd123ph_mask_ymm: 1420; CHECK: # %bb.0: 1421; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1422; CHECK-NEXT: #APP 1423; CHECK-NEXT: nop 1424; CHECK-NEXT: #NO_APP 1425; CHECK-NEXT: vmovaps (%rdi), %ymm2 1426; CHECK-NEXT: kmovd %esi, %k1 1427; CHECK-NEXT: vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 1428; CHECK-NEXT: vmovaps %ymm2, %ymm0 1429; CHECK-NEXT: retq 1430 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1431 %a0 = load <16 x half>, ptr %p 1432 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) 1433 %3 = bitcast i16 %mask to <16 x i1> 1434 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 1435 ret <16 x half> %4 1436} 1437 1438define <16 x half> @stack_fold_fmadd213ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 1439; CHECK-LABEL: stack_fold_fmadd213ph_mask_ymm: 1440; CHECK: # %bb.0: 1441; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1442; CHECK-NEXT: #APP 1443; CHECK-NEXT: nop 1444; CHECK-NEXT: #NO_APP 1445; CHECK-NEXT: vmovaps (%rdi), %ymm2 1446; CHECK-NEXT: kmovd %esi, %k1 1447; CHECK-NEXT: vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 1448; CHECK-NEXT: vmovaps %ymm2, %ymm0 1449; CHECK-NEXT: retq 1450 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1451 %a0 = load <16 x half>, ptr %p 1452 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %a2) 1453 %3 = bitcast i16 %mask to <16 x i1> 1454 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 1455 ret <16 x half> %4 1456} 1457 1458define <16 x half> @stack_fold_fmadd231ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 1459; CHECK-LABEL: stack_fold_fmadd231ph_mask_ymm: 1460; CHECK: # %bb.0: 1461; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1462; CHECK-NEXT: #APP 1463; CHECK-NEXT: nop 1464; CHECK-NEXT: #NO_APP 1465; CHECK-NEXT: vmovaps (%rdi), %ymm2 1466; CHECK-NEXT: kmovd %esi, %k1 1467; CHECK-NEXT: vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 1468; CHECK-NEXT: vmovaps %ymm2, %ymm0 1469; CHECK-NEXT: retq 1470 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1471 %a0 = load <16 x half>, ptr %p 1472 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %a0) 1473 %3 = bitcast i16 %mask to <16 x i1> 1474 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 1475 ret <16 x half> %4 1476} 1477 1478define <16 x half> @stack_fold_fmadd321ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 1479; CHECK-LABEL: stack_fold_fmadd321ph_mask_ymm: 1480; CHECK: # %bb.0: 1481; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1482; CHECK-NEXT: #APP 1483; CHECK-NEXT: nop 1484; CHECK-NEXT: #NO_APP 1485; CHECK-NEXT: vmovaps (%rdi), %ymm2 1486; CHECK-NEXT: kmovd %esi, %k1 1487; CHECK-NEXT: vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 1488; CHECK-NEXT: vmovaps %ymm2, %ymm0 1489; CHECK-NEXT: retq 1490 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1491 %a0 = load <16 x half>, ptr %p 1492 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %a0) 1493 %3 = bitcast i16 %mask to <16 x i1> 1494 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 1495 ret <16 x half> %4 1496} 1497 1498define <16 x half> @stack_fold_fmadd132ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 1499; CHECK-LABEL: stack_fold_fmadd132ph_mask_ymm: 1500; CHECK: # %bb.0: 1501; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1502; CHECK-NEXT: #APP 1503; CHECK-NEXT: nop 1504; CHECK-NEXT: #NO_APP 1505; CHECK-NEXT: vmovaps (%rdi), %ymm2 1506; CHECK-NEXT: kmovd %esi, %k1 1507; CHECK-NEXT: vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 1508; CHECK-NEXT: vmovaps %ymm2, %ymm0 1509; CHECK-NEXT: retq 1510 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1511 %a0 = load <16 x half>, ptr %p 1512 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %a1) 1513 %3 = bitcast i16 %mask to <16 x i1> 1514 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 1515 ret <16 x half> %4 1516} 1517 1518define <16 x half> @stack_fold_fmadd312ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 1519; CHECK-LABEL: stack_fold_fmadd312ph_mask_ymm: 1520; CHECK: # %bb.0: 1521; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1522; CHECK-NEXT: #APP 1523; CHECK-NEXT: nop 1524; CHECK-NEXT: #NO_APP 1525; CHECK-NEXT: vmovaps (%rdi), %ymm2 1526; CHECK-NEXT: kmovd %esi, %k1 1527; CHECK-NEXT: vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 1528; CHECK-NEXT: vmovaps %ymm2, %ymm0 1529; CHECK-NEXT: retq 1530 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1531 %a0 = load <16 x half>, ptr %p 1532 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %a1) 1533 %3 = bitcast i16 %mask to <16 x i1> 1534 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 1535 ret <16 x half> %4 1536} 1537 1538define <16 x half> @stack_fold_fmadd123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 1539; CHECK-LABEL: stack_fold_fmadd123ph_maskz_ymm: 1540; CHECK: # %bb.0: 1541; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1542; CHECK-NEXT: #APP 1543; CHECK-NEXT: nop 1544; CHECK-NEXT: #NO_APP 1545; CHECK-NEXT: kmovw (%rdi), %k1 1546; CHECK-NEXT: vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 1547; CHECK-NEXT: retq 1548 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1549 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) 1550 %3 = load i16, ptr %mask 1551 %4 = bitcast i16 %3 to <16 x i1> 1552 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 1553 ret <16 x half> %5 1554} 1555 1556define <16 x half> @stack_fold_fmadd213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 1557; CHECK-LABEL: stack_fold_fmadd213ph_maskz_ymm: 1558; CHECK: # %bb.0: 1559; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1560; CHECK-NEXT: #APP 1561; CHECK-NEXT: nop 1562; CHECK-NEXT: #NO_APP 1563; CHECK-NEXT: kmovw (%rdi), %k1 1564; CHECK-NEXT: vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 1565; CHECK-NEXT: retq 1566 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1567 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %a2) 1568 %3 = load i16, ptr %mask 1569 %4 = bitcast i16 %3 to <16 x i1> 1570 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 1571 ret <16 x half> %5 1572} 1573 1574define <16 x half> @stack_fold_fmadd231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 1575; CHECK-LABEL: stack_fold_fmadd231ph_maskz_ymm: 1576; CHECK: # %bb.0: 1577; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1578; CHECK-NEXT: #APP 1579; CHECK-NEXT: nop 1580; CHECK-NEXT: #NO_APP 1581; CHECK-NEXT: kmovw (%rdi), %k1 1582; CHECK-NEXT: vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 1583; CHECK-NEXT: retq 1584 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1585 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %a0) 1586 %3 = load i16, ptr %mask 1587 %4 = bitcast i16 %3 to <16 x i1> 1588 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 1589 ret <16 x half> %5 1590} 1591 1592define <16 x half> @stack_fold_fmadd321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 1593; CHECK-LABEL: stack_fold_fmadd321ph_maskz_ymm: 1594; CHECK: # %bb.0: 1595; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1596; CHECK-NEXT: #APP 1597; CHECK-NEXT: nop 1598; CHECK-NEXT: #NO_APP 1599; CHECK-NEXT: kmovw (%rdi), %k1 1600; CHECK-NEXT: vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 1601; CHECK-NEXT: retq 1602 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1603 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %a0) 1604 %3 = load i16, ptr %mask 1605 %4 = bitcast i16 %3 to <16 x i1> 1606 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 1607 ret <16 x half> %5 1608} 1609 1610define <16 x half> @stack_fold_fmadd132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 1611; CHECK-LABEL: stack_fold_fmadd132ph_maskz_ymm: 1612; CHECK: # %bb.0: 1613; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1614; CHECK-NEXT: #APP 1615; CHECK-NEXT: nop 1616; CHECK-NEXT: #NO_APP 1617; CHECK-NEXT: kmovw (%rdi), %k1 1618; CHECK-NEXT: vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 1619; CHECK-NEXT: retq 1620 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1621 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %a1) 1622 %3 = load i16, ptr %mask 1623 %4 = bitcast i16 %3 to <16 x i1> 1624 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 1625 ret <16 x half> %5 1626} 1627 1628define <16 x half> @stack_fold_fmadd312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 1629; CHECK-LABEL: stack_fold_fmadd312ph_maskz_ymm: 1630; CHECK: # %bb.0: 1631; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1632; CHECK-NEXT: #APP 1633; CHECK-NEXT: nop 1634; CHECK-NEXT: #NO_APP 1635; CHECK-NEXT: kmovw (%rdi), %k1 1636; CHECK-NEXT: vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 1637; CHECK-NEXT: retq 1638 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1639 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %a1) 1640 %3 = load i16, ptr %mask 1641 %4 = bitcast i16 %3 to <16 x i1> 1642 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 1643 ret <16 x half> %5 1644} 1645 1646define <16 x half> @stack_fold_fmsub123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 1647; CHECK-LABEL: stack_fold_fmsub123ph_ymm: 1648; CHECK: # %bb.0: 1649; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1650; CHECK-NEXT: #APP 1651; CHECK-NEXT: nop 1652; CHECK-NEXT: #NO_APP 1653; CHECK-NEXT: vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1654; CHECK-NEXT: retq 1655 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1656 %2 = fneg <16 x half> %a2 1657 %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %2) 1658 ret <16 x half> %3 1659} 1660 1661define <16 x half> @stack_fold_fmsub213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 1662; CHECK-LABEL: stack_fold_fmsub213ph_ymm: 1663; CHECK: # %bb.0: 1664; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1665; CHECK-NEXT: #APP 1666; CHECK-NEXT: nop 1667; CHECK-NEXT: #NO_APP 1668; CHECK-NEXT: vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1669; CHECK-NEXT: retq 1670 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1671 %2 = fneg <16 x half> %a2 1672 %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %2) 1673 ret <16 x half> %3 1674} 1675 1676define <16 x half> @stack_fold_fmsub231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 1677; CHECK-LABEL: stack_fold_fmsub231ph_ymm: 1678; CHECK: # %bb.0: 1679; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1680; CHECK-NEXT: #APP 1681; CHECK-NEXT: nop 1682; CHECK-NEXT: #NO_APP 1683; CHECK-NEXT: vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1684; CHECK-NEXT: retq 1685 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1686 %2 = fneg <16 x half> %a0 1687 %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %2) 1688 ret <16 x half> %3 1689} 1690 1691define <16 x half> @stack_fold_fmsub321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 1692; CHECK-LABEL: stack_fold_fmsub321ph_ymm: 1693; CHECK: # %bb.0: 1694; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1695; CHECK-NEXT: #APP 1696; CHECK-NEXT: nop 1697; CHECK-NEXT: #NO_APP 1698; CHECK-NEXT: vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1699; CHECK-NEXT: retq 1700 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1701 %2 = fneg <16 x half> %a0 1702 %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %2) 1703 ret <16 x half> %3 1704} 1705 1706define <16 x half> @stack_fold_fmsub132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 1707; CHECK-LABEL: stack_fold_fmsub132ph_ymm: 1708; CHECK: # %bb.0: 1709; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1710; CHECK-NEXT: #APP 1711; CHECK-NEXT: nop 1712; CHECK-NEXT: #NO_APP 1713; CHECK-NEXT: vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1714; CHECK-NEXT: retq 1715 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1716 %2 = fneg <16 x half> %a1 1717 %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %2) 1718 ret <16 x half> %3 1719} 1720 1721define <16 x half> @stack_fold_fmsub312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 1722; CHECK-LABEL: stack_fold_fmsub312ph_ymm: 1723; CHECK: # %bb.0: 1724; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1725; CHECK-NEXT: #APP 1726; CHECK-NEXT: nop 1727; CHECK-NEXT: #NO_APP 1728; CHECK-NEXT: vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1729; CHECK-NEXT: retq 1730 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1731 %2 = fneg <16 x half> %a1 1732 %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %2) 1733 ret <16 x half> %3 1734} 1735 1736define <16 x half> @stack_fold_fmsub123ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 1737; CHECK-LABEL: stack_fold_fmsub123ph_mask_ymm: 1738; CHECK: # %bb.0: 1739; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1740; CHECK-NEXT: #APP 1741; CHECK-NEXT: nop 1742; CHECK-NEXT: #NO_APP 1743; CHECK-NEXT: vmovaps (%rdi), %ymm2 1744; CHECK-NEXT: kmovd %esi, %k1 1745; CHECK-NEXT: vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 1746; CHECK-NEXT: vmovaps %ymm2, %ymm0 1747; CHECK-NEXT: retq 1748 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1749 %a0 = load <16 x half>, ptr %p 1750 %neg = fneg <16 x half> %a2 1751 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %neg) 1752 %3 = bitcast i16 %mask to <16 x i1> 1753 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 1754 ret <16 x half> %4 1755} 1756 1757define <16 x half> @stack_fold_fmsub213ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 1758; CHECK-LABEL: stack_fold_fmsub213ph_mask_ymm: 1759; CHECK: # %bb.0: 1760; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1761; CHECK-NEXT: #APP 1762; CHECK-NEXT: nop 1763; CHECK-NEXT: #NO_APP 1764; CHECK-NEXT: vmovaps (%rdi), %ymm2 1765; CHECK-NEXT: kmovd %esi, %k1 1766; CHECK-NEXT: vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 1767; CHECK-NEXT: vmovaps %ymm2, %ymm0 1768; CHECK-NEXT: retq 1769 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1770 %a0 = load <16 x half>, ptr %p 1771 %neg = fneg <16 x half> %a2 1772 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %neg) 1773 %3 = bitcast i16 %mask to <16 x i1> 1774 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 1775 ret <16 x half> %4 1776} 1777 1778define <16 x half> @stack_fold_fmsub231ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 1779; CHECK-LABEL: stack_fold_fmsub231ph_mask_ymm: 1780; CHECK: # %bb.0: 1781; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1782; CHECK-NEXT: #APP 1783; CHECK-NEXT: nop 1784; CHECK-NEXT: #NO_APP 1785; CHECK-NEXT: vmovaps (%rdi), %ymm2 1786; CHECK-NEXT: kmovd %esi, %k1 1787; CHECK-NEXT: vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 1788; CHECK-NEXT: vmovaps %ymm2, %ymm0 1789; CHECK-NEXT: retq 1790 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1791 %a0 = load <16 x half>, ptr %p 1792 %neg = fneg <16 x half> %a0 1793 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %neg) 1794 %3 = bitcast i16 %mask to <16 x i1> 1795 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 1796 ret <16 x half> %4 1797} 1798 1799define <16 x half> @stack_fold_fmsub321ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 1800; CHECK-LABEL: stack_fold_fmsub321ph_mask_ymm: 1801; CHECK: # %bb.0: 1802; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1803; CHECK-NEXT: #APP 1804; CHECK-NEXT: nop 1805; CHECK-NEXT: #NO_APP 1806; CHECK-NEXT: vmovaps (%rdi), %ymm2 1807; CHECK-NEXT: kmovd %esi, %k1 1808; CHECK-NEXT: vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 1809; CHECK-NEXT: vmovaps %ymm2, %ymm0 1810; CHECK-NEXT: retq 1811 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1812 %a0 = load <16 x half>, ptr %p 1813 %neg = fneg <16 x half> %a0 1814 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %neg) 1815 %3 = bitcast i16 %mask to <16 x i1> 1816 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 1817 ret <16 x half> %4 1818} 1819 1820define <16 x half> @stack_fold_fmsub132ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 1821; CHECK-LABEL: stack_fold_fmsub132ph_mask_ymm: 1822; CHECK: # %bb.0: 1823; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1824; CHECK-NEXT: #APP 1825; CHECK-NEXT: nop 1826; CHECK-NEXT: #NO_APP 1827; CHECK-NEXT: vmovaps (%rdi), %ymm2 1828; CHECK-NEXT: kmovd %esi, %k1 1829; CHECK-NEXT: vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 1830; CHECK-NEXT: vmovaps %ymm2, %ymm0 1831; CHECK-NEXT: retq 1832 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1833 %a0 = load <16 x half>, ptr %p 1834 %neg = fneg <16 x half> %a1 1835 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %neg) 1836 %3 = bitcast i16 %mask to <16 x i1> 1837 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 1838 ret <16 x half> %4 1839} 1840 1841define <16 x half> @stack_fold_fmsub312ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 1842; CHECK-LABEL: stack_fold_fmsub312ph_mask_ymm: 1843; CHECK: # %bb.0: 1844; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1845; CHECK-NEXT: #APP 1846; CHECK-NEXT: nop 1847; CHECK-NEXT: #NO_APP 1848; CHECK-NEXT: vmovaps (%rdi), %ymm2 1849; CHECK-NEXT: kmovd %esi, %k1 1850; CHECK-NEXT: vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 1851; CHECK-NEXT: vmovaps %ymm2, %ymm0 1852; CHECK-NEXT: retq 1853 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1854 %a0 = load <16 x half>, ptr %p 1855 %neg = fneg <16 x half> %a1 1856 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %neg) 1857 %3 = bitcast i16 %mask to <16 x i1> 1858 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 1859 ret <16 x half> %4 1860} 1861 1862define <16 x half> @stack_fold_fmsub123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 1863; CHECK-LABEL: stack_fold_fmsub123ph_maskz_ymm: 1864; CHECK: # %bb.0: 1865; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1866; CHECK-NEXT: #APP 1867; CHECK-NEXT: nop 1868; CHECK-NEXT: #NO_APP 1869; CHECK-NEXT: kmovw (%rdi), %k1 1870; CHECK-NEXT: vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 1871; CHECK-NEXT: retq 1872 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1873 %neg = fneg <16 x half> %a2 1874 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %neg) 1875 %3 = load i16, ptr %mask 1876 %4 = bitcast i16 %3 to <16 x i1> 1877 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 1878 ret <16 x half> %5 1879} 1880 1881define <16 x half> @stack_fold_fmsub213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 1882; CHECK-LABEL: stack_fold_fmsub213ph_maskz_ymm: 1883; CHECK: # %bb.0: 1884; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1885; CHECK-NEXT: #APP 1886; CHECK-NEXT: nop 1887; CHECK-NEXT: #NO_APP 1888; CHECK-NEXT: kmovw (%rdi), %k1 1889; CHECK-NEXT: vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 1890; CHECK-NEXT: retq 1891 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1892 %neg = fneg <16 x half> %a2 1893 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %neg) 1894 %3 = load i16, ptr %mask 1895 %4 = bitcast i16 %3 to <16 x i1> 1896 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 1897 ret <16 x half> %5 1898} 1899 1900define <16 x half> @stack_fold_fmsub231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 1901; CHECK-LABEL: stack_fold_fmsub231ph_maskz_ymm: 1902; CHECK: # %bb.0: 1903; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1904; CHECK-NEXT: #APP 1905; CHECK-NEXT: nop 1906; CHECK-NEXT: #NO_APP 1907; CHECK-NEXT: kmovw (%rdi), %k1 1908; CHECK-NEXT: vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 1909; CHECK-NEXT: retq 1910 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1911 %neg = fneg <16 x half> %a0 1912 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %neg) 1913 %3 = load i16, ptr %mask 1914 %4 = bitcast i16 %3 to <16 x i1> 1915 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 1916 ret <16 x half> %5 1917} 1918 1919define <16 x half> @stack_fold_fmsub321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 1920; CHECK-LABEL: stack_fold_fmsub321ph_maskz_ymm: 1921; CHECK: # %bb.0: 1922; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1923; CHECK-NEXT: #APP 1924; CHECK-NEXT: nop 1925; CHECK-NEXT: #NO_APP 1926; CHECK-NEXT: kmovw (%rdi), %k1 1927; CHECK-NEXT: vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 1928; CHECK-NEXT: retq 1929 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1930 %neg = fneg <16 x half> %a0 1931 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %neg) 1932 %3 = load i16, ptr %mask 1933 %4 = bitcast i16 %3 to <16 x i1> 1934 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 1935 ret <16 x half> %5 1936} 1937 1938define <16 x half> @stack_fold_fmsub132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 1939; CHECK-LABEL: stack_fold_fmsub132ph_maskz_ymm: 1940; CHECK: # %bb.0: 1941; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1942; CHECK-NEXT: #APP 1943; CHECK-NEXT: nop 1944; CHECK-NEXT: #NO_APP 1945; CHECK-NEXT: kmovw (%rdi), %k1 1946; CHECK-NEXT: vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 1947; CHECK-NEXT: retq 1948 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1949 %neg = fneg <16 x half> %a1 1950 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %neg) 1951 %3 = load i16, ptr %mask 1952 %4 = bitcast i16 %3 to <16 x i1> 1953 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 1954 ret <16 x half> %5 1955} 1956 1957define <16 x half> @stack_fold_fmsub312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 1958; CHECK-LABEL: stack_fold_fmsub312ph_maskz_ymm: 1959; CHECK: # %bb.0: 1960; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1961; CHECK-NEXT: #APP 1962; CHECK-NEXT: nop 1963; CHECK-NEXT: #NO_APP 1964; CHECK-NEXT: kmovw (%rdi), %k1 1965; CHECK-NEXT: vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 1966; CHECK-NEXT: retq 1967 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1968 %neg = fneg <16 x half> %a1 1969 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %neg) 1970 %3 = load i16, ptr %mask 1971 %4 = bitcast i16 %3 to <16 x i1> 1972 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 1973 ret <16 x half> %5 1974} 1975 1976define <16 x half> @stack_fold_fnmadd123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 1977; CHECK-LABEL: stack_fold_fnmadd123ph_ymm: 1978; CHECK: # %bb.0: 1979; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1980; CHECK-NEXT: #APP 1981; CHECK-NEXT: nop 1982; CHECK-NEXT: #NO_APP 1983; CHECK-NEXT: vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1984; CHECK-NEXT: retq 1985 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1986 %2 = fneg <16 x half> %a0 1987 %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a1, <16 x half> %a2) 1988 ret <16 x half> %3 1989} 1990 1991define <16 x half> @stack_fold_fnmadd213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 1992; CHECK-LABEL: stack_fold_fnmadd213ph_ymm: 1993; CHECK: # %bb.0: 1994; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1995; CHECK-NEXT: #APP 1996; CHECK-NEXT: nop 1997; CHECK-NEXT: #NO_APP 1998; CHECK-NEXT: vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1999; CHECK-NEXT: retq 2000 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2001 %2 = fneg <16 x half> %a1 2002 %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a0, <16 x half> %a2) 2003 ret <16 x half> %3 2004} 2005 2006define <16 x half> @stack_fold_fnmadd231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 2007; CHECK-LABEL: stack_fold_fnmadd231ph_ymm: 2008; CHECK: # %bb.0: 2009; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2010; CHECK-NEXT: #APP 2011; CHECK-NEXT: nop 2012; CHECK-NEXT: #NO_APP 2013; CHECK-NEXT: vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 2014; CHECK-NEXT: retq 2015 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2016 %2 = fneg <16 x half> %a1 2017 %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a2, <16 x half> %a0) 2018 ret <16 x half> %3 2019} 2020 2021define <16 x half> @stack_fold_fnmadd321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 2022; CHECK-LABEL: stack_fold_fnmadd321ph_ymm: 2023; CHECK: # %bb.0: 2024; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2025; CHECK-NEXT: #APP 2026; CHECK-NEXT: nop 2027; CHECK-NEXT: #NO_APP 2028; CHECK-NEXT: vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 2029; CHECK-NEXT: retq 2030 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2031 %2 = fneg <16 x half> %a2 2032 %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a1, <16 x half> %a0) 2033 ret <16 x half> %3 2034} 2035 2036define <16 x half> @stack_fold_fnmadd132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 2037; CHECK-LABEL: stack_fold_fnmadd132ph_ymm: 2038; CHECK: # %bb.0: 2039; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2040; CHECK-NEXT: #APP 2041; CHECK-NEXT: nop 2042; CHECK-NEXT: #NO_APP 2043; CHECK-NEXT: vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 2044; CHECK-NEXT: retq 2045 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2046 %2 = fneg <16 x half> %a0 2047 %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a2, <16 x half> %a1) 2048 ret <16 x half> %3 2049} 2050 2051define <16 x half> @stack_fold_fnmadd312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 2052; CHECK-LABEL: stack_fold_fnmadd312ph_ymm: 2053; CHECK: # %bb.0: 2054; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2055; CHECK-NEXT: #APP 2056; CHECK-NEXT: nop 2057; CHECK-NEXT: #NO_APP 2058; CHECK-NEXT: vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 2059; CHECK-NEXT: retq 2060 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2061 %2 = fneg <16 x half> %a2 2062 %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a0, <16 x half> %a1) 2063 ret <16 x half> %3 2064} 2065 2066define <16 x half> @stack_fold_fnmadd123ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 2067; CHECK-LABEL: stack_fold_fnmadd123ph_mask_ymm: 2068; CHECK: # %bb.0: 2069; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2070; CHECK-NEXT: #APP 2071; CHECK-NEXT: nop 2072; CHECK-NEXT: #NO_APP 2073; CHECK-NEXT: vmovaps (%rdi), %ymm2 2074; CHECK-NEXT: kmovd %esi, %k1 2075; CHECK-NEXT: vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 2076; CHECK-NEXT: vmovaps %ymm2, %ymm0 2077; CHECK-NEXT: retq 2078 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2079 %a0 = load <16 x half>, ptr %p 2080 %neg = fneg <16 x half> %a0 2081 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a1, <16 x half> %a2) 2082 %3 = bitcast i16 %mask to <16 x i1> 2083 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 2084 ret <16 x half> %4 2085} 2086 2087define <16 x half> @stack_fold_fnmadd213ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 2088; CHECK-LABEL: stack_fold_fnmadd213ph_mask_ymm: 2089; CHECK: # %bb.0: 2090; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2091; CHECK-NEXT: #APP 2092; CHECK-NEXT: nop 2093; CHECK-NEXT: #NO_APP 2094; CHECK-NEXT: vmovaps (%rdi), %ymm2 2095; CHECK-NEXT: kmovd %esi, %k1 2096; CHECK-NEXT: vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 2097; CHECK-NEXT: vmovaps %ymm2, %ymm0 2098; CHECK-NEXT: retq 2099 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2100 %a0 = load <16 x half>, ptr %p 2101 %neg = fneg <16 x half> %a1 2102 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a0, <16 x half> %a2) 2103 %3 = bitcast i16 %mask to <16 x i1> 2104 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 2105 ret <16 x half> %4 2106} 2107 2108define <16 x half> @stack_fold_fnmadd231ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 2109; CHECK-LABEL: stack_fold_fnmadd231ph_mask_ymm: 2110; CHECK: # %bb.0: 2111; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2112; CHECK-NEXT: #APP 2113; CHECK-NEXT: nop 2114; CHECK-NEXT: #NO_APP 2115; CHECK-NEXT: vmovaps (%rdi), %ymm2 2116; CHECK-NEXT: kmovd %esi, %k1 2117; CHECK-NEXT: vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 2118; CHECK-NEXT: vmovaps %ymm2, %ymm0 2119; CHECK-NEXT: retq 2120 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2121 %a0 = load <16 x half>, ptr %p 2122 %neg = fneg <16 x half> %a1 2123 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a2, <16 x half> %a0) 2124 %3 = bitcast i16 %mask to <16 x i1> 2125 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 2126 ret <16 x half> %4 2127} 2128 2129define <16 x half> @stack_fold_fnmadd321ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 2130; CHECK-LABEL: stack_fold_fnmadd321ph_mask_ymm: 2131; CHECK: # %bb.0: 2132; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2133; CHECK-NEXT: #APP 2134; CHECK-NEXT: nop 2135; CHECK-NEXT: #NO_APP 2136; CHECK-NEXT: vmovaps (%rdi), %ymm2 2137; CHECK-NEXT: kmovd %esi, %k1 2138; CHECK-NEXT: vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 2139; CHECK-NEXT: vmovaps %ymm2, %ymm0 2140; CHECK-NEXT: retq 2141 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2142 %a0 = load <16 x half>, ptr %p 2143 %neg = fneg <16 x half> %a2 2144 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a1, <16 x half> %a0) 2145 %3 = bitcast i16 %mask to <16 x i1> 2146 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 2147 ret <16 x half> %4 2148} 2149 2150define <16 x half> @stack_fold_fnmadd132ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 2151; CHECK-LABEL: stack_fold_fnmadd132ph_mask_ymm: 2152; CHECK: # %bb.0: 2153; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2154; CHECK-NEXT: #APP 2155; CHECK-NEXT: nop 2156; CHECK-NEXT: #NO_APP 2157; CHECK-NEXT: vmovaps (%rdi), %ymm2 2158; CHECK-NEXT: kmovd %esi, %k1 2159; CHECK-NEXT: vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 2160; CHECK-NEXT: vmovaps %ymm2, %ymm0 2161; CHECK-NEXT: retq 2162 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2163 %a0 = load <16 x half>, ptr %p 2164 %neg = fneg <16 x half> %a0 2165 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a2, <16 x half> %a1) 2166 %3 = bitcast i16 %mask to <16 x i1> 2167 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 2168 ret <16 x half> %4 2169} 2170 2171define <16 x half> @stack_fold_fnmadd312ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 2172; CHECK-LABEL: stack_fold_fnmadd312ph_mask_ymm: 2173; CHECK: # %bb.0: 2174; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2175; CHECK-NEXT: #APP 2176; CHECK-NEXT: nop 2177; CHECK-NEXT: #NO_APP 2178; CHECK-NEXT: vmovaps (%rdi), %ymm2 2179; CHECK-NEXT: kmovd %esi, %k1 2180; CHECK-NEXT: vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 2181; CHECK-NEXT: vmovaps %ymm2, %ymm0 2182; CHECK-NEXT: retq 2183 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2184 %a0 = load <16 x half>, ptr %p 2185 %neg = fneg <16 x half> %a2 2186 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a0, <16 x half> %a1) 2187 %3 = bitcast i16 %mask to <16 x i1> 2188 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 2189 ret <16 x half> %4 2190} 2191 2192define <16 x half> @stack_fold_fnmadd123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 2193; CHECK-LABEL: stack_fold_fnmadd123ph_maskz_ymm: 2194; CHECK: # %bb.0: 2195; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2196; CHECK-NEXT: #APP 2197; CHECK-NEXT: nop 2198; CHECK-NEXT: #NO_APP 2199; CHECK-NEXT: kmovw (%rdi), %k1 2200; CHECK-NEXT: vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 2201; CHECK-NEXT: retq 2202 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2203 %neg = fneg <16 x half> %a0 2204 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a1, <16 x half> %a2) 2205 %3 = load i16, ptr %mask 2206 %4 = bitcast i16 %3 to <16 x i1> 2207 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 2208 ret <16 x half> %5 2209} 2210 2211define <16 x half> @stack_fold_fnmadd213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 2212; CHECK-LABEL: stack_fold_fnmadd213ph_maskz_ymm: 2213; CHECK: # %bb.0: 2214; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2215; CHECK-NEXT: #APP 2216; CHECK-NEXT: nop 2217; CHECK-NEXT: #NO_APP 2218; CHECK-NEXT: kmovw (%rdi), %k1 2219; CHECK-NEXT: vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 2220; CHECK-NEXT: retq 2221 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2222 %neg = fneg <16 x half> %a1 2223 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a0, <16 x half> %a2) 2224 %3 = load i16, ptr %mask 2225 %4 = bitcast i16 %3 to <16 x i1> 2226 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 2227 ret <16 x half> %5 2228} 2229 2230define <16 x half> @stack_fold_fnmadd231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 2231; CHECK-LABEL: stack_fold_fnmadd231ph_maskz_ymm: 2232; CHECK: # %bb.0: 2233; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2234; CHECK-NEXT: #APP 2235; CHECK-NEXT: nop 2236; CHECK-NEXT: #NO_APP 2237; CHECK-NEXT: kmovw (%rdi), %k1 2238; CHECK-NEXT: vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 2239; CHECK-NEXT: retq 2240 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2241 %neg = fneg <16 x half> %a1 2242 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a2, <16 x half> %a0) 2243 %3 = load i16, ptr %mask 2244 %4 = bitcast i16 %3 to <16 x i1> 2245 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 2246 ret <16 x half> %5 2247} 2248 2249define <16 x half> @stack_fold_fnmadd321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 2250; CHECK-LABEL: stack_fold_fnmadd321ph_maskz_ymm: 2251; CHECK: # %bb.0: 2252; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2253; CHECK-NEXT: #APP 2254; CHECK-NEXT: nop 2255; CHECK-NEXT: #NO_APP 2256; CHECK-NEXT: kmovw (%rdi), %k1 2257; CHECK-NEXT: vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 2258; CHECK-NEXT: retq 2259 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2260 %neg = fneg <16 x half> %a2 2261 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a1, <16 x half> %a0) 2262 %3 = load i16, ptr %mask 2263 %4 = bitcast i16 %3 to <16 x i1> 2264 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 2265 ret <16 x half> %5 2266} 2267 2268define <16 x half> @stack_fold_fnmadd132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 2269; CHECK-LABEL: stack_fold_fnmadd132ph_maskz_ymm: 2270; CHECK: # %bb.0: 2271; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2272; CHECK-NEXT: #APP 2273; CHECK-NEXT: nop 2274; CHECK-NEXT: #NO_APP 2275; CHECK-NEXT: kmovw (%rdi), %k1 2276; CHECK-NEXT: vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 2277; CHECK-NEXT: retq 2278 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2279 %neg = fneg <16 x half> %a0 2280 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a2, <16 x half> %a1) 2281 %3 = load i16, ptr %mask 2282 %4 = bitcast i16 %3 to <16 x i1> 2283 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 2284 ret <16 x half> %5 2285} 2286 2287define <16 x half> @stack_fold_fnmadd312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 2288; CHECK-LABEL: stack_fold_fnmadd312ph_maskz_ymm: 2289; CHECK: # %bb.0: 2290; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2291; CHECK-NEXT: #APP 2292; CHECK-NEXT: nop 2293; CHECK-NEXT: #NO_APP 2294; CHECK-NEXT: kmovw (%rdi), %k1 2295; CHECK-NEXT: vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 2296; CHECK-NEXT: retq 2297 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2298 %neg = fneg <16 x half> %a2 2299 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a0, <16 x half> %a1) 2300 %3 = load i16, ptr %mask 2301 %4 = bitcast i16 %3 to <16 x i1> 2302 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 2303 ret <16 x half> %5 2304} 2305 2306define <16 x half> @stack_fold_fnmsub123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 2307; CHECK-LABEL: stack_fold_fnmsub123ph_ymm: 2308; CHECK: # %bb.0: 2309; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2310; CHECK-NEXT: #APP 2311; CHECK-NEXT: nop 2312; CHECK-NEXT: #NO_APP 2313; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 2314; CHECK-NEXT: retq 2315 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2316 %2 = fneg <16 x half> %a0 2317 %3 = fneg <16 x half> %a2 2318 %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a1, <16 x half> %3) 2319 ret <16 x half> %4 2320} 2321 2322define <16 x half> @stack_fold_fnmsub213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 2323; CHECK-LABEL: stack_fold_fnmsub213ph_ymm: 2324; CHECK: # %bb.0: 2325; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2326; CHECK-NEXT: #APP 2327; CHECK-NEXT: nop 2328; CHECK-NEXT: #NO_APP 2329; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 2330; CHECK-NEXT: retq 2331 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2332 %2 = fneg <16 x half> %a1 2333 %3 = fneg <16 x half> %a2 2334 %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a0, <16 x half> %3) 2335 ret <16 x half> %4 2336} 2337 2338define <16 x half> @stack_fold_fnmsub231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 2339; CHECK-LABEL: stack_fold_fnmsub231ph_ymm: 2340; CHECK: # %bb.0: 2341; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2342; CHECK-NEXT: #APP 2343; CHECK-NEXT: nop 2344; CHECK-NEXT: #NO_APP 2345; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 2346; CHECK-NEXT: retq 2347 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2348 %2 = fneg <16 x half> %a1 2349 %3 = fneg <16 x half> %a0 2350 %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a2, <16 x half> %3) 2351 ret <16 x half> %4 2352} 2353 2354define <16 x half> @stack_fold_fnmsub321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 2355; CHECK-LABEL: stack_fold_fnmsub321ph_ymm: 2356; CHECK: # %bb.0: 2357; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2358; CHECK-NEXT: #APP 2359; CHECK-NEXT: nop 2360; CHECK-NEXT: #NO_APP 2361; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 2362; CHECK-NEXT: retq 2363 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2364 %2 = fneg <16 x half> %a2 2365 %3 = fneg <16 x half> %a0 2366 %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a1, <16 x half> %3) 2367 ret <16 x half> %4 2368} 2369 2370define <16 x half> @stack_fold_fnmsub132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 2371; CHECK-LABEL: stack_fold_fnmsub132ph_ymm: 2372; CHECK: # %bb.0: 2373; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2374; CHECK-NEXT: #APP 2375; CHECK-NEXT: nop 2376; CHECK-NEXT: #NO_APP 2377; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 2378; CHECK-NEXT: retq 2379 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2380 %2 = fneg <16 x half> %a0 2381 %3 = fneg <16 x half> %a1 2382 %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a2, <16 x half> %3) 2383 ret <16 x half> %4 2384} 2385 2386define <16 x half> @stack_fold_fnmsub312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) { 2387; CHECK-LABEL: stack_fold_fnmsub312ph_ymm: 2388; CHECK: # %bb.0: 2389; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2390; CHECK-NEXT: #APP 2391; CHECK-NEXT: nop 2392; CHECK-NEXT: #NO_APP 2393; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 2394; CHECK-NEXT: retq 2395 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2396 %2 = fneg <16 x half> %a2 2397 %3 = fneg <16 x half> %a1 2398 %4 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a0, <16 x half> %3) 2399 ret <16 x half> %4 2400} 2401 2402define <16 x half> @stack_fold_fnmsub123ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 2403; CHECK-LABEL: stack_fold_fnmsub123ph_mask_ymm: 2404; CHECK: # %bb.0: 2405; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2406; CHECK-NEXT: #APP 2407; CHECK-NEXT: nop 2408; CHECK-NEXT: #NO_APP 2409; CHECK-NEXT: vmovaps (%rdi), %ymm2 2410; CHECK-NEXT: kmovd %esi, %k1 2411; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 2412; CHECK-NEXT: vmovaps %ymm2, %ymm0 2413; CHECK-NEXT: retq 2414 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2415 %a0 = load <16 x half>, ptr %p 2416 %neg = fneg <16 x half> %a2 2417 %neg1 = fneg <16 x half> %a0 2418 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a1, <16 x half> %neg) 2419 %3 = bitcast i16 %mask to <16 x i1> 2420 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 2421 ret <16 x half> %4 2422} 2423 2424define <16 x half> @stack_fold_fnmsub213ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 2425; CHECK-LABEL: stack_fold_fnmsub213ph_mask_ymm: 2426; CHECK: # %bb.0: 2427; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2428; CHECK-NEXT: #APP 2429; CHECK-NEXT: nop 2430; CHECK-NEXT: #NO_APP 2431; CHECK-NEXT: vmovaps (%rdi), %ymm2 2432; CHECK-NEXT: kmovd %esi, %k1 2433; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 2434; CHECK-NEXT: vmovaps %ymm2, %ymm0 2435; CHECK-NEXT: retq 2436 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2437 %a0 = load <16 x half>, ptr %p 2438 %neg = fneg <16 x half> %a2 2439 %neg1 = fneg <16 x half> %a1 2440 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a0, <16 x half> %neg) 2441 %3 = bitcast i16 %mask to <16 x i1> 2442 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 2443 ret <16 x half> %4 2444} 2445 2446define <16 x half> @stack_fold_fnmsub231ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 2447; CHECK-LABEL: stack_fold_fnmsub231ph_mask_ymm: 2448; CHECK: # %bb.0: 2449; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2450; CHECK-NEXT: #APP 2451; CHECK-NEXT: nop 2452; CHECK-NEXT: #NO_APP 2453; CHECK-NEXT: vmovaps (%rdi), %ymm2 2454; CHECK-NEXT: kmovd %esi, %k1 2455; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 2456; CHECK-NEXT: vmovaps %ymm2, %ymm0 2457; CHECK-NEXT: retq 2458 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2459 %a0 = load <16 x half>, ptr %p 2460 %neg = fneg <16 x half> %a0 2461 %neg1 = fneg <16 x half> %a1 2462 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a2, <16 x half> %neg) 2463 %3 = bitcast i16 %mask to <16 x i1> 2464 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 2465 ret <16 x half> %4 2466} 2467 2468define <16 x half> @stack_fold_fnmsub321ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 2469; CHECK-LABEL: stack_fold_fnmsub321ph_mask_ymm: 2470; CHECK: # %bb.0: 2471; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2472; CHECK-NEXT: #APP 2473; CHECK-NEXT: nop 2474; CHECK-NEXT: #NO_APP 2475; CHECK-NEXT: vmovaps (%rdi), %ymm2 2476; CHECK-NEXT: kmovd %esi, %k1 2477; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 2478; CHECK-NEXT: vmovaps %ymm2, %ymm0 2479; CHECK-NEXT: retq 2480 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2481 %a0 = load <16 x half>, ptr %p 2482 %neg = fneg <16 x half> %a0 2483 %neg1 = fneg <16 x half> %a2 2484 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a1, <16 x half> %neg) 2485 %3 = bitcast i16 %mask to <16 x i1> 2486 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 2487 ret <16 x half> %4 2488} 2489 2490define <16 x half> @stack_fold_fnmsub132ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 2491; CHECK-LABEL: stack_fold_fnmsub132ph_mask_ymm: 2492; CHECK: # %bb.0: 2493; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2494; CHECK-NEXT: #APP 2495; CHECK-NEXT: nop 2496; CHECK-NEXT: #NO_APP 2497; CHECK-NEXT: vmovaps (%rdi), %ymm2 2498; CHECK-NEXT: kmovd %esi, %k1 2499; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 2500; CHECK-NEXT: vmovaps %ymm2, %ymm0 2501; CHECK-NEXT: retq 2502 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2503 %a0 = load <16 x half>, ptr %p 2504 %neg = fneg <16 x half> %a1 2505 %neg1 = fneg <16 x half> %a0 2506 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a2, <16 x half> %neg) 2507 %3 = bitcast i16 %mask to <16 x i1> 2508 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 2509 ret <16 x half> %4 2510} 2511 2512define <16 x half> @stack_fold_fnmsub312ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) { 2513; CHECK-LABEL: stack_fold_fnmsub312ph_mask_ymm: 2514; CHECK: # %bb.0: 2515; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2516; CHECK-NEXT: #APP 2517; CHECK-NEXT: nop 2518; CHECK-NEXT: #NO_APP 2519; CHECK-NEXT: vmovaps (%rdi), %ymm2 2520; CHECK-NEXT: kmovd %esi, %k1 2521; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 2522; CHECK-NEXT: vmovaps %ymm2, %ymm0 2523; CHECK-NEXT: retq 2524 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2525 %a0 = load <16 x half>, ptr %p 2526 %neg = fneg <16 x half> %a1 2527 %neg1 = fneg <16 x half> %a2 2528 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a0, <16 x half> %neg) 2529 %3 = bitcast i16 %mask to <16 x i1> 2530 %4 = select <16 x i1> %3, <16 x half> %2, <16 x half> %a0 2531 ret <16 x half> %4 2532} 2533 2534define <16 x half> @stack_fold_fnmsub123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 2535; CHECK-LABEL: stack_fold_fnmsub123ph_maskz_ymm: 2536; CHECK: # %bb.0: 2537; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2538; CHECK-NEXT: #APP 2539; CHECK-NEXT: nop 2540; CHECK-NEXT: #NO_APP 2541; CHECK-NEXT: kmovw (%rdi), %k1 2542; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 2543; CHECK-NEXT: retq 2544 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2545 %neg = fneg <16 x half> %a2 2546 %neg1 = fneg <16 x half> %a0 2547 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a1, <16 x half> %neg) 2548 %3 = load i16, ptr %mask 2549 %4 = bitcast i16 %3 to <16 x i1> 2550 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 2551 ret <16 x half> %5 2552} 2553 2554define <16 x half> @stack_fold_fnmsub213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 2555; CHECK-LABEL: stack_fold_fnmsub213ph_maskz_ymm: 2556; CHECK: # %bb.0: 2557; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2558; CHECK-NEXT: #APP 2559; CHECK-NEXT: nop 2560; CHECK-NEXT: #NO_APP 2561; CHECK-NEXT: kmovw (%rdi), %k1 2562; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 2563; CHECK-NEXT: retq 2564 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2565 %neg = fneg <16 x half> %a2 2566 %neg1 = fneg <16 x half> %a1 2567 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a0, <16 x half> %neg) 2568 %3 = load i16, ptr %mask 2569 %4 = bitcast i16 %3 to <16 x i1> 2570 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 2571 ret <16 x half> %5 2572} 2573 2574define <16 x half> @stack_fold_fnmsub231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 2575; CHECK-LABEL: stack_fold_fnmsub231ph_maskz_ymm: 2576; CHECK: # %bb.0: 2577; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2578; CHECK-NEXT: #APP 2579; CHECK-NEXT: nop 2580; CHECK-NEXT: #NO_APP 2581; CHECK-NEXT: kmovw (%rdi), %k1 2582; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 2583; CHECK-NEXT: retq 2584 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2585 %neg = fneg <16 x half> %a0 2586 %neg1 = fneg <16 x half> %a1 2587 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a2, <16 x half> %neg) 2588 %3 = load i16, ptr %mask 2589 %4 = bitcast i16 %3 to <16 x i1> 2590 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 2591 ret <16 x half> %5 2592} 2593 2594define <16 x half> @stack_fold_fnmsub321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 2595; CHECK-LABEL: stack_fold_fnmsub321ph_maskz_ymm: 2596; CHECK: # %bb.0: 2597; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2598; CHECK-NEXT: #APP 2599; CHECK-NEXT: nop 2600; CHECK-NEXT: #NO_APP 2601; CHECK-NEXT: kmovw (%rdi), %k1 2602; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 2603; CHECK-NEXT: retq 2604 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2605 %neg = fneg <16 x half> %a0 2606 %neg1 = fneg <16 x half> %a2 2607 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a1, <16 x half> %neg) 2608 %3 = load i16, ptr %mask 2609 %4 = bitcast i16 %3 to <16 x i1> 2610 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 2611 ret <16 x half> %5 2612} 2613 2614define <16 x half> @stack_fold_fnmsub132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 2615; CHECK-LABEL: stack_fold_fnmsub132ph_maskz_ymm: 2616; CHECK: # %bb.0: 2617; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2618; CHECK-NEXT: #APP 2619; CHECK-NEXT: nop 2620; CHECK-NEXT: #NO_APP 2621; CHECK-NEXT: kmovw (%rdi), %k1 2622; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 2623; CHECK-NEXT: retq 2624 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2625 %neg = fneg <16 x half> %a1 2626 %neg1 = fneg <16 x half> %a0 2627 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a2, <16 x half> %neg) 2628 %3 = load i16, ptr %mask 2629 %4 = bitcast i16 %3 to <16 x i1> 2630 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 2631 ret <16 x half> %5 2632} 2633 2634define <16 x half> @stack_fold_fnmsub312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) { 2635; CHECK-LABEL: stack_fold_fnmsub312ph_maskz_ymm: 2636; CHECK: # %bb.0: 2637; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2638; CHECK-NEXT: #APP 2639; CHECK-NEXT: nop 2640; CHECK-NEXT: #NO_APP 2641; CHECK-NEXT: kmovw (%rdi), %k1 2642; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload 2643; CHECK-NEXT: retq 2644 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2645 %neg = fneg <16 x half> %a1 2646 %neg1 = fneg <16 x half> %a2 2647 %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg1, <16 x half> %a0, <16 x half> %neg) 2648 %3 = load i16, ptr %mask 2649 %4 = bitcast i16 %3 to <16 x i1> 2650 %5 = select <16 x i1> %4, <16 x half> %2, <16 x half> zeroinitializer 2651 ret <16 x half> %5 2652} 2653