; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.

; === FMADD, unmasked: all six operand orders of llvm.fma.v32f16 should fold the
; === spilled operand and select the 213/231/132 form accordingly.

define <32 x half> @stack_fold_fmadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
  ret <32 x half> %2
}
declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>)

define <32 x half> @stack_fold_fmadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1)
  ret <32 x half> %2
}

; === FMADD, merge-masked: %a0 comes from memory and is also the passthru of the
; === select, so the masked FMA must fold the spilled operand with {%k1} merging.

define <32 x half> @stack_fold_fmadd123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmadd213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmadd231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmadd321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmadd132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmadd312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; === FMADD, zero-masked: mask loaded from memory; select against zeroinitializer
; === should become {%k1} {z} zero-masking on the folded FMA.

define <32 x half> @stack_fold_fmadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

; === FMSUB, unmasked: the fneg on the addend (or accumulator) operand should
; === select the vfmsub form of the matching 213/231/132 instruction.

define <32 x half> @stack_fold_fmsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %2)
  ret <32 x half> %3
}

; === FMSUB, merge-masked.

define <32 x half> @stack_fold_fmsub123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsub213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsub231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsub321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsub132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsub312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; === FMSUB, zero-masked.

define <32 x half> @stack_fold_fmsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x
half> %neg) 611 %3 = load i32, ptr %mask 612 %4 = bitcast i32 %3 to <32 x i1> 613 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer 614 ret <32 x half> %5 615} 616 617define <32 x half> @stack_fold_fmsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) { 618; CHECK-LABEL: stack_fold_fmsub132ph_maskz: 619; CHECK: # %bb.0: 620; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 621; CHECK-NEXT: #APP 622; CHECK-NEXT: nop 623; CHECK-NEXT: #NO_APP 624; CHECK-NEXT: kmovd (%rdi), %k1 625; CHECK-NEXT: vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload 626; CHECK-NEXT: retq 627 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 628 %neg = fneg <32 x half> %a1 629 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg) 630 %3 = load i32, ptr %mask 631 %4 = bitcast i32 %3 to <32 x i1> 632 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer 633 ret <32 x half> %5 634} 635 636define <32 x half> @stack_fold_fmsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) { 637; CHECK-LABEL: stack_fold_fmsub312ph_maskz: 638; CHECK: # %bb.0: 639; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 640; CHECK-NEXT: #APP 641; CHECK-NEXT: nop 642; CHECK-NEXT: #NO_APP 643; CHECK-NEXT: kmovd (%rdi), %k1 644; CHECK-NEXT: vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload 645; CHECK-NEXT: retq 646 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 647 %neg = fneg <32 x half> %a1 648 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg) 649 %3 = load i32, ptr %mask 650 %4 = bitcast i32 %3 to <32 x i1> 651 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer 652 ret <32 x half> %5 653} 654 655define <32 x half> @stack_fold_fnmadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { 656; CHECK-LABEL: stack_fold_fnmadd123ph: 657; CHECK: # %bb.0: 658; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 659; CHECK-NEXT: #APP 660; CHECK-NEXT: nop 661; CHECK-NEXT: #NO_APP 662; CHECK-NEXT: vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 663; CHECK-NEXT: retq 664 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 665 %2 = fneg <32 x half> %a0 666 %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %a2) 667 ret <32 x half> %3 668} 669 670define <32 x half> @stack_fold_fnmadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { 671; CHECK-LABEL: stack_fold_fnmadd213ph: 672; CHECK: # %bb.0: 673; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 674; CHECK-NEXT: #APP 675; CHECK-NEXT: nop 676; CHECK-NEXT: #NO_APP 677; CHECK-NEXT: vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 678; CHECK-NEXT: retq 679 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 680 %2 = fneg <32 x half> %a1 681 %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %a2) 682 ret <32 x half> %3 683} 684 685define <32 x half> @stack_fold_fnmadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { 686; CHECK-LABEL: stack_fold_fnmadd231ph: 687; CHECK: # %bb.0: 688; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 689; CHECK-NEXT: #APP 690; CHECK-NEXT: nop 691; CHECK-NEXT: #NO_APP 692; CHECK-NEXT: vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 693; CHECK-NEXT: retq 694 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 695 %2 = fneg <32 x half> %a1 696 %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %a0) 697 ret <32 x half> %3 698} 699 700define <32 x half> @stack_fold_fnmadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { 701; CHECK-LABEL: stack_fold_fnmadd321ph: 702; CHECK: # %bb.0: 703; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 704; CHECK-NEXT: #APP 705; CHECK-NEXT: nop 706; CHECK-NEXT: #NO_APP 707; CHECK-NEXT: vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 708; CHECK-NEXT: retq 709 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 710 %2 = fneg <32 x half> %a2 711 %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %a0) 712 ret <32 x half> %3 713} 714 715define <32 x half> @stack_fold_fnmadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { 716; CHECK-LABEL: stack_fold_fnmadd132ph: 717; CHECK: # %bb.0: 718; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 719; CHECK-NEXT: #APP 720; CHECK-NEXT: nop 721; CHECK-NEXT: #NO_APP 722; CHECK-NEXT: vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 723; CHECK-NEXT: retq 724 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 725 %2 = fneg <32 x half> %a0 726 %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %a1) 727 ret <32 x half> %3 728} 729 730define <32 x half> @stack_fold_fnmadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { 731; CHECK-LABEL: stack_fold_fnmadd312ph: 732; CHECK: # %bb.0: 733; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 734; CHECK-NEXT: #APP 735; CHECK-NEXT: nop 736; CHECK-NEXT: #NO_APP 737; CHECK-NEXT: vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 738; CHECK-NEXT: retq 739 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 740 %2 = fneg <32 x half> %a2 741 %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %a1) 742 ret <32 x half> %3 743} 744 745define <32 x half> @stack_fold_fnmadd123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { 746; CHECK-LABEL: stack_fold_fnmadd123ph_mask: 747; CHECK: # %bb.0: 748; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 749; CHECK-NEXT: #APP 750; CHECK-NEXT: nop 751; CHECK-NEXT: #NO_APP 752; CHECK-NEXT: vmovaps (%rdi), %zmm2 753; CHECK-NEXT: kmovd %esi, %k1 754; CHECK-NEXT: vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 755; CHECK-NEXT: vmovaps %zmm2, %zmm0 756; CHECK-NEXT: retq 757 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 758 %a0 = load <32 x half>, ptr %p 759 %neg = fneg <32 x half> %a0 760 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a2) 761 %3 = bitcast i32 %mask to <32 x i1> 762 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 763 ret <32 x half> %4 764} 765 766define <32 x half> @stack_fold_fnmadd213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { 767; CHECK-LABEL: stack_fold_fnmadd213ph_mask: 768; CHECK: # %bb.0: 769; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 770; CHECK-NEXT: #APP 771; CHECK-NEXT: nop 772; CHECK-NEXT: #NO_APP 773; CHECK-NEXT: vmovaps (%rdi), %zmm2 774; CHECK-NEXT: kmovd %esi, %k1 775; CHECK-NEXT: 
vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 776; CHECK-NEXT: vmovaps %zmm2, %zmm0 777; CHECK-NEXT: retq 778 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 779 %a0 = load <32 x half>, ptr %p 780 %neg = fneg <32 x half> %a1 781 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a2) 782 %3 = bitcast i32 %mask to <32 x i1> 783 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 784 ret <32 x half> %4 785} 786 787define <32 x half> @stack_fold_fnmadd231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { 788; CHECK-LABEL: stack_fold_fnmadd231ph_mask: 789; CHECK: # %bb.0: 790; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 791; CHECK-NEXT: #APP 792; CHECK-NEXT: nop 793; CHECK-NEXT: #NO_APP 794; CHECK-NEXT: vmovaps (%rdi), %zmm2 795; CHECK-NEXT: kmovd %esi, %k1 796; CHECK-NEXT: vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 797; CHECK-NEXT: vmovaps %zmm2, %zmm0 798; CHECK-NEXT: retq 799 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 800 %a0 = load <32 x half>, ptr %p 801 %neg = fneg <32 x half> %a1 802 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a0) 803 %3 = bitcast i32 %mask to <32 x i1> 804 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 805 ret <32 x half> %4 806} 807 808define <32 x half> @stack_fold_fnmadd321ph_mask(ptr %p, 
<32 x half> %a1, <32 x half> %a2, i32 %mask) { 809; CHECK-LABEL: stack_fold_fnmadd321ph_mask: 810; CHECK: # %bb.0: 811; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 812; CHECK-NEXT: #APP 813; CHECK-NEXT: nop 814; CHECK-NEXT: #NO_APP 815; CHECK-NEXT: vmovaps (%rdi), %zmm2 816; CHECK-NEXT: kmovd %esi, %k1 817; CHECK-NEXT: vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 818; CHECK-NEXT: vmovaps %zmm2, %zmm0 819; CHECK-NEXT: retq 820 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 821 %a0 = load <32 x half>, ptr %p 822 %neg = fneg <32 x half> %a2 823 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a0) 824 %3 = bitcast i32 %mask to <32 x i1> 825 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 826 ret <32 x half> %4 827} 828 829define <32 x half> @stack_fold_fnmadd132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { 830; CHECK-LABEL: stack_fold_fnmadd132ph_mask: 831; CHECK: # %bb.0: 832; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 833; CHECK-NEXT: #APP 834; CHECK-NEXT: nop 835; CHECK-NEXT: #NO_APP 836; CHECK-NEXT: vmovaps (%rdi), %zmm2 837; CHECK-NEXT: kmovd %esi, %k1 838; CHECK-NEXT: vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 839; CHECK-NEXT: vmovaps %zmm2, %zmm0 840; CHECK-NEXT: retq 841 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 842 %a0 = 
load <32 x half>, ptr %p 843 %neg = fneg <32 x half> %a0 844 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a1) 845 %3 = bitcast i32 %mask to <32 x i1> 846 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 847 ret <32 x half> %4 848} 849 850define <32 x half> @stack_fold_fnmadd312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { 851; CHECK-LABEL: stack_fold_fnmadd312ph_mask: 852; CHECK: # %bb.0: 853; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 854; CHECK-NEXT: #APP 855; CHECK-NEXT: nop 856; CHECK-NEXT: #NO_APP 857; CHECK-NEXT: vmovaps (%rdi), %zmm2 858; CHECK-NEXT: kmovd %esi, %k1 859; CHECK-NEXT: vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 860; CHECK-NEXT: vmovaps %zmm2, %zmm0 861; CHECK-NEXT: retq 862 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 863 %a0 = load <32 x half>, ptr %p 864 %neg = fneg <32 x half> %a2 865 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a1) 866 %3 = bitcast i32 %mask to <32 x i1> 867 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 868 ret <32 x half> %4 869} 870 871define <32 x half> @stack_fold_fnmadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) { 872; CHECK-LABEL: stack_fold_fnmadd123ph_maskz: 873; CHECK: # %bb.0: 874; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 875; CHECK-NEXT: #APP 876; CHECK-NEXT: nop 877; CHECK-NEXT: #NO_APP 878; CHECK-NEXT: kmovd (%rdi), %k1 879; CHECK-NEXT: vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload 880; CHECK-NEXT: retq 881 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 882 %neg = fneg <32 x half> %a0 883 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a2) 884 %3 = load i32, ptr %mask 885 %4 = bitcast i32 %3 to <32 x i1> 886 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer 887 ret <32 x half> %5 888} 889 890define <32 x half> @stack_fold_fnmadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) { 891; CHECK-LABEL: stack_fold_fnmadd213ph_maskz: 892; CHECK: # %bb.0: 893; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 894; CHECK-NEXT: #APP 895; CHECK-NEXT: nop 896; CHECK-NEXT: #NO_APP 897; CHECK-NEXT: kmovd (%rdi), %k1 898; CHECK-NEXT: vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload 899; CHECK-NEXT: retq 900 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 901 %neg = fneg <32 x half> %a1 902 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a2) 903 %3 = load i32, ptr %mask 904 %4 = bitcast i32 %3 to <32 x i1> 905 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer 906 ret <32 x half> %5 907} 908 909define <32 x half> @stack_fold_fnmadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) { 910; CHECK-LABEL: stack_fold_fnmadd231ph_maskz: 911; CHECK: # %bb.0: 912; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 913; CHECK-NEXT: #APP 914; CHECK-NEXT: nop 915; CHECK-NEXT: 
#NO_APP 916; CHECK-NEXT: kmovd (%rdi), %k1 917; CHECK-NEXT: vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload 918; CHECK-NEXT: retq 919 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 920 %neg = fneg <32 x half> %a1 921 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a0) 922 %3 = load i32, ptr %mask 923 %4 = bitcast i32 %3 to <32 x i1> 924 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer 925 ret <32 x half> %5 926} 927 928define <32 x half> @stack_fold_fnmadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) { 929; CHECK-LABEL: stack_fold_fnmadd321ph_maskz: 930; CHECK: # %bb.0: 931; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 932; CHECK-NEXT: #APP 933; CHECK-NEXT: nop 934; CHECK-NEXT: #NO_APP 935; CHECK-NEXT: kmovd (%rdi), %k1 936; CHECK-NEXT: vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload 937; CHECK-NEXT: retq 938 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 939 %neg = fneg <32 x half> %a2 940 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a0) 941 %3 = load i32, ptr %mask 942 %4 = bitcast i32 %3 to <32 x i1> 943 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer 944 ret <32 x half> %5 945} 946 947define <32 x half> @stack_fold_fnmadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, 
ptr %mask) { 948; CHECK-LABEL: stack_fold_fnmadd132ph_maskz: 949; CHECK: # %bb.0: 950; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 951; CHECK-NEXT: #APP 952; CHECK-NEXT: nop 953; CHECK-NEXT: #NO_APP 954; CHECK-NEXT: kmovd (%rdi), %k1 955; CHECK-NEXT: vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload 956; CHECK-NEXT: retq 957 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 958 %neg = fneg <32 x half> %a0 959 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a1) 960 %3 = load i32, ptr %mask 961 %4 = bitcast i32 %3 to <32 x i1> 962 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer 963 ret <32 x half> %5 964} 965 966define <32 x half> @stack_fold_fnmadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) { 967; CHECK-LABEL: stack_fold_fnmadd312ph_maskz: 968; CHECK: # %bb.0: 969; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 970; CHECK-NEXT: #APP 971; CHECK-NEXT: nop 972; CHECK-NEXT: #NO_APP 973; CHECK-NEXT: kmovd (%rdi), %k1 974; CHECK-NEXT: vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload 975; CHECK-NEXT: retq 976 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 977 %neg = fneg <32 x half> %a2 978 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a1) 979 %3 = load i32, ptr %mask 980 %4 = bitcast i32 %3 to <32 x 
i1> 981 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer 982 ret <32 x half> %5 983} 984 985define <32 x half> @stack_fold_fnmsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { 986; CHECK-LABEL: stack_fold_fnmsub123ph: 987; CHECK: # %bb.0: 988; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 989; CHECK-NEXT: #APP 990; CHECK-NEXT: nop 991; CHECK-NEXT: #NO_APP 992; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 993; CHECK-NEXT: retq 994 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 995 %2 = fneg <32 x half> %a0 996 %3 = fneg <32 x half> %a2 997 %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %3) 998 ret <32 x half> %4 999} 1000 1001define <32 x half> @stack_fold_fnmsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { 1002; CHECK-LABEL: stack_fold_fnmsub213ph: 1003; CHECK: # %bb.0: 1004; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1005; CHECK-NEXT: #APP 1006; CHECK-NEXT: nop 1007; CHECK-NEXT: #NO_APP 1008; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 1009; CHECK-NEXT: retq 1010 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1011 %2 = fneg <32 x half> %a1 1012 %3 = fneg <32 x half> %a2 1013 %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %3) 1014 ret <32 x half> %4 1015} 1016 1017define <32 x half> 
@stack_fold_fnmsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { 1018; CHECK-LABEL: stack_fold_fnmsub231ph: 1019; CHECK: # %bb.0: 1020; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1021; CHECK-NEXT: #APP 1022; CHECK-NEXT: nop 1023; CHECK-NEXT: #NO_APP 1024; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 1025; CHECK-NEXT: retq 1026 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1027 %2 = fneg <32 x half> %a1 1028 %3 = fneg <32 x half> %a0 1029 %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %3) 1030 ret <32 x half> %4 1031} 1032 1033define <32 x half> @stack_fold_fnmsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { 1034; CHECK-LABEL: stack_fold_fnmsub321ph: 1035; CHECK: # %bb.0: 1036; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1037; CHECK-NEXT: #APP 1038; CHECK-NEXT: nop 1039; CHECK-NEXT: #NO_APP 1040; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 1041; CHECK-NEXT: retq 1042 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1043 %2 = fneg <32 x half> %a2 1044 %3 = fneg <32 x half> %a0 1045 %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %3) 1046 ret <32 x half> %4 1047} 1048 1049define <32 x half> @stack_fold_fnmsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { 1050; CHECK-LABEL: stack_fold_fnmsub132ph: 
1051; CHECK: # %bb.0: 1052; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1053; CHECK-NEXT: #APP 1054; CHECK-NEXT: nop 1055; CHECK-NEXT: #NO_APP 1056; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 1057; CHECK-NEXT: retq 1058 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1059 %2 = fneg <32 x half> %a0 1060 %3 = fneg <32 x half> %a1 1061 %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %3) 1062 ret <32 x half> %4 1063} 1064 1065define <32 x half> @stack_fold_fnmsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { 1066; CHECK-LABEL: stack_fold_fnmsub312ph: 1067; CHECK: # %bb.0: 1068; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1069; CHECK-NEXT: #APP 1070; CHECK-NEXT: nop 1071; CHECK-NEXT: #NO_APP 1072; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 1073; CHECK-NEXT: retq 1074 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1075 %2 = fneg <32 x half> %a2 1076 %3 = fneg <32 x half> %a1 1077 %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %3) 1078 ret <32 x half> %4 1079} 1080 1081define <32 x half> @stack_fold_fnmsub123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { 1082; CHECK-LABEL: stack_fold_fnmsub123ph_mask: 1083; CHECK: # %bb.0: 1084; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1085; 
CHECK-NEXT: #APP 1086; CHECK-NEXT: nop 1087; CHECK-NEXT: #NO_APP 1088; CHECK-NEXT: vmovaps (%rdi), %zmm2 1089; CHECK-NEXT: kmovd %esi, %k1 1090; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 1091; CHECK-NEXT: vmovaps %zmm2, %zmm0 1092; CHECK-NEXT: retq 1093 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1094 %a0 = load <32 x half>, ptr %p 1095 %neg = fneg <32 x half> %a2 1096 %neg1 = fneg <32 x half> %a0 1097 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg) 1098 %3 = bitcast i32 %mask to <32 x i1> 1099 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 1100 ret <32 x half> %4 1101} 1102 1103define <32 x half> @stack_fold_fnmsub213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { 1104; CHECK-LABEL: stack_fold_fnmsub213ph_mask: 1105; CHECK: # %bb.0: 1106; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1107; CHECK-NEXT: #APP 1108; CHECK-NEXT: nop 1109; CHECK-NEXT: #NO_APP 1110; CHECK-NEXT: vmovaps (%rdi), %zmm2 1111; CHECK-NEXT: kmovd %esi, %k1 1112; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 1113; CHECK-NEXT: vmovaps %zmm2, %zmm0 1114; CHECK-NEXT: retq 1115 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1116 %a0 = load <32 x half>, ptr %p 1117 %neg = fneg <32 x half> %a2 1118 %neg1 = fneg <32 x half> %a1 1119 %2 = call <32 x half> 
@llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg) 1120 %3 = bitcast i32 %mask to <32 x i1> 1121 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 1122 ret <32 x half> %4 1123} 1124 1125define <32 x half> @stack_fold_fnmsub231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { 1126; CHECK-LABEL: stack_fold_fnmsub231ph_mask: 1127; CHECK: # %bb.0: 1128; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1129; CHECK-NEXT: #APP 1130; CHECK-NEXT: nop 1131; CHECK-NEXT: #NO_APP 1132; CHECK-NEXT: vmovaps (%rdi), %zmm2 1133; CHECK-NEXT: kmovd %esi, %k1 1134; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 1135; CHECK-NEXT: vmovaps %zmm2, %zmm0 1136; CHECK-NEXT: retq 1137 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1138 %a0 = load <32 x half>, ptr %p 1139 %neg = fneg <32 x half> %a0 1140 %neg1 = fneg <32 x half> %a1 1141 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg) 1142 %3 = bitcast i32 %mask to <32 x i1> 1143 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 1144 ret <32 x half> %4 1145} 1146 1147define <32 x half> @stack_fold_fnmsub321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { 1148; CHECK-LABEL: stack_fold_fnmsub321ph_mask: 1149; CHECK: # %bb.0: 1150; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1151; CHECK-NEXT: #APP 1152; CHECK-NEXT: nop 1153; CHECK-NEXT: #NO_APP 1154; CHECK-NEXT: vmovaps (%rdi), %zmm2 1155; CHECK-NEXT: kmovd %esi, %k1 1156; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 1157; CHECK-NEXT: vmovaps %zmm2, %zmm0 1158; CHECK-NEXT: retq 
1159 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1160 %a0 = load <32 x half>, ptr %p 1161 %neg = fneg <32 x half> %a0 1162 %neg1 = fneg <32 x half> %a2 1163 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg) 1164 %3 = bitcast i32 %mask to <32 x i1> 1165 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 1166 ret <32 x half> %4 1167} 1168 1169define <32 x half> @stack_fold_fnmsub132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { 1170; CHECK-LABEL: stack_fold_fnmsub132ph_mask: 1171; CHECK: # %bb.0: 1172; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1173; CHECK-NEXT: #APP 1174; CHECK-NEXT: nop 1175; CHECK-NEXT: #NO_APP 1176; CHECK-NEXT: vmovaps (%rdi), %zmm2 1177; CHECK-NEXT: kmovd %esi, %k1 1178; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 1179; CHECK-NEXT: vmovaps %zmm2, %zmm0 1180; CHECK-NEXT: retq 1181 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1182 %a0 = load <32 x half>, ptr %p 1183 %neg = fneg <32 x half> %a1 1184 %neg1 = fneg <32 x half> %a0 1185 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg) 1186 %3 = bitcast i32 %mask to <32 x i1> 1187 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 1188 ret <32 x half> %4 1189} 1190 1191define <32 x half> @stack_fold_fnmsub312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 
%mask) { 1192; CHECK-LABEL: stack_fold_fnmsub312ph_mask: 1193; CHECK: # %bb.0: 1194; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1195; CHECK-NEXT: #APP 1196; CHECK-NEXT: nop 1197; CHECK-NEXT: #NO_APP 1198; CHECK-NEXT: vmovaps (%rdi), %zmm2 1199; CHECK-NEXT: kmovd %esi, %k1 1200; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 1201; CHECK-NEXT: vmovaps %zmm2, %zmm0 1202; CHECK-NEXT: retq 1203 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1204 %a0 = load <32 x half>, ptr %p 1205 %neg = fneg <32 x half> %a1 1206 %neg1 = fneg <32 x half> %a2 1207 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg) 1208 %3 = bitcast i32 %mask to <32 x i1> 1209 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 1210 ret <32 x half> %4 1211} 1212 1213define <32 x half> @stack_fold_fnmsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) { 1214; CHECK-LABEL: stack_fold_fnmsub123ph_maskz: 1215; CHECK: # %bb.0: 1216; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1217; CHECK-NEXT: #APP 1218; CHECK-NEXT: nop 1219; CHECK-NEXT: #NO_APP 1220; CHECK-NEXT: kmovd (%rdi), %k1 1221; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload 1222; CHECK-NEXT: retq 1223 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1224 %neg = fneg <32 x half> %a2 1225 %neg1 = fneg 
<32 x half> %a0 1226 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg) 1227 %3 = load i32, ptr %mask 1228 %4 = bitcast i32 %3 to <32 x i1> 1229 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer 1230 ret <32 x half> %5 1231} 1232 1233define <32 x half> @stack_fold_fnmsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) { 1234; CHECK-LABEL: stack_fold_fnmsub213ph_maskz: 1235; CHECK: # %bb.0: 1236; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1237; CHECK-NEXT: #APP 1238; CHECK-NEXT: nop 1239; CHECK-NEXT: #NO_APP 1240; CHECK-NEXT: kmovd (%rdi), %k1 1241; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload 1242; CHECK-NEXT: retq 1243 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1244 %neg = fneg <32 x half> %a2 1245 %neg1 = fneg <32 x half> %a1 1246 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg) 1247 %3 = load i32, ptr %mask 1248 %4 = bitcast i32 %3 to <32 x i1> 1249 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer 1250 ret <32 x half> %5 1251} 1252 1253define <32 x half> @stack_fold_fnmsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) { 1254; CHECK-LABEL: stack_fold_fnmsub231ph_maskz: 1255; CHECK: # %bb.0: 1256; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1257; CHECK-NEXT: #APP 1258; CHECK-NEXT: nop 1259; CHECK-NEXT: #NO_APP 1260; CHECK-NEXT: kmovd (%rdi), %k1 1261; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload 1262; CHECK-NEXT: retq 1263 %1 = tail call <2 x i64> asm sideeffect 
"nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1264 %neg = fneg <32 x half> %a0 1265 %neg1 = fneg <32 x half> %a1 1266 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg) 1267 %3 = load i32, ptr %mask 1268 %4 = bitcast i32 %3 to <32 x i1> 1269 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer 1270 ret <32 x half> %5 1271} 1272 1273define <32 x half> @stack_fold_fnmsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) { 1274; CHECK-LABEL: stack_fold_fnmsub321ph_maskz: 1275; CHECK: # %bb.0: 1276; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1277; CHECK-NEXT: #APP 1278; CHECK-NEXT: nop 1279; CHECK-NEXT: #NO_APP 1280; CHECK-NEXT: kmovd (%rdi), %k1 1281; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload 1282; CHECK-NEXT: retq 1283 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1284 %neg = fneg <32 x half> %a0 1285 %neg1 = fneg <32 x half> %a2 1286 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg) 1287 %3 = load i32, ptr %mask 1288 %4 = bitcast i32 %3 to <32 x i1> 1289 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer 1290 ret <32 x half> %5 1291} 1292 1293define <32 x half> @stack_fold_fnmsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) { 1294; CHECK-LABEL: stack_fold_fnmsub132ph_maskz: 1295; CHECK: # %bb.0: 1296; CHECK-NEXT: vmovups 
%zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1297; CHECK-NEXT: #APP 1298; CHECK-NEXT: nop 1299; CHECK-NEXT: #NO_APP 1300; CHECK-NEXT: kmovd (%rdi), %k1 1301; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload 1302; CHECK-NEXT: retq 1303 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1304 %neg = fneg <32 x half> %a1 1305 %neg1 = fneg <32 x half> %a0 1306 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg) 1307 %3 = load i32, ptr %mask 1308 %4 = bitcast i32 %3 to <32 x i1> 1309 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer 1310 ret <32 x half> %5 1311} 1312 1313define <32 x half> @stack_fold_fnmsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) { 1314; CHECK-LABEL: stack_fold_fnmsub312ph_maskz: 1315; CHECK: # %bb.0: 1316; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1317; CHECK-NEXT: #APP 1318; CHECK-NEXT: nop 1319; CHECK-NEXT: #NO_APP 1320; CHECK-NEXT: kmovd (%rdi), %k1 1321; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload 1322; CHECK-NEXT: retq 1323 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1324 %neg = fneg <32 x half> %a1 1325 %neg1 = fneg <32 x half> %a2 1326 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg) 1327 %3 = load i32, ptr %mask 1328 %4 = bitcast i32 %3 to <32 x i1> 
1329 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer 1330 ret <32 x half> %5 1331} 1332 1333define half @stack_fold_fmadd123sh(half %a0, half %a1, half %a2) { 1334; CHECK-LABEL: stack_fold_fmadd123sh: 1335; CHECK: # %bb.0: 1336; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1337; CHECK-NEXT: #APP 1338; CHECK-NEXT: nop 1339; CHECK-NEXT: #NO_APP 1340; CHECK-NEXT: vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload 1341; CHECK-NEXT: retq 1342 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1343 %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2) 1344 ret half %2 1345} 1346declare half @llvm.fma.f16(half, half, half) 1347 1348define half @stack_fold_fmadd213sh(half %a0, half %a1, half %a2) { 1349; CHECK-LABEL: stack_fold_fmadd213sh: 1350; CHECK: # %bb.0: 1351; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1352; CHECK-NEXT: #APP 1353; CHECK-NEXT: nop 1354; CHECK-NEXT: #NO_APP 1355; CHECK-NEXT: vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload 1356; CHECK-NEXT: retq 1357 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1358 %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2) 1359 ret half %2 1360} 1361 1362define half @stack_fold_fmadd231sh(half %a0, half %a1, half %a2) { 1363; CHECK-LABEL: stack_fold_fmadd231sh: 1364; CHECK: # %bb.0: 1365; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1366; CHECK-NEXT: 
#APP 1367; CHECK-NEXT: nop 1368; CHECK-NEXT: #NO_APP 1369; CHECK-NEXT: vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload 1370; CHECK-NEXT: retq 1371 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1372 %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0) 1373 ret half %2 1374} 1375 1376define half @stack_fold_fmadd321sh(half %a0, half %a1, half %a2) { 1377; CHECK-LABEL: stack_fold_fmadd321sh: 1378; CHECK: # %bb.0: 1379; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1380; CHECK-NEXT: #APP 1381; CHECK-NEXT: nop 1382; CHECK-NEXT: #NO_APP 1383; CHECK-NEXT: vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload 1384; CHECK-NEXT: retq 1385 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1386 %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0) 1387 ret half %2 1388} 1389 1390define half @stack_fold_fmadd132sh(half %a0, half %a1, half %a2) { 1391; CHECK-LABEL: stack_fold_fmadd132sh: 1392; CHECK: # %bb.0: 1393; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1394; CHECK-NEXT: #APP 1395; CHECK-NEXT: nop 1396; CHECK-NEXT: #NO_APP 1397; CHECK-NEXT: vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload 1398; CHECK-NEXT: retq 1399 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1400 %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1) 1401 ret half %2 1402} 1403 1404define half @stack_fold_fmadd312sh(half %a0, half %a1, half %a2) { 1405; CHECK-LABEL: stack_fold_fmadd312sh: 1406; CHECK: # %bb.0: 1407; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1408; CHECK-NEXT: #APP 1409; CHECK-NEXT: nop 1410; CHECK-NEXT: #NO_APP 1411; CHECK-NEXT: vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload 1412; CHECK-NEXT: retq 1413 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1414 %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1) 1415 ret half %2 1416} 1417 1418define half @stack_fold_fmsub123sh(half %a0, half %a1, half %a2) { 1419; CHECK-LABEL: stack_fold_fmsub123sh: 1420; CHECK: # %bb.0: 1421; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1422; CHECK-NEXT: #APP 1423; CHECK-NEXT: nop 1424; CHECK-NEXT: #NO_APP 1425; CHECK-NEXT: vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload 1426; CHECK-NEXT: retq 1427 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1428 %2 = fneg half %a2 1429 %3 = call half @llvm.fma.f16(half %a0, half %a1, half %2) 1430 ret half %3 1431} 1432 
; Scalar fmsub tests: one multiplicand order per function, each negating the
; addend (%2 = fneg of the added operand) so llc selects the vfmsub{213,231,132}sh
; memory-folded form shown in the CHECK lines.

define half @stack_fold_fmsub213sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmsub213sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a2
  %3 = call half @llvm.fma.f16(half %a1, half %a0, half %2)
  ret half %3
}

define half @stack_fold_fmsub231sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmsub231sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a0
  %3 = call half @llvm.fma.f16(half %a1, half %a2, half %2)
  ret half %3
}

; 321 is the commuted twin of 231 (multiplicands swapped); it must fold to the
; same vfmsub231sh instruction.
define half @stack_fold_fmsub321sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmsub321sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a0
  %3 = call half @llvm.fma.f16(half %a2, half %a1, half %2)
  ret half %3
}

define half @stack_fold_fmsub132sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmsub132sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a1
  %3 = call half @llvm.fma.f16(half %a0, half %a2, half %2)
  ret half %3
}

define half @stack_fold_fmsub312sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmsub312sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a1
  %3 = call half @llvm.fma.f16(half %a2, half %a0, half %2)
  ret half %3
}

; Scalar fnmadd tests: the first multiplicand is negated (%2 = fneg), which
; should select the vfnmadd{213,231,132}sh memory-folded form.

define half @stack_fold_fnmadd123sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmadd123sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a0
  %3 = call half @llvm.fma.f16(half %2, half %a1, half %a2)
  ret half %3
}

define half @stack_fold_fnmadd213sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmadd213sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a1
  %3 = call half @llvm.fma.f16(half %2, half %a0, half %a2)
  ret half %3
}

define half @stack_fold_fnmadd231sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmadd231sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a1
  %3 = call half @llvm.fma.f16(half %2, half %a2, half %a0)
  ret half %3
}

define half @stack_fold_fnmadd321sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmadd321sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a2
  %3 = call half @llvm.fma.f16(half %2, half %a1, half %a0)
  ret half %3
}

define half @stack_fold_fnmadd132sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmadd132sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a0
  %3 = call half @llvm.fma.f16(half %2, half %a2, half %a1)
  ret half %3
}

define half @stack_fold_fnmadd312sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmadd312sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a2
  %3 = call half @llvm.fma.f16(half %2, half %a0, half %a1)
  ret half %3
}

; Scalar fnmsub tests: both a multiplicand and the addend are negated
; (%2 and %3 = fneg), which should select the vfnmsub{213,231,132}sh
; memory-folded form.

define half @stack_fold_fnmsub123sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmsub123sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a0
  %3 = fneg half %a2
  %4 = call half @llvm.fma.f16(half %2, half %a1, half %3)
  ret half %4
}

define half @stack_fold_fnmsub213sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmsub213sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a1
  %3 = fneg half %a2
  %4 = call half @llvm.fma.f16(half %2, half %a0, half %3)
  ret half %4
}

define half @stack_fold_fnmsub231sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmsub231sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a1
  %3 = fneg half %a0
  %4 = call half @llvm.fma.f16(half %2, half %a2, half %3)
  ret half %4
}

define half @stack_fold_fnmsub321sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmsub321sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a2
  %3 = fneg half %a0
  %4 = call half @llvm.fma.f16(half %2, half %a1, half %3)
  ret half %4
}

define half @stack_fold_fnmsub132sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmsub132sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a0
  %3 = fneg half %a1
  %4 = call half @llvm.fma.f16(half %2, half %a2, half %3)
  ret half %4
}

define half @stack_fold_fnmsub312sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmsub312sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a2
  %3 = fneg half %a1
  %4 = call half @llvm.fma.f16(half %2, half %a0, half %3)
  ret half %4
}

; "_int" variants: the scalar fma operates on element 0 of <8 x half> vectors
; and the result is inserted back into %a0v, matching the intrinsic-style
; patterns that fold to a 16-byte (full xmm) reload.

define <8 x half> @stack_fold_fmadd123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmadd123sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmadd213sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmadd231sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmadd321sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmadd132sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

1784define <8 x half> @stack_fold_fmadd312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 1785; CHECK-LABEL: stack_fold_fmadd312sh_int: 1786; CHECK: # %bb.0: 1787; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1788; CHECK-NEXT: #APP 1789; CHECK-NEXT: nop 1790; CHECK-NEXT: #NO_APP 1791; CHECK-NEXT: vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1792; CHECK-NEXT: retq 1793 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1794 %a0 = extractelement <8 x half> %a0v, i64 0 1795 %a1 = extractelement <8 x half> %a1v, i64 0 1796 %a2 = extractelement <8 x half> %a2v, i64 0 1797 %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1) 1798 %res = insertelement <8 x half> %a0v, half %2, i64 0 1799 ret <8 x half> %res 1800} 1801 1802define <8 x half> @stack_fold_fmsub123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 1803; CHECK-LABEL: stack_fold_fmsub123sh_int: 1804; CHECK: # %bb.0: 1805; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1806; CHECK-NEXT: #APP 1807; CHECK-NEXT: nop 1808; CHECK-NEXT: #NO_APP 1809; CHECK-NEXT: vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1810; CHECK-NEXT: retq 1811 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1812 %a0 = extractelement <8 x half> %a0v, i64 0 1813 %a1 = extractelement <8 x half> %a1v, i64 0 1814 %a2 = extractelement <8 x half> %a2v, i64 0 1815 %neg = fneg half %a2 1816 %2 = call 
half @llvm.fma.f16(half %a0, half %a1, half %neg) 1817 %res = insertelement <8 x half> %a0v, half %2, i64 0 1818 ret <8 x half> %res 1819} 1820 1821define <8 x half> @stack_fold_fmsub213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 1822; CHECK-LABEL: stack_fold_fmsub213sh_int: 1823; CHECK: # %bb.0: 1824; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1825; CHECK-NEXT: #APP 1826; CHECK-NEXT: nop 1827; CHECK-NEXT: #NO_APP 1828; CHECK-NEXT: vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1829; CHECK-NEXT: retq 1830 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1831 %a0 = extractelement <8 x half> %a0v, i64 0 1832 %a1 = extractelement <8 x half> %a1v, i64 0 1833 %a2 = extractelement <8 x half> %a2v, i64 0 1834 %neg = fneg half %a2 1835 %2 = call half @llvm.fma.f16(half %a1, half %a0, half %neg) 1836 %res = insertelement <8 x half> %a0v, half %2, i64 0 1837 ret <8 x half> %res 1838} 1839 1840define <8 x half> @stack_fold_fmsub231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 1841; CHECK-LABEL: stack_fold_fmsub231sh_int: 1842; CHECK: # %bb.0: 1843; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1844; CHECK-NEXT: #APP 1845; CHECK-NEXT: nop 1846; CHECK-NEXT: #NO_APP 1847; CHECK-NEXT: vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1848; CHECK-NEXT: retq 1849 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1850 %a0 = 
extractelement <8 x half> %a0v, i64 0 1851 %a1 = extractelement <8 x half> %a1v, i64 0 1852 %a2 = extractelement <8 x half> %a2v, i64 0 1853 %neg = fneg half %a0 1854 %2 = call half @llvm.fma.f16(half %a1, half %a2, half %neg) 1855 %res = insertelement <8 x half> %a0v, half %2, i64 0 1856 ret <8 x half> %res 1857} 1858 1859define <8 x half> @stack_fold_fmsub321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 1860; CHECK-LABEL: stack_fold_fmsub321sh_int: 1861; CHECK: # %bb.0: 1862; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1863; CHECK-NEXT: #APP 1864; CHECK-NEXT: nop 1865; CHECK-NEXT: #NO_APP 1866; CHECK-NEXT: vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1867; CHECK-NEXT: retq 1868 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1869 %a0 = extractelement <8 x half> %a0v, i64 0 1870 %a1 = extractelement <8 x half> %a1v, i64 0 1871 %a2 = extractelement <8 x half> %a2v, i64 0 1872 %neg = fneg half %a0 1873 %2 = call half @llvm.fma.f16(half %a2, half %a1, half %neg) 1874 %res = insertelement <8 x half> %a0v, half %2, i64 0 1875 ret <8 x half> %res 1876} 1877 1878define <8 x half> @stack_fold_fmsub132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 1879; CHECK-LABEL: stack_fold_fmsub132sh_int: 1880; CHECK: # %bb.0: 1881; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1882; CHECK-NEXT: #APP 1883; CHECK-NEXT: nop 1884; CHECK-NEXT: #NO_APP 1885; CHECK-NEXT: vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1886; CHECK-NEXT: retq 1887 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1888 %a0 = extractelement <8 x half> %a0v, i64 0 1889 %a1 = extractelement <8 x half> %a1v, i64 0 1890 %a2 = extractelement <8 x half> %a2v, i64 0 1891 %neg = fneg half %a1 1892 %2 = call half @llvm.fma.f16(half %a0, half %a2, half %neg) 1893 %res = insertelement <8 x half> %a0v, half %2, i64 0 1894 ret <8 x half> %res 1895} 1896 1897define <8 x half> @stack_fold_fmsub312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 1898; CHECK-LABEL: stack_fold_fmsub312sh_int: 1899; CHECK: # %bb.0: 1900; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1901; CHECK-NEXT: #APP 1902; CHECK-NEXT: nop 1903; CHECK-NEXT: #NO_APP 1904; CHECK-NEXT: vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1905; CHECK-NEXT: retq 1906 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1907 %a0 = extractelement <8 x half> %a0v, i64 0 1908 %a1 = extractelement <8 x half> %a1v, i64 0 1909 %a2 = extractelement <8 x half> %a2v, i64 0 1910 %neg = fneg half %a1 1911 %2 = call half @llvm.fma.f16(half %a2, half %a0, half %neg) 1912 %res = insertelement <8 x half> %a0v, half %2, i64 0 1913 ret <8 x half> %res 1914} 1915 1916define <8 x half> @stack_fold_fnmadd123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 1917; CHECK-LABEL: stack_fold_fnmadd123sh_int: 1918; CHECK: # %bb.0: 1919; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1920; CHECK-NEXT: #APP 1921; CHECK-NEXT: nop 1922; CHECK-NEXT: 
#NO_APP 1923; CHECK-NEXT: vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1924; CHECK-NEXT: retq 1925 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1926 %a0 = extractelement <8 x half> %a0v, i64 0 1927 %a1 = extractelement <8 x half> %a1v, i64 0 1928 %a2 = extractelement <8 x half> %a2v, i64 0 1929 %neg1 = fneg half %a0 1930 %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a2) 1931 %res = insertelement <8 x half> %a0v, half %2, i64 0 1932 ret <8 x half> %res 1933} 1934 1935define <8 x half> @stack_fold_fnmadd213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 1936; CHECK-LABEL: stack_fold_fnmadd213sh_int: 1937; CHECK: # %bb.0: 1938; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1939; CHECK-NEXT: #APP 1940; CHECK-NEXT: nop 1941; CHECK-NEXT: #NO_APP 1942; CHECK-NEXT: vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1943; CHECK-NEXT: retq 1944 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1945 %a0 = extractelement <8 x half> %a0v, i64 0 1946 %a1 = extractelement <8 x half> %a1v, i64 0 1947 %a2 = extractelement <8 x half> %a2v, i64 0 1948 %neg1 = fneg half %a1 1949 %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a2) 1950 %res = insertelement <8 x half> %a0v, half %2, i64 0 1951 ret <8 x half> %res 1952} 1953 1954define <8 x half> @stack_fold_fnmadd231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 1955; CHECK-LABEL: 
stack_fold_fnmadd231sh_int: 1956; CHECK: # %bb.0: 1957; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1958; CHECK-NEXT: #APP 1959; CHECK-NEXT: nop 1960; CHECK-NEXT: #NO_APP 1961; CHECK-NEXT: vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1962; CHECK-NEXT: retq 1963 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1964 %a0 = extractelement <8 x half> %a0v, i64 0 1965 %a1 = extractelement <8 x half> %a1v, i64 0 1966 %a2 = extractelement <8 x half> %a2v, i64 0 1967 %neg1 = fneg half %a1 1968 %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a0) 1969 %res = insertelement <8 x half> %a0v, half %2, i64 0 1970 ret <8 x half> %res 1971} 1972 1973define <8 x half> @stack_fold_fnmadd321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 1974; CHECK-LABEL: stack_fold_fnmadd321sh_int: 1975; CHECK: # %bb.0: 1976; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1977; CHECK-NEXT: #APP 1978; CHECK-NEXT: nop 1979; CHECK-NEXT: #NO_APP 1980; CHECK-NEXT: vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1981; CHECK-NEXT: retq 1982 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1983 %a0 = extractelement <8 x half> %a0v, i64 0 1984 %a1 = extractelement <8 x half> %a1v, i64 0 1985 %a2 = extractelement <8 x half> %a2v, i64 0 1986 %neg1 = fneg half %a2 1987 %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a0) 1988 %res = insertelement <8 x 
half> %a0v, half %2, i64 0 1989 ret <8 x half> %res 1990} 1991 1992define <8 x half> @stack_fold_fnmadd132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 1993; CHECK-LABEL: stack_fold_fnmadd132sh_int: 1994; CHECK: # %bb.0: 1995; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1996; CHECK-NEXT: #APP 1997; CHECK-NEXT: nop 1998; CHECK-NEXT: #NO_APP 1999; CHECK-NEXT: vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 2000; CHECK-NEXT: retq 2001 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2002 %a0 = extractelement <8 x half> %a0v, i64 0 2003 %a1 = extractelement <8 x half> %a1v, i64 0 2004 %a2 = extractelement <8 x half> %a2v, i64 0 2005 %neg1 = fneg half %a0 2006 %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a1) 2007 %res = insertelement <8 x half> %a0v, half %2, i64 0 2008 ret <8 x half> %res 2009} 2010 2011define <8 x half> @stack_fold_fnmadd312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 2012; CHECK-LABEL: stack_fold_fnmadd312sh_int: 2013; CHECK: # %bb.0: 2014; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2015; CHECK-NEXT: #APP 2016; CHECK-NEXT: nop 2017; CHECK-NEXT: #NO_APP 2018; CHECK-NEXT: vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 2019; CHECK-NEXT: retq 2020 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2021 %a0 = extractelement <8 x half> %a0v, i64 0 2022 %a1 = extractelement <8 x half> 
%a1v, i64 0 2023 %a2 = extractelement <8 x half> %a2v, i64 0 2024 %neg1 = fneg half %a2 2025 %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a1) 2026 %res = insertelement <8 x half> %a0v, half %2, i64 0 2027 ret <8 x half> %res 2028} 2029 2030define <8 x half> @stack_fold_fnmsub123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 2031; CHECK-LABEL: stack_fold_fnmsub123sh_int: 2032; CHECK: # %bb.0: 2033; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2034; CHECK-NEXT: #APP 2035; CHECK-NEXT: nop 2036; CHECK-NEXT: #NO_APP 2037; CHECK-NEXT: vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 2038; CHECK-NEXT: retq 2039 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2040 %a0 = extractelement <8 x half> %a0v, i64 0 2041 %a1 = extractelement <8 x half> %a1v, i64 0 2042 %a2 = extractelement <8 x half> %a2v, i64 0 2043 %neg = fneg half %a2 2044 %neg1 = fneg half %a0 2045 %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg) 2046 %res = insertelement <8 x half> %a0v, half %2, i64 0 2047 ret <8 x half> %res 2048} 2049 2050define <8 x half> @stack_fold_fnmsub213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 2051; CHECK-LABEL: stack_fold_fnmsub213sh_int: 2052; CHECK: # %bb.0: 2053; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2054; CHECK-NEXT: #APP 2055; CHECK-NEXT: nop 2056; CHECK-NEXT: #NO_APP 2057; CHECK-NEXT: vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 2058; CHECK-NEXT: retq 2059 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2060 %a0 = extractelement <8 x half> %a0v, i64 0 2061 %a1 = extractelement <8 x half> %a1v, i64 0 2062 %a2 = extractelement <8 x half> %a2v, i64 0 2063 %neg = fneg half %a2 2064 %neg1 = fneg half %a1 2065 %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg) 2066 %res = insertelement <8 x half> %a0v, half %2, i64 0 2067 ret <8 x half> %res 2068} 2069 2070define <8 x half> @stack_fold_fnmsub231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 2071; CHECK-LABEL: stack_fold_fnmsub231sh_int: 2072; CHECK: # %bb.0: 2073; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2074; CHECK-NEXT: #APP 2075; CHECK-NEXT: nop 2076; CHECK-NEXT: #NO_APP 2077; CHECK-NEXT: vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 2078; CHECK-NEXT: retq 2079 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2080 %a0 = extractelement <8 x half> %a0v, i64 0 2081 %a1 = extractelement <8 x half> %a1v, i64 0 2082 %a2 = extractelement <8 x half> %a2v, i64 0 2083 %neg = fneg half %a0 2084 %neg1 = fneg half %a1 2085 %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg) 2086 %res = insertelement <8 x half> %a0v, half %2, i64 0 2087 ret <8 x half> %res 2088} 2089 2090define <8 x half> @stack_fold_fnmsub321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 2091; CHECK-LABEL: stack_fold_fnmsub321sh_int: 2092; CHECK: # %bb.0: 2093; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2094; 
CHECK-NEXT: #APP 2095; CHECK-NEXT: nop 2096; CHECK-NEXT: #NO_APP 2097; CHECK-NEXT: vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 2098; CHECK-NEXT: retq 2099 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2100 %a0 = extractelement <8 x half> %a0v, i64 0 2101 %a1 = extractelement <8 x half> %a1v, i64 0 2102 %a2 = extractelement <8 x half> %a2v, i64 0 2103 %neg = fneg half %a0 2104 %neg1 = fneg half %a2 2105 %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg) 2106 %res = insertelement <8 x half> %a0v, half %2, i64 0 2107 ret <8 x half> %res 2108} 2109 2110define <8 x half> @stack_fold_fnmsub132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 2111; CHECK-LABEL: stack_fold_fnmsub132sh_int: 2112; CHECK: # %bb.0: 2113; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2114; CHECK-NEXT: #APP 2115; CHECK-NEXT: nop 2116; CHECK-NEXT: #NO_APP 2117; CHECK-NEXT: vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 2118; CHECK-NEXT: retq 2119 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2120 %a0 = extractelement <8 x half> %a0v, i64 0 2121 %a1 = extractelement <8 x half> %a1v, i64 0 2122 %a2 = extractelement <8 x half> %a2v, i64 0 2123 %neg = fneg half %a1 2124 %neg1 = fneg half %a0 2125 %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg) 2126 %res = insertelement <8 x half> %a0v, half %2, i64 0 2127 ret <8 x half> %res 2128} 2129 2130define <8 x 
half> @stack_fold_fnmsub312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) { 2131; CHECK-LABEL: stack_fold_fnmsub312sh_int: 2132; CHECK: # %bb.0: 2133; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2134; CHECK-NEXT: #APP 2135; CHECK-NEXT: nop 2136; CHECK-NEXT: #NO_APP 2137; CHECK-NEXT: vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 2138; CHECK-NEXT: retq 2139 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2140 %a0 = extractelement <8 x half> %a0v, i64 0 2141 %a1 = extractelement <8 x half> %a1v, i64 0 2142 %a2 = extractelement <8 x half> %a2v, i64 0 2143 %neg = fneg half %a1 2144 %neg1 = fneg half %a2 2145 %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg) 2146 %res = insertelement <8 x half> %a0v, half %2, i64 0 2147 ret <8 x half> %res 2148} 2149 2150define <8 x half> @stack_fold_fmadd123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2151; CHECK-LABEL: stack_fold_fmadd123sh_intk: 2152; CHECK: # %bb.0: 2153; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2154; CHECK-NEXT: #APP 2155; CHECK-NEXT: nop 2156; CHECK-NEXT: #NO_APP 2157; CHECK-NEXT: kmovb (%rdi), %k1 2158; CHECK-NEXT: vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2159; CHECK-NEXT: retq 2160 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2161 %a0 = extractelement <8 x half> %a0v, i64 0 2162 %a1 = extractelement <8 x half> 
%a1v, i64 0 2163 %a2 = extractelement <8 x half> %a2v, i64 0 2164 %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2) 2165 %3 = load i8, ptr %mask 2166 %4 = bitcast i8 %3 to <8 x i1> 2167 %5 = extractelement <8 x i1> %4, i64 0 2168 %6 = select i1 %5, half %2, half %a0 2169 %res = insertelement <8 x half> %a0v, half %6, i64 0 2170 ret <8 x half> %res 2171} 2172 2173define <8 x half> @stack_fold_fmadd213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2174; CHECK-LABEL: stack_fold_fmadd213sh_intk: 2175; CHECK: # %bb.0: 2176; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2177; CHECK-NEXT: #APP 2178; CHECK-NEXT: nop 2179; CHECK-NEXT: #NO_APP 2180; CHECK-NEXT: kmovb (%rdi), %k1 2181; CHECK-NEXT: vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2182; CHECK-NEXT: retq 2183 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2184 %a0 = extractelement <8 x half> %a0v, i64 0 2185 %a1 = extractelement <8 x half> %a1v, i64 0 2186 %a2 = extractelement <8 x half> %a2v, i64 0 2187 %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2) 2188 %3 = load i8, ptr %mask 2189 %4 = bitcast i8 %3 to <8 x i1> 2190 %5 = extractelement <8 x i1> %4, i64 0 2191 %6 = select i1 %5, half %2, half %a0 2192 %res = insertelement <8 x half> %a0v, half %6, i64 0 2193 ret <8 x half> %res 2194} 2195 2196define <8 x half> @stack_fold_fmadd231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2197; CHECK-LABEL: stack_fold_fmadd231sh_intk: 2198; CHECK: # %bb.0: 2199; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2200; CHECK-NEXT: #APP 2201; CHECK-NEXT: nop 2202; CHECK-NEXT: #NO_APP 2203; CHECK-NEXT: kmovb (%rdi), %k1 
2204; CHECK-NEXT: vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2205; CHECK-NEXT: retq 2206 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2207 %a0 = extractelement <8 x half> %a0v, i64 0 2208 %a1 = extractelement <8 x half> %a1v, i64 0 2209 %a2 = extractelement <8 x half> %a2v, i64 0 2210 %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0) 2211 %3 = load i8, ptr %mask 2212 %4 = bitcast i8 %3 to <8 x i1> 2213 %5 = extractelement <8 x i1> %4, i64 0 2214 %6 = select i1 %5, half %2, half %a0 2215 %res = insertelement <8 x half> %a0v, half %6, i64 0 2216 ret <8 x half> %res 2217} 2218 2219define <8 x half> @stack_fold_fmadd321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2220; CHECK-LABEL: stack_fold_fmadd321sh_intk: 2221; CHECK: # %bb.0: 2222; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2223; CHECK-NEXT: #APP 2224; CHECK-NEXT: nop 2225; CHECK-NEXT: #NO_APP 2226; CHECK-NEXT: kmovb (%rdi), %k1 2227; CHECK-NEXT: vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2228; CHECK-NEXT: retq 2229 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2230 %a0 = extractelement <8 x half> %a0v, i64 0 2231 %a1 = extractelement <8 x half> %a1v, i64 0 2232 %a2 = extractelement <8 x half> %a2v, i64 0 2233 %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0) 2234 %3 = load i8, ptr %mask 2235 %4 = bitcast i8 %3 to <8 x i1> 2236 %5 = 
extractelement <8 x i1> %4, i64 0 2237 %6 = select i1 %5, half %2, half %a0 2238 %res = insertelement <8 x half> %a0v, half %6, i64 0 2239 ret <8 x half> %res 2240} 2241 2242define <8 x half> @stack_fold_fmadd132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2243; CHECK-LABEL: stack_fold_fmadd132sh_intk: 2244; CHECK: # %bb.0: 2245; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2246; CHECK-NEXT: #APP 2247; CHECK-NEXT: nop 2248; CHECK-NEXT: #NO_APP 2249; CHECK-NEXT: kmovb (%rdi), %k1 2250; CHECK-NEXT: vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2251; CHECK-NEXT: retq 2252 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2253 %a0 = extractelement <8 x half> %a0v, i64 0 2254 %a1 = extractelement <8 x half> %a1v, i64 0 2255 %a2 = extractelement <8 x half> %a2v, i64 0 2256 %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1) 2257 %3 = load i8, ptr %mask 2258 %4 = bitcast i8 %3 to <8 x i1> 2259 %5 = extractelement <8 x i1> %4, i64 0 2260 %6 = select i1 %5, half %2, half %a0 2261 %res = insertelement <8 x half> %a0v, half %6, i64 0 2262 ret <8 x half> %res 2263} 2264 2265define <8 x half> @stack_fold_fmadd312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2266; CHECK-LABEL: stack_fold_fmadd312sh_intk: 2267; CHECK: # %bb.0: 2268; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2269; CHECK-NEXT: #APP 2270; CHECK-NEXT: nop 2271; CHECK-NEXT: #NO_APP 2272; CHECK-NEXT: kmovb (%rdi), %k1 2273; CHECK-NEXT: vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2274; CHECK-NEXT: retq 2275 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2276 %a0 = extractelement <8 x half> %a0v, i64 0 2277 %a1 = extractelement <8 x half> %a1v, i64 0 2278 %a2 = extractelement <8 x half> %a2v, i64 0 2279 %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1) 2280 %3 = load i8, ptr %mask 2281 %4 = bitcast i8 %3 to <8 x i1> 2282 %5 = extractelement <8 x i1> %4, i64 0 2283 %6 = select i1 %5, half %2, half %a0 2284 %res = insertelement <8 x half> %a0v, half %6, i64 0 2285 ret <8 x half> %res 2286} 2287 2288define <8 x half> @stack_fold_fmsub123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2289; CHECK-LABEL: stack_fold_fmsub123sh_intk: 2290; CHECK: # %bb.0: 2291; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2292; CHECK-NEXT: #APP 2293; CHECK-NEXT: nop 2294; CHECK-NEXT: #NO_APP 2295; CHECK-NEXT: kmovb (%rdi), %k1 2296; CHECK-NEXT: vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2297; CHECK-NEXT: retq 2298 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2299 %a0 = extractelement <8 x half> %a0v, i64 0 2300 %a1 = extractelement <8 x half> %a1v, i64 0 2301 %a2 = extractelement <8 x half> %a2v, i64 0 2302 %neg = fneg half %a2 2303 %2 = call half @llvm.fma.f16(half %a0, half %a1, half %neg) 2304 %3 = load i8, ptr %mask 2305 %4 = bitcast i8 %3 to <8 x i1> 2306 %5 = extractelement <8 x i1> %4, i64 0 2307 %6 = select i1 %5, half %2, half %a0 2308 %res = insertelement <8 x half> %a0v, half %6, i64 0 2309 ret <8 x 
half> %res 2310} 2311 2312define <8 x half> @stack_fold_fmsub213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2313; CHECK-LABEL: stack_fold_fmsub213sh_intk: 2314; CHECK: # %bb.0: 2315; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2316; CHECK-NEXT: #APP 2317; CHECK-NEXT: nop 2318; CHECK-NEXT: #NO_APP 2319; CHECK-NEXT: kmovb (%rdi), %k1 2320; CHECK-NEXT: vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2321; CHECK-NEXT: retq 2322 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2323 %a0 = extractelement <8 x half> %a0v, i64 0 2324 %a1 = extractelement <8 x half> %a1v, i64 0 2325 %a2 = extractelement <8 x half> %a2v, i64 0 2326 %neg = fneg half %a2 2327 %2 = call half @llvm.fma.f16(half %a1, half %a0, half %neg) 2328 %3 = load i8, ptr %mask 2329 %4 = bitcast i8 %3 to <8 x i1> 2330 %5 = extractelement <8 x i1> %4, i64 0 2331 %6 = select i1 %5, half %2, half %a0 2332 %res = insertelement <8 x half> %a0v, half %6, i64 0 2333 ret <8 x half> %res 2334} 2335 2336define <8 x half> @stack_fold_fmsub231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2337; CHECK-LABEL: stack_fold_fmsub231sh_intk: 2338; CHECK: # %bb.0: 2339; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2340; CHECK-NEXT: #APP 2341; CHECK-NEXT: nop 2342; CHECK-NEXT: #NO_APP 2343; CHECK-NEXT: kmovb (%rdi), %k1 2344; CHECK-NEXT: vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2345; CHECK-NEXT: retq 2346 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2347 %a0 = extractelement <8 x half> %a0v, i64 0 2348 %a1 = extractelement <8 x half> %a1v, i64 0 2349 %a2 = extractelement <8 x half> %a2v, i64 0 2350 %neg = fneg half %a0 2351 %2 = call half @llvm.fma.f16(half %a1, half %a2, half %neg) 2352 %3 = load i8, ptr %mask 2353 %4 = bitcast i8 %3 to <8 x i1> 2354 %5 = extractelement <8 x i1> %4, i64 0 2355 %6 = select i1 %5, half %2, half %a0 2356 %res = insertelement <8 x half> %a0v, half %6, i64 0 2357 ret <8 x half> %res 2358} 2359 2360define <8 x half> @stack_fold_fmsub321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2361; CHECK-LABEL: stack_fold_fmsub321sh_intk: 2362; CHECK: # %bb.0: 2363; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2364; CHECK-NEXT: #APP 2365; CHECK-NEXT: nop 2366; CHECK-NEXT: #NO_APP 2367; CHECK-NEXT: kmovb (%rdi), %k1 2368; CHECK-NEXT: vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2369; CHECK-NEXT: retq 2370 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2371 %a0 = extractelement <8 x half> %a0v, i64 0 2372 %a1 = extractelement <8 x half> %a1v, i64 0 2373 %a2 = extractelement <8 x half> %a2v, i64 0 2374 %neg = fneg half %a0 2375 %2 = call half @llvm.fma.f16(half %a2, half %a1, half %neg) 2376 %3 = load i8, ptr %mask 2377 %4 = bitcast i8 %3 to <8 x i1> 2378 %5 = extractelement <8 x i1> %4, i64 0 2379 %6 = select i1 %5, half %2, half %a0 2380 %res = insertelement <8 x half> %a0v, half 
%6, i64 0 2381 ret <8 x half> %res 2382} 2383 2384define <8 x half> @stack_fold_fmsub132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2385; CHECK-LABEL: stack_fold_fmsub132sh_intk: 2386; CHECK: # %bb.0: 2387; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2388; CHECK-NEXT: #APP 2389; CHECK-NEXT: nop 2390; CHECK-NEXT: #NO_APP 2391; CHECK-NEXT: kmovb (%rdi), %k1 2392; CHECK-NEXT: vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2393; CHECK-NEXT: retq 2394 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2395 %a0 = extractelement <8 x half> %a0v, i64 0 2396 %a1 = extractelement <8 x half> %a1v, i64 0 2397 %a2 = extractelement <8 x half> %a2v, i64 0 2398 %neg = fneg half %a1 2399 %2 = call half @llvm.fma.f16(half %a0, half %a2, half %neg) 2400 %3 = load i8, ptr %mask 2401 %4 = bitcast i8 %3 to <8 x i1> 2402 %5 = extractelement <8 x i1> %4, i64 0 2403 %6 = select i1 %5, half %2, half %a0 2404 %res = insertelement <8 x half> %a0v, half %6, i64 0 2405 ret <8 x half> %res 2406} 2407 2408define <8 x half> @stack_fold_fmsub312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2409; CHECK-LABEL: stack_fold_fmsub312sh_intk: 2410; CHECK: # %bb.0: 2411; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2412; CHECK-NEXT: #APP 2413; CHECK-NEXT: nop 2414; CHECK-NEXT: #NO_APP 2415; CHECK-NEXT: kmovb (%rdi), %k1 2416; CHECK-NEXT: vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2417; CHECK-NEXT: retq 2418 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2419 %a0 = extractelement <8 x half> %a0v, i64 0 2420 %a1 = extractelement <8 x half> %a1v, i64 0 2421 %a2 = extractelement <8 x half> %a2v, i64 0 2422 %neg = fneg half %a1 2423 %2 = call half @llvm.fma.f16(half %a2, half %a0, half %neg) 2424 %3 = load i8, ptr %mask 2425 %4 = bitcast i8 %3 to <8 x i1> 2426 %5 = extractelement <8 x i1> %4, i64 0 2427 %6 = select i1 %5, half %2, half %a0 2428 %res = insertelement <8 x half> %a0v, half %6, i64 0 2429 ret <8 x half> %res 2430} 2431 2432define <8 x half> @stack_fold_fnmadd123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2433; CHECK-LABEL: stack_fold_fnmadd123sh_intk: 2434; CHECK: # %bb.0: 2435; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2436; CHECK-NEXT: #APP 2437; CHECK-NEXT: nop 2438; CHECK-NEXT: #NO_APP 2439; CHECK-NEXT: kmovb (%rdi), %k1 2440; CHECK-NEXT: vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2441; CHECK-NEXT: retq 2442 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2443 %a0 = extractelement <8 x half> %a0v, i64 0 2444 %a1 = extractelement <8 x half> %a1v, i64 0 2445 %a2 = extractelement <8 x half> %a2v, i64 0 2446 %neg1 = fneg half %a0 2447 %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a2) 2448 %3 = load i8, ptr %mask 2449 %4 = bitcast i8 %3 to <8 x i1> 2450 %5 = extractelement <8 x i1> %4, i64 0 2451 %6 = select i1 %5, half %2, half %a0 2452 %res = insertelement <8 x half> %a0v, 
half %6, i64 0 2453 ret <8 x half> %res 2454} 2455 2456define <8 x half> @stack_fold_fnmadd213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2457; CHECK-LABEL: stack_fold_fnmadd213sh_intk: 2458; CHECK: # %bb.0: 2459; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2460; CHECK-NEXT: #APP 2461; CHECK-NEXT: nop 2462; CHECK-NEXT: #NO_APP 2463; CHECK-NEXT: kmovb (%rdi), %k1 2464; CHECK-NEXT: vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2465; CHECK-NEXT: retq 2466 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2467 %a0 = extractelement <8 x half> %a0v, i64 0 2468 %a1 = extractelement <8 x half> %a1v, i64 0 2469 %a2 = extractelement <8 x half> %a2v, i64 0 2470 %neg1 = fneg half %a1 2471 %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a2) 2472 %3 = load i8, ptr %mask 2473 %4 = bitcast i8 %3 to <8 x i1> 2474 %5 = extractelement <8 x i1> %4, i64 0 2475 %6 = select i1 %5, half %2, half %a0 2476 %res = insertelement <8 x half> %a0v, half %6, i64 0 2477 ret <8 x half> %res 2478} 2479 2480define <8 x half> @stack_fold_fnmadd231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2481; CHECK-LABEL: stack_fold_fnmadd231sh_intk: 2482; CHECK: # %bb.0: 2483; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2484; CHECK-NEXT: #APP 2485; CHECK-NEXT: nop 2486; CHECK-NEXT: #NO_APP 2487; CHECK-NEXT: kmovb (%rdi), %k1 2488; CHECK-NEXT: vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2489; CHECK-NEXT: retq 2490 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2491 %a0 = extractelement <8 x half> %a0v, i64 0 2492 %a1 = extractelement <8 x half> %a1v, i64 0 2493 %a2 = extractelement <8 x half> %a2v, i64 0 2494 %neg1 = fneg half %a1 2495 %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a0) 2496 %3 = load i8, ptr %mask 2497 %4 = bitcast i8 %3 to <8 x i1> 2498 %5 = extractelement <8 x i1> %4, i64 0 2499 %6 = select i1 %5, half %2, half %a0 2500 %res = insertelement <8 x half> %a0v, half %6, i64 0 2501 ret <8 x half> %res 2502} 2503 2504define <8 x half> @stack_fold_fnmadd321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2505; CHECK-LABEL: stack_fold_fnmadd321sh_intk: 2506; CHECK: # %bb.0: 2507; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2508; CHECK-NEXT: #APP 2509; CHECK-NEXT: nop 2510; CHECK-NEXT: #NO_APP 2511; CHECK-NEXT: kmovb (%rdi), %k1 2512; CHECK-NEXT: vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2513; CHECK-NEXT: retq 2514 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2515 %a0 = extractelement <8 x half> %a0v, i64 0 2516 %a1 = extractelement <8 x half> %a1v, i64 0 2517 %a2 = extractelement <8 x half> %a2v, i64 0 2518 %neg1 = fneg half %a2 2519 %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a0) 2520 %3 = load i8, ptr %mask 2521 %4 = bitcast i8 %3 to <8 x i1> 2522 %5 = extractelement <8 x i1> %4, i64 0 2523 %6 = select i1 %5, half %2, half %a0 2524 %res = insertelement <8 x half> %a0v, 
half %6, i64 0 2525 ret <8 x half> %res 2526} 2527 2528define <8 x half> @stack_fold_fnmadd132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2529; CHECK-LABEL: stack_fold_fnmadd132sh_intk: 2530; CHECK: # %bb.0: 2531; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2532; CHECK-NEXT: #APP 2533; CHECK-NEXT: nop 2534; CHECK-NEXT: #NO_APP 2535; CHECK-NEXT: kmovb (%rdi), %k1 2536; CHECK-NEXT: vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2537; CHECK-NEXT: retq 2538 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2539 %a0 = extractelement <8 x half> %a0v, i64 0 2540 %a1 = extractelement <8 x half> %a1v, i64 0 2541 %a2 = extractelement <8 x half> %a2v, i64 0 2542 %neg1 = fneg half %a0 2543 %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a1) 2544 %3 = load i8, ptr %mask 2545 %4 = bitcast i8 %3 to <8 x i1> 2546 %5 = extractelement <8 x i1> %4, i64 0 2547 %6 = select i1 %5, half %2, half %a0 2548 %res = insertelement <8 x half> %a0v, half %6, i64 0 2549 ret <8 x half> %res 2550} 2551 2552define <8 x half> @stack_fold_fnmadd312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2553; CHECK-LABEL: stack_fold_fnmadd312sh_intk: 2554; CHECK: # %bb.0: 2555; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2556; CHECK-NEXT: #APP 2557; CHECK-NEXT: nop 2558; CHECK-NEXT: #NO_APP 2559; CHECK-NEXT: kmovb (%rdi), %k1 2560; CHECK-NEXT: vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2561; CHECK-NEXT: retq 2562 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2563 %a0 = extractelement <8 x half> %a0v, i64 0 2564 %a1 = extractelement <8 x half> %a1v, i64 0 2565 %a2 = extractelement <8 x half> %a2v, i64 0 2566 %neg1 = fneg half %a2 2567 %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a1) 2568 %3 = load i8, ptr %mask 2569 %4 = bitcast i8 %3 to <8 x i1> 2570 %5 = extractelement <8 x i1> %4, i64 0 2571 %6 = select i1 %5, half %2, half %a0 2572 %res = insertelement <8 x half> %a0v, half %6, i64 0 2573 ret <8 x half> %res 2574} 2575 2576define <8 x half> @stack_fold_fnmsub123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2577; CHECK-LABEL: stack_fold_fnmsub123sh_intk: 2578; CHECK: # %bb.0: 2579; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2580; CHECK-NEXT: #APP 2581; CHECK-NEXT: nop 2582; CHECK-NEXT: #NO_APP 2583; CHECK-NEXT: kmovb (%rdi), %k1 2584; CHECK-NEXT: vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2585; CHECK-NEXT: retq 2586 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2587 %a0 = extractelement <8 x half> %a0v, i64 0 2588 %a1 = extractelement <8 x half> %a1v, i64 0 2589 %a2 = extractelement <8 x half> %a2v, i64 0 2590 %neg = fneg half %a2 2591 %neg1 = fneg half %a0 2592 %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg) 2593 %3 = load i8, ptr %mask 2594 %4 = bitcast i8 %3 to <8 x i1> 2595 %5 = extractelement <8 x i1> %4, i64 0 2596 %6 = select i1 %5, half %2, half %a0 2597 %res = 
insertelement <8 x half> %a0v, half %6, i64 0 2598 ret <8 x half> %res 2599} 2600 2601define <8 x half> @stack_fold_fnmsub213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2602; CHECK-LABEL: stack_fold_fnmsub213sh_intk: 2603; CHECK: # %bb.0: 2604; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2605; CHECK-NEXT: #APP 2606; CHECK-NEXT: nop 2607; CHECK-NEXT: #NO_APP 2608; CHECK-NEXT: kmovb (%rdi), %k1 2609; CHECK-NEXT: vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2610; CHECK-NEXT: retq 2611 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2612 %a0 = extractelement <8 x half> %a0v, i64 0 2613 %a1 = extractelement <8 x half> %a1v, i64 0 2614 %a2 = extractelement <8 x half> %a2v, i64 0 2615 %neg = fneg half %a2 2616 %neg1 = fneg half %a1 2617 %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg) 2618 %3 = load i8, ptr %mask 2619 %4 = bitcast i8 %3 to <8 x i1> 2620 %5 = extractelement <8 x i1> %4, i64 0 2621 %6 = select i1 %5, half %2, half %a0 2622 %res = insertelement <8 x half> %a0v, half %6, i64 0 2623 ret <8 x half> %res 2624} 2625 2626define <8 x half> @stack_fold_fnmsub231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2627; CHECK-LABEL: stack_fold_fnmsub231sh_intk: 2628; CHECK: # %bb.0: 2629; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2630; CHECK-NEXT: #APP 2631; CHECK-NEXT: nop 2632; CHECK-NEXT: #NO_APP 2633; CHECK-NEXT: kmovb (%rdi), %k1 2634; CHECK-NEXT: vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2635; CHECK-NEXT: retq 2636 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2637 %a0 = extractelement <8 x half> %a0v, i64 0 2638 %a1 = extractelement <8 x half> %a1v, i64 0 2639 %a2 = extractelement <8 x half> %a2v, i64 0 2640 %neg = fneg half %a0 2641 %neg1 = fneg half %a1 2642 %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg) 2643 %3 = load i8, ptr %mask 2644 %4 = bitcast i8 %3 to <8 x i1> 2645 %5 = extractelement <8 x i1> %4, i64 0 2646 %6 = select i1 %5, half %2, half %a0 2647 %res = insertelement <8 x half> %a0v, half %6, i64 0 2648 ret <8 x half> %res 2649} 2650 2651define <8 x half> @stack_fold_fnmsub321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2652; CHECK-LABEL: stack_fold_fnmsub321sh_intk: 2653; CHECK: # %bb.0: 2654; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2655; CHECK-NEXT: #APP 2656; CHECK-NEXT: nop 2657; CHECK-NEXT: #NO_APP 2658; CHECK-NEXT: kmovb (%rdi), %k1 2659; CHECK-NEXT: vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2660; CHECK-NEXT: retq 2661 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2662 %a0 = extractelement <8 x half> %a0v, i64 0 2663 %a1 = extractelement <8 x half> %a1v, i64 0 2664 %a2 = extractelement <8 x half> %a2v, i64 0 2665 %neg = fneg half %a0 2666 %neg1 = fneg half %a2 2667 %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg) 2668 %3 = load i8, ptr %mask 2669 %4 = bitcast i8 %3 to <8 x i1> 2670 %5 = extractelement <8 x i1> %4, i64 0 2671 %6 = select i1 %5, half 
%2, half %a0 2672 %res = insertelement <8 x half> %a0v, half %6, i64 0 2673 ret <8 x half> %res 2674} 2675 2676define <8 x half> @stack_fold_fnmsub132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2677; CHECK-LABEL: stack_fold_fnmsub132sh_intk: 2678; CHECK: # %bb.0: 2679; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2680; CHECK-NEXT: #APP 2681; CHECK-NEXT: nop 2682; CHECK-NEXT: #NO_APP 2683; CHECK-NEXT: kmovb (%rdi), %k1 2684; CHECK-NEXT: vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2685; CHECK-NEXT: retq 2686 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2687 %a0 = extractelement <8 x half> %a0v, i64 0 2688 %a1 = extractelement <8 x half> %a1v, i64 0 2689 %a2 = extractelement <8 x half> %a2v, i64 0 2690 %neg = fneg half %a1 2691 %neg1 = fneg half %a0 2692 %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg) 2693 %3 = load i8, ptr %mask 2694 %4 = bitcast i8 %3 to <8 x i1> 2695 %5 = extractelement <8 x i1> %4, i64 0 2696 %6 = select i1 %5, half %2, half %a0 2697 %res = insertelement <8 x half> %a0v, half %6, i64 0 2698 ret <8 x half> %res 2699} 2700 2701define <8 x half> @stack_fold_fnmsub312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2702; CHECK-LABEL: stack_fold_fnmsub312sh_intk: 2703; CHECK: # %bb.0: 2704; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2705; CHECK-NEXT: #APP 2706; CHECK-NEXT: nop 2707; CHECK-NEXT: #NO_APP 2708; CHECK-NEXT: kmovb (%rdi), %k1 2709; CHECK-NEXT: vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload 2710; CHECK-NEXT: retq 2711 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2712 %a0 = extractelement <8 x half> %a0v, i64 0 2713 %a1 = extractelement <8 x half> %a1v, i64 0 2714 %a2 = extractelement <8 x half> %a2v, i64 0 2715 %neg = fneg half %a1 2716 %neg1 = fneg half %a2 2717 %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg) 2718 %3 = load i8, ptr %mask 2719 %4 = bitcast i8 %3 to <8 x i1> 2720 %5 = extractelement <8 x i1> %4, i64 0 2721 %6 = select i1 %5, half %2, half %a0 2722 %res = insertelement <8 x half> %a0v, half %6, i64 0 2723 ret <8 x half> %res 2724} 2725 2726define <8 x half> @stack_fold_fmadd123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2727; CHECK-LABEL: stack_fold_fmadd123sh_intkz: 2728; CHECK: # %bb.0: 2729; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2730; CHECK-NEXT: #APP 2731; CHECK-NEXT: nop 2732; CHECK-NEXT: #NO_APP 2733; CHECK-NEXT: kmovb (%rdi), %k1 2734; CHECK-NEXT: vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 2735; CHECK-NEXT: retq 2736 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2737 %a0 = extractelement <8 x half> %a0v, i64 0 2738 %a1 = extractelement <8 x half> %a1v, i64 0 2739 %a2 = extractelement <8 x half> %a2v, i64 0 2740 %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2) 2741 %3 = load i8, ptr %mask 2742 %4 = bitcast i8 %3 to <8 x i1> 2743 %5 = extractelement <8 x i1> %4, i64 0 2744 %6 = select i1 %5, half %2, half zeroinitializer 2745 %res = insertelement <8 
x half> %a0v, half %6, i64 0 2746 ret <8 x half> %res 2747} 2748 2749define <8 x half> @stack_fold_fmadd213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2750; CHECK-LABEL: stack_fold_fmadd213sh_intkz: 2751; CHECK: # %bb.0: 2752; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2753; CHECK-NEXT: #APP 2754; CHECK-NEXT: nop 2755; CHECK-NEXT: #NO_APP 2756; CHECK-NEXT: kmovb (%rdi), %k1 2757; CHECK-NEXT: vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 2758; CHECK-NEXT: retq 2759 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2760 %a0 = extractelement <8 x half> %a0v, i64 0 2761 %a1 = extractelement <8 x half> %a1v, i64 0 2762 %a2 = extractelement <8 x half> %a2v, i64 0 2763 %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2) 2764 %3 = load i8, ptr %mask 2765 %4 = bitcast i8 %3 to <8 x i1> 2766 %5 = extractelement <8 x i1> %4, i64 0 2767 %6 = select i1 %5, half %2, half zeroinitializer 2768 %res = insertelement <8 x half> %a0v, half %6, i64 0 2769 ret <8 x half> %res 2770} 2771 2772define <8 x half> @stack_fold_fmadd231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2773; CHECK-LABEL: stack_fold_fmadd231sh_intkz: 2774; CHECK: # %bb.0: 2775; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2776; CHECK-NEXT: #APP 2777; CHECK-NEXT: nop 2778; CHECK-NEXT: #NO_APP 2779; CHECK-NEXT: kmovb (%rdi), %k1 2780; CHECK-NEXT: vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 2781; CHECK-NEXT: retq 2782 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2783 %a0 = extractelement <8 x half> %a0v, i64 0 2784 %a1 = extractelement <8 x half> %a1v, i64 0 2785 %a2 = extractelement <8 x half> %a2v, i64 0 2786 %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0) 2787 %3 = load i8, ptr %mask 2788 %4 = bitcast i8 %3 to <8 x i1> 2789 %5 = extractelement <8 x i1> %4, i64 0 2790 %6 = select i1 %5, half %2, half zeroinitializer 2791 %res = insertelement <8 x half> %a0v, half %6, i64 0 2792 ret <8 x half> %res 2793} 2794 2795define <8 x half> @stack_fold_fmadd321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2796; CHECK-LABEL: stack_fold_fmadd321sh_intkz: 2797; CHECK: # %bb.0: 2798; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2799; CHECK-NEXT: #APP 2800; CHECK-NEXT: nop 2801; CHECK-NEXT: #NO_APP 2802; CHECK-NEXT: kmovb (%rdi), %k1 2803; CHECK-NEXT: vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 2804; CHECK-NEXT: retq 2805 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2806 %a0 = extractelement <8 x half> %a0v, i64 0 2807 %a1 = extractelement <8 x half> %a1v, i64 0 2808 %a2 = extractelement <8 x half> %a2v, i64 0 2809 %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0) 2810 %3 = load i8, ptr %mask 2811 %4 = bitcast i8 %3 to <8 x i1> 2812 %5 = extractelement <8 x i1> %4, i64 0 2813 %6 = select i1 %5, half %2, half zeroinitializer 2814 %res = insertelement <8 x half> %a0v, half %6, i64 0 2815 ret <8 x 
half> %res 2816} 2817 2818define <8 x half> @stack_fold_fmadd132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2819; CHECK-LABEL: stack_fold_fmadd132sh_intkz: 2820; CHECK: # %bb.0: 2821; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2822; CHECK-NEXT: #APP 2823; CHECK-NEXT: nop 2824; CHECK-NEXT: #NO_APP 2825; CHECK-NEXT: kmovb (%rdi), %k1 2826; CHECK-NEXT: vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 2827; CHECK-NEXT: retq 2828 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2829 %a0 = extractelement <8 x half> %a0v, i64 0 2830 %a1 = extractelement <8 x half> %a1v, i64 0 2831 %a2 = extractelement <8 x half> %a2v, i64 0 2832 %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1) 2833 %3 = load i8, ptr %mask 2834 %4 = bitcast i8 %3 to <8 x i1> 2835 %5 = extractelement <8 x i1> %4, i64 0 2836 %6 = select i1 %5, half %2, half zeroinitializer 2837 %res = insertelement <8 x half> %a0v, half %6, i64 0 2838 ret <8 x half> %res 2839} 2840 2841define <8 x half> @stack_fold_fmadd312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2842; CHECK-LABEL: stack_fold_fmadd312sh_intkz: 2843; CHECK: # %bb.0: 2844; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2845; CHECK-NEXT: #APP 2846; CHECK-NEXT: nop 2847; CHECK-NEXT: #NO_APP 2848; CHECK-NEXT: kmovb (%rdi), %k1 2849; CHECK-NEXT: vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 2850; CHECK-NEXT: retq 2851 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2852 %a0 = extractelement <8 x half> %a0v, i64 0 2853 %a1 = extractelement <8 x half> %a1v, i64 0 2854 %a2 = extractelement <8 x half> %a2v, i64 0 2855 %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1) 2856 %3 = load i8, ptr %mask 2857 %4 = bitcast i8 %3 to <8 x i1> 2858 %5 = extractelement <8 x i1> %4, i64 0 2859 %6 = select i1 %5, half %2, half zeroinitializer 2860 %res = insertelement <8 x half> %a0v, half %6, i64 0 2861 ret <8 x half> %res 2862} 2863 2864define <8 x half> @stack_fold_fmsub123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2865; CHECK-LABEL: stack_fold_fmsub123sh_intkz: 2866; CHECK: # %bb.0: 2867; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2868; CHECK-NEXT: #APP 2869; CHECK-NEXT: nop 2870; CHECK-NEXT: #NO_APP 2871; CHECK-NEXT: kmovb (%rdi), %k1 2872; CHECK-NEXT: vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 2873; CHECK-NEXT: retq 2874 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2875 %a0 = extractelement <8 x half> %a0v, i64 0 2876 %a1 = extractelement <8 x half> %a1v, i64 0 2877 %a2 = extractelement <8 x half> %a2v, i64 0 2878 %neg = fneg half %a2 2879 %2 = call half @llvm.fma.f16(half %a0, half %a1, half %neg) 2880 %3 = load i8, ptr %mask 2881 %4 = bitcast i8 %3 to <8 x i1> 2882 %5 = extractelement <8 x i1> %4, i64 0 2883 %6 = select i1 %5, half %2, half zeroinitializer 2884 %res = insertelement <8 x half> %a0v, 
half %6, i64 0 2885 ret <8 x half> %res 2886} 2887 2888define <8 x half> @stack_fold_fmsub213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2889; CHECK-LABEL: stack_fold_fmsub213sh_intkz: 2890; CHECK: # %bb.0: 2891; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2892; CHECK-NEXT: #APP 2893; CHECK-NEXT: nop 2894; CHECK-NEXT: #NO_APP 2895; CHECK-NEXT: kmovb (%rdi), %k1 2896; CHECK-NEXT: vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 2897; CHECK-NEXT: retq 2898 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2899 %a0 = extractelement <8 x half> %a0v, i64 0 2900 %a1 = extractelement <8 x half> %a1v, i64 0 2901 %a2 = extractelement <8 x half> %a2v, i64 0 2902 %neg = fneg half %a2 2903 %2 = call half @llvm.fma.f16(half %a1, half %a0, half %neg) 2904 %3 = load i8, ptr %mask 2905 %4 = bitcast i8 %3 to <8 x i1> 2906 %5 = extractelement <8 x i1> %4, i64 0 2907 %6 = select i1 %5, half %2, half zeroinitializer 2908 %res = insertelement <8 x half> %a0v, half %6, i64 0 2909 ret <8 x half> %res 2910} 2911 2912define <8 x half> @stack_fold_fmsub231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2913; CHECK-LABEL: stack_fold_fmsub231sh_intkz: 2914; CHECK: # %bb.0: 2915; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2916; CHECK-NEXT: #APP 2917; CHECK-NEXT: nop 2918; CHECK-NEXT: #NO_APP 2919; CHECK-NEXT: kmovb (%rdi), %k1 2920; CHECK-NEXT: vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 2921; CHECK-NEXT: retq 2922 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2923 %a0 = extractelement <8 x half> %a0v, i64 0 2924 %a1 = extractelement <8 x half> %a1v, i64 0 2925 %a2 = extractelement <8 x half> %a2v, i64 0 2926 %neg = fneg half %a0 2927 %2 = call half @llvm.fma.f16(half %a1, half %a2, half %neg) 2928 %3 = load i8, ptr %mask 2929 %4 = bitcast i8 %3 to <8 x i1> 2930 %5 = extractelement <8 x i1> %4, i64 0 2931 %6 = select i1 %5, half %2, half zeroinitializer 2932 %res = insertelement <8 x half> %a0v, half %6, i64 0 2933 ret <8 x half> %res 2934} 2935 2936define <8 x half> @stack_fold_fmsub321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2937; CHECK-LABEL: stack_fold_fmsub321sh_intkz: 2938; CHECK: # %bb.0: 2939; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2940; CHECK-NEXT: #APP 2941; CHECK-NEXT: nop 2942; CHECK-NEXT: #NO_APP 2943; CHECK-NEXT: kmovb (%rdi), %k1 2944; CHECK-NEXT: vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 2945; CHECK-NEXT: retq 2946 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2947 %a0 = extractelement <8 x half> %a0v, i64 0 2948 %a1 = extractelement <8 x half> %a1v, i64 0 2949 %a2 = extractelement <8 x half> %a2v, i64 0 2950 %neg = fneg half %a0 2951 %2 = call half @llvm.fma.f16(half %a2, half %a1, half %neg) 2952 %3 = load i8, ptr %mask 2953 %4 = bitcast i8 %3 to <8 x i1> 2954 %5 = extractelement <8 x i1> %4, i64 0 2955 %6 = select i1 %5, half %2, half zeroinitializer 2956 %res = 
insertelement <8 x half> %a0v, half %6, i64 0 2957 ret <8 x half> %res 2958} 2959 2960define <8 x half> @stack_fold_fmsub132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2961; CHECK-LABEL: stack_fold_fmsub132sh_intkz: 2962; CHECK: # %bb.0: 2963; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2964; CHECK-NEXT: #APP 2965; CHECK-NEXT: nop 2966; CHECK-NEXT: #NO_APP 2967; CHECK-NEXT: kmovb (%rdi), %k1 2968; CHECK-NEXT: vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 2969; CHECK-NEXT: retq 2970 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2971 %a0 = extractelement <8 x half> %a0v, i64 0 2972 %a1 = extractelement <8 x half> %a1v, i64 0 2973 %a2 = extractelement <8 x half> %a2v, i64 0 2974 %neg = fneg half %a1 2975 %2 = call half @llvm.fma.f16(half %a0, half %a2, half %neg) 2976 %3 = load i8, ptr %mask 2977 %4 = bitcast i8 %3 to <8 x i1> 2978 %5 = extractelement <8 x i1> %4, i64 0 2979 %6 = select i1 %5, half %2, half zeroinitializer 2980 %res = insertelement <8 x half> %a0v, half %6, i64 0 2981 ret <8 x half> %res 2982} 2983 2984define <8 x half> @stack_fold_fmsub312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 2985; CHECK-LABEL: stack_fold_fmsub312sh_intkz: 2986; CHECK: # %bb.0: 2987; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2988; CHECK-NEXT: #APP 2989; CHECK-NEXT: nop 2990; CHECK-NEXT: #NO_APP 2991; CHECK-NEXT: kmovb (%rdi), %k1 2992; CHECK-NEXT: vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 2993; CHECK-NEXT: retq 2994 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2995 %a0 = extractelement <8 x half> %a0v, i64 0 2996 %a1 = extractelement <8 x half> %a1v, i64 0 2997 %a2 = extractelement <8 x half> %a2v, i64 0 2998 %neg = fneg half %a1 2999 %2 = call half @llvm.fma.f16(half %a2, half %a0, half %neg) 3000 %3 = load i8, ptr %mask 3001 %4 = bitcast i8 %3 to <8 x i1> 3002 %5 = extractelement <8 x i1> %4, i64 0 3003 %6 = select i1 %5, half %2, half zeroinitializer 3004 %res = insertelement <8 x half> %a0v, half %6, i64 0 3005 ret <8 x half> %res 3006} 3007 3008define <8 x half> @stack_fold_fnmadd123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 3009; CHECK-LABEL: stack_fold_fnmadd123sh_intkz: 3010; CHECK: # %bb.0: 3011; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3012; CHECK-NEXT: #APP 3013; CHECK-NEXT: nop 3014; CHECK-NEXT: #NO_APP 3015; CHECK-NEXT: kmovb (%rdi), %k1 3016; CHECK-NEXT: vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 3017; CHECK-NEXT: retq 3018 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3019 %a0 = extractelement <8 x half> %a0v, i64 0 3020 %a1 = extractelement <8 x half> %a1v, i64 0 3021 %a2 = extractelement <8 x half> %a2v, i64 0 3022 %neg1 = fneg half %a0 3023 %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a2) 3024 %3 = load i8, ptr %mask 3025 %4 = bitcast i8 %3 to <8 x i1> 3026 %5 = extractelement <8 x i1> %4, i64 0 3027 %6 = select i1 %5, half %2, half zeroinitializer 3028 %res = 
insertelement <8 x half> %a0v, half %6, i64 0 3029 ret <8 x half> %res 3030} 3031 3032define <8 x half> @stack_fold_fnmadd213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 3033; CHECK-LABEL: stack_fold_fnmadd213sh_intkz: 3034; CHECK: # %bb.0: 3035; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3036; CHECK-NEXT: #APP 3037; CHECK-NEXT: nop 3038; CHECK-NEXT: #NO_APP 3039; CHECK-NEXT: kmovb (%rdi), %k1 3040; CHECK-NEXT: vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 3041; CHECK-NEXT: retq 3042 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3043 %a0 = extractelement <8 x half> %a0v, i64 0 3044 %a1 = extractelement <8 x half> %a1v, i64 0 3045 %a2 = extractelement <8 x half> %a2v, i64 0 3046 %neg1 = fneg half %a1 3047 %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a2) 3048 %3 = load i8, ptr %mask 3049 %4 = bitcast i8 %3 to <8 x i1> 3050 %5 = extractelement <8 x i1> %4, i64 0 3051 %6 = select i1 %5, half %2, half zeroinitializer 3052 %res = insertelement <8 x half> %a0v, half %6, i64 0 3053 ret <8 x half> %res 3054} 3055 3056define <8 x half> @stack_fold_fnmadd231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 3057; CHECK-LABEL: stack_fold_fnmadd231sh_intkz: 3058; CHECK: # %bb.0: 3059; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3060; CHECK-NEXT: #APP 3061; CHECK-NEXT: nop 3062; CHECK-NEXT: #NO_APP 3063; CHECK-NEXT: kmovb (%rdi), %k1 3064; CHECK-NEXT: vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 3065; CHECK-NEXT: retq 3066 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3067 %a0 = extractelement <8 x half> %a0v, i64 0 3068 %a1 = extractelement <8 x half> %a1v, i64 0 3069 %a2 = extractelement <8 x half> %a2v, i64 0 3070 %neg1 = fneg half %a1 3071 %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a0) 3072 %3 = load i8, ptr %mask 3073 %4 = bitcast i8 %3 to <8 x i1> 3074 %5 = extractelement <8 x i1> %4, i64 0 3075 %6 = select i1 %5, half %2, half zeroinitializer 3076 %res = insertelement <8 x half> %a0v, half %6, i64 0 3077 ret <8 x half> %res 3078} 3079 3080define <8 x half> @stack_fold_fnmadd321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 3081; CHECK-LABEL: stack_fold_fnmadd321sh_intkz: 3082; CHECK: # %bb.0: 3083; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3084; CHECK-NEXT: #APP 3085; CHECK-NEXT: nop 3086; CHECK-NEXT: #NO_APP 3087; CHECK-NEXT: kmovb (%rdi), %k1 3088; CHECK-NEXT: vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 3089; CHECK-NEXT: retq 3090 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3091 %a0 = extractelement <8 x half> %a0v, i64 0 3092 %a1 = extractelement <8 x half> %a1v, i64 0 3093 %a2 = extractelement <8 x half> %a2v, i64 0 3094 %neg1 = fneg half %a2 3095 %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a0) 3096 %3 = load i8, ptr %mask 3097 %4 = bitcast i8 %3 to <8 x i1> 3098 %5 = extractelement <8 x i1> %4, i64 0 3099 %6 = select i1 %5, half %2, half zeroinitializer 3100 %res = 
insertelement <8 x half> %a0v, half %6, i64 0 3101 ret <8 x half> %res 3102} 3103 3104define <8 x half> @stack_fold_fnmadd132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 3105; CHECK-LABEL: stack_fold_fnmadd132sh_intkz: 3106; CHECK: # %bb.0: 3107; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3108; CHECK-NEXT: #APP 3109; CHECK-NEXT: nop 3110; CHECK-NEXT: #NO_APP 3111; CHECK-NEXT: kmovb (%rdi), %k1 3112; CHECK-NEXT: vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 3113; CHECK-NEXT: retq 3114 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3115 %a0 = extractelement <8 x half> %a0v, i64 0 3116 %a1 = extractelement <8 x half> %a1v, i64 0 3117 %a2 = extractelement <8 x half> %a2v, i64 0 3118 %neg1 = fneg half %a0 3119 %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a1) 3120 %3 = load i8, ptr %mask 3121 %4 = bitcast i8 %3 to <8 x i1> 3122 %5 = extractelement <8 x i1> %4, i64 0 3123 %6 = select i1 %5, half %2, half zeroinitializer 3124 %res = insertelement <8 x half> %a0v, half %6, i64 0 3125 ret <8 x half> %res 3126} 3127 3128define <8 x half> @stack_fold_fnmadd312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 3129; CHECK-LABEL: stack_fold_fnmadd312sh_intkz: 3130; CHECK: # %bb.0: 3131; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3132; CHECK-NEXT: #APP 3133; CHECK-NEXT: nop 3134; CHECK-NEXT: #NO_APP 3135; CHECK-NEXT: kmovb (%rdi), %k1 3136; CHECK-NEXT: vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 3137; CHECK-NEXT: retq 3138 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3139 %a0 = extractelement <8 x half> %a0v, i64 0 3140 %a1 = extractelement <8 x half> %a1v, i64 0 3141 %a2 = extractelement <8 x half> %a2v, i64 0 3142 %neg1 = fneg half %a2 3143 %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a1) 3144 %3 = load i8, ptr %mask 3145 %4 = bitcast i8 %3 to <8 x i1> 3146 %5 = extractelement <8 x i1> %4, i64 0 3147 %6 = select i1 %5, half %2, half zeroinitializer 3148 %res = insertelement <8 x half> %a0v, half %6, i64 0 3149 ret <8 x half> %res 3150} 3151 3152define <8 x half> @stack_fold_fnmsub123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 3153; CHECK-LABEL: stack_fold_fnmsub123sh_intkz: 3154; CHECK: # %bb.0: 3155; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3156; CHECK-NEXT: #APP 3157; CHECK-NEXT: nop 3158; CHECK-NEXT: #NO_APP 3159; CHECK-NEXT: kmovb (%rdi), %k1 3160; CHECK-NEXT: vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 3161; CHECK-NEXT: retq 3162 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3163 %a0 = extractelement <8 x half> %a0v, i64 0 3164 %a1 = extractelement <8 x half> %a1v, i64 0 3165 %a2 = extractelement <8 x half> %a2v, i64 0 3166 %neg = fneg half %a2 3167 %neg1 = fneg half %a0 3168 %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg) 3169 %3 = load i8, ptr %mask 3170 %4 = bitcast i8 %3 to <8 x i1> 3171 %5 = extractelement <8 x i1> %4, i64 0 3172 %6 = select i1 %5, half %2, half 
zeroinitializer 3173 %res = insertelement <8 x half> %a0v, half %6, i64 0 3174 ret <8 x half> %res 3175} 3176 3177define <8 x half> @stack_fold_fnmsub213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 3178; CHECK-LABEL: stack_fold_fnmsub213sh_intkz: 3179; CHECK: # %bb.0: 3180; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3181; CHECK-NEXT: #APP 3182; CHECK-NEXT: nop 3183; CHECK-NEXT: #NO_APP 3184; CHECK-NEXT: kmovb (%rdi), %k1 3185; CHECK-NEXT: vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 3186; CHECK-NEXT: retq 3187 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3188 %a0 = extractelement <8 x half> %a0v, i64 0 3189 %a1 = extractelement <8 x half> %a1v, i64 0 3190 %a2 = extractelement <8 x half> %a2v, i64 0 3191 %neg = fneg half %a2 3192 %neg1 = fneg half %a1 3193 %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg) 3194 %3 = load i8, ptr %mask 3195 %4 = bitcast i8 %3 to <8 x i1> 3196 %5 = extractelement <8 x i1> %4, i64 0 3197 %6 = select i1 %5, half %2, half zeroinitializer 3198 %res = insertelement <8 x half> %a0v, half %6, i64 0 3199 ret <8 x half> %res 3200} 3201 3202define <8 x half> @stack_fold_fnmsub231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 3203; CHECK-LABEL: stack_fold_fnmsub231sh_intkz: 3204; CHECK: # %bb.0: 3205; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3206; CHECK-NEXT: #APP 3207; CHECK-NEXT: nop 3208; CHECK-NEXT: #NO_APP 3209; CHECK-NEXT: kmovb (%rdi), %k1 3210; CHECK-NEXT: vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 3211; CHECK-NEXT: retq 3212 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3213 %a0 = extractelement <8 x half> %a0v, i64 0 3214 %a1 = extractelement <8 x half> %a1v, i64 0 3215 %a2 = extractelement <8 x half> %a2v, i64 0 3216 %neg = fneg half %a0 3217 %neg1 = fneg half %a1 3218 %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg) 3219 %3 = load i8, ptr %mask 3220 %4 = bitcast i8 %3 to <8 x i1> 3221 %5 = extractelement <8 x i1> %4, i64 0 3222 %6 = select i1 %5, half %2, half zeroinitializer 3223 %res = insertelement <8 x half> %a0v, half %6, i64 0 3224 ret <8 x half> %res 3225} 3226 3227define <8 x half> @stack_fold_fnmsub321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 3228; CHECK-LABEL: stack_fold_fnmsub321sh_intkz: 3229; CHECK: # %bb.0: 3230; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3231; CHECK-NEXT: #APP 3232; CHECK-NEXT: nop 3233; CHECK-NEXT: #NO_APP 3234; CHECK-NEXT: kmovb (%rdi), %k1 3235; CHECK-NEXT: vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 3236; CHECK-NEXT: retq 3237 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3238 %a0 = extractelement <8 x half> %a0v, i64 0 3239 %a1 = extractelement <8 x half> %a1v, i64 0 3240 %a2 = extractelement <8 x half> %a2v, i64 0 3241 %neg = fneg half %a0 3242 %neg1 = fneg half %a2 3243 %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg) 3244 %3 = load i8, ptr %mask 3245 %4 = bitcast i8 %3 to <8 x i1> 3246 %5 = extractelement <8 x i1> %4, i64 0 3247 %6 = 
select i1 %5, half %2, half zeroinitializer 3248 %res = insertelement <8 x half> %a0v, half %6, i64 0 3249 ret <8 x half> %res 3250} 3251 3252define <8 x half> @stack_fold_fnmsub132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 3253; CHECK-LABEL: stack_fold_fnmsub132sh_intkz: 3254; CHECK: # %bb.0: 3255; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3256; CHECK-NEXT: #APP 3257; CHECK-NEXT: nop 3258; CHECK-NEXT: #NO_APP 3259; CHECK-NEXT: kmovb (%rdi), %k1 3260; CHECK-NEXT: vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 3261; CHECK-NEXT: retq 3262 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3263 %a0 = extractelement <8 x half> %a0v, i64 0 3264 %a1 = extractelement <8 x half> %a1v, i64 0 3265 %a2 = extractelement <8 x half> %a2v, i64 0 3266 %neg = fneg half %a1 3267 %neg1 = fneg half %a0 3268 %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg) 3269 %3 = load i8, ptr %mask 3270 %4 = bitcast i8 %3 to <8 x i1> 3271 %5 = extractelement <8 x i1> %4, i64 0 3272 %6 = select i1 %5, half %2, half zeroinitializer 3273 %res = insertelement <8 x half> %a0v, half %6, i64 0 3274 ret <8 x half> %res 3275} 3276 3277define <8 x half> @stack_fold_fnmsub312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) { 3278; CHECK-LABEL: stack_fold_fnmsub312sh_intkz: 3279; CHECK: # %bb.0: 3280; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3281; CHECK-NEXT: #APP 3282; CHECK-NEXT: nop 3283; CHECK-NEXT: #NO_APP 3284; CHECK-NEXT: kmovb (%rdi), %k1 3285; CHECK-NEXT: vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 3286; CHECK-NEXT: retq 3287 %1 = tail call <2 x 
i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3288 %a0 = extractelement <8 x half> %a0v, i64 0 3289 %a1 = extractelement <8 x half> %a1v, i64 0 3290 %a2 = extractelement <8 x half> %a2v, i64 0 3291 %neg = fneg half %a1 3292 %neg1 = fneg half %a2 3293 %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg) 3294 %3 = load i8, ptr %mask 3295 %4 = bitcast i8 %3 to <8 x i1> 3296 %5 = extractelement <8 x i1> %4, i64 0 3297 %6 = select i1 %5, half %2, half zeroinitializer 3298 %res = insertelement <8 x half> %a0v, half %6, i64 0 3299 ret <8 x half> %res 3300} 3301 3302define <32 x half> @stack_fold_fmaddsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { 3303; CHECK-LABEL: stack_fold_fmaddsub123ph: 3304; CHECK: # %bb.0: 3305; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3306; CHECK-NEXT: #APP 3307; CHECK-NEXT: nop 3308; CHECK-NEXT: #NO_APP 3309; CHECK-NEXT: vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 3310; CHECK-NEXT: retq 3311 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3312 %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4) 3313 ret <32 x half> %2 3314} 3315declare <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half>, <32 x half>, <32 x half>, i32) 3316 3317define <32 x half> @stack_fold_fmaddsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { 3318; CHECK-LABEL: stack_fold_fmaddsub213ph: 3319; CHECK: 
# %bb.0: 3320; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3321; CHECK-NEXT: #APP 3322; CHECK-NEXT: nop 3323; CHECK-NEXT: #NO_APP 3324; CHECK-NEXT: vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 3325; CHECK-NEXT: retq 3326 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3327 %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4) 3328 ret <32 x half> %2 3329} 3330 3331define <32 x half> @stack_fold_fmaddsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { 3332; CHECK-LABEL: stack_fold_fmaddsub231ph: 3333; CHECK: # %bb.0: 3334; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3335; CHECK-NEXT: #APP 3336; CHECK-NEXT: nop 3337; CHECK-NEXT: #NO_APP 3338; CHECK-NEXT: vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 3339; CHECK-NEXT: retq 3340 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3341 %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4) 3342 ret <32 x half> %2 3343} 3344 3345define <32 x half> @stack_fold_fmaddsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { 3346; CHECK-LABEL: stack_fold_fmaddsub321ph: 3347; CHECK: # %bb.0: 3348; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3349; CHECK-NEXT: #APP 3350; CHECK-NEXT: nop 3351; CHECK-NEXT: #NO_APP 3352; CHECK-NEXT: 
vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 3353; CHECK-NEXT: retq 3354 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3355 %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4) 3356 ret <32 x half> %2 3357} 3358 3359define <32 x half> @stack_fold_fmaddsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { 3360; CHECK-LABEL: stack_fold_fmaddsub132ph: 3361; CHECK: # %bb.0: 3362; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3363; CHECK-NEXT: #APP 3364; CHECK-NEXT: nop 3365; CHECK-NEXT: #NO_APP 3366; CHECK-NEXT: vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 3367; CHECK-NEXT: retq 3368 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3369 %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4) 3370 ret <32 x half> %2 3371} 3372 3373define <32 x half> @stack_fold_fmaddsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) { 3374; CHECK-LABEL: stack_fold_fmaddsub312ph: 3375; CHECK: # %bb.0: 3376; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3377; CHECK-NEXT: #APP 3378; CHECK-NEXT: nop 3379; CHECK-NEXT: #NO_APP 3380; CHECK-NEXT: vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 3381; CHECK-NEXT: retq 3382 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3383 %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4) 3384 ret <32 x half> %2 3385} 3386 3387define <32 x half> @stack_fold_fmaddsub123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { 3388; CHECK-LABEL: stack_fold_fmaddsub123ph_mask: 3389; CHECK: # %bb.0: 3390; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3391; CHECK-NEXT: #APP 3392; CHECK-NEXT: nop 3393; CHECK-NEXT: #NO_APP 3394; CHECK-NEXT: vmovaps (%rdi), %zmm2 3395; CHECK-NEXT: kmovd %esi, %k1 3396; CHECK-NEXT: vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3397; CHECK-NEXT: vmovaps %zmm2, %zmm0 3398; CHECK-NEXT: retq 3399 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3400 %a0 = load <32 x half>, ptr %p 3401 %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4) 3402 %3 = bitcast i32 %mask to <32 x i1> 3403 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 3404 ret <32 x half> %4 3405} 3406 3407define <32 x half> @stack_fold_fmaddsub213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { 3408; CHECK-LABEL: stack_fold_fmaddsub213ph_mask: 3409; CHECK: # %bb.0: 3410; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3411; CHECK-NEXT: #APP 3412; CHECK-NEXT: nop 3413; CHECK-NEXT: #NO_APP 3414; CHECK-NEXT: vmovaps (%rdi), %zmm2 3415; 
CHECK-NEXT: kmovd %esi, %k1 3416; CHECK-NEXT: vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3417; CHECK-NEXT: vmovaps %zmm2, %zmm0 3418; CHECK-NEXT: retq 3419 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3420 %a0 = load <32 x half>, ptr %p 3421 %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4) 3422 %3 = bitcast i32 %mask to <32 x i1> 3423 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 3424 ret <32 x half> %4 3425} 3426 3427define <32 x half> @stack_fold_fmaddsub231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) { 3428; CHECK-LABEL: stack_fold_fmaddsub231ph_mask: 3429; CHECK: # %bb.0: 3430; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3431; CHECK-NEXT: #APP 3432; CHECK-NEXT: nop 3433; CHECK-NEXT: #NO_APP 3434; CHECK-NEXT: vmovaps (%rdi), %zmm2 3435; CHECK-NEXT: kmovd %esi, %k1 3436; CHECK-NEXT: vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3437; CHECK-NEXT: vmovaps %zmm2, %zmm0 3438; CHECK-NEXT: retq 3439 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3440 %a0 = load <32 x half>, ptr %p 3441 %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4) 3442 %3 = bitcast i32 %mask to <32 x i1> 3443 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0 3444 ret <32 x half> 
%4
}

; Merge-masked fmaddsub variants: the <32 x half> operand is reloaded from %p
; and also serves as the mask passthru, so the reload folds into a 231/132 form.
define <32 x half> @stack_fold_fmaddsub321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmaddsub321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmaddsub132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmaddsub132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmaddsub312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmaddsub312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; Zero-masked fmaddsub variants: the mask is loaded from memory and the
; passthru is zeroinitializer ({z} masking on the folded instruction).
define <32 x half> @stack_fold_fmaddsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmaddsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmaddsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmaddsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmaddsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmaddsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

; fmsubadd tests: fmaddsub with the addend operand fneg'ed in IR; the
; negation is expected to fold into the vfmsubadd* opcode.
define <32 x half> @stack_fold_fmsubadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %2, i32 4)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsubadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %2, i32 4)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsubadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %2, i32 4)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsubadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %2, i32 4)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsubadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %2, i32 4)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsubadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %2, i32 4)
  ret <32 x half> %3
}

; Merge-masked fmsubadd variants.
define <32 x half> @stack_fold_fmsubadd123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsubadd213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsubadd231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsubadd321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsubadd132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsubadd312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

; Zero-masked fmsubadd variants.
define <32 x half> @stack_fold_fmsubadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsubadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsubadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsubadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsubadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsubadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}