; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.

define <8 x double> @stack_fold_addpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_addpd_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <8 x double> %a0, %a1
  ret <8 x double> %2
}

define <8 x double> @stack_fold_addpd_zmm_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_addpd_zmm_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd (%rsi), %zmm2
; CHECK-NEXT: vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovapd %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <8 x double> %a0, %a1
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, ptr %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_addpd_zmm_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_addpd_zmm_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd (%rsi), %zmm2
; CHECK-NEXT: vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovapd %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <8 x double> %a1, %a0
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, ptr %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_addpd_zmm_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: 
stack_fold_addpd_zmm_kz: 68; CHECK: # %bb.0: 69; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 70; CHECK-NEXT: #APP 71; CHECK-NEXT: nop 72; CHECK-NEXT: #NO_APP 73; CHECK-NEXT: kmovw %edi, %k1 74; CHECK-NEXT: vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 75; CHECK-NEXT: retq 76 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 77 %2 = fadd <8 x double> %a1, %a0 78 %3 = bitcast i8 %mask to <8 x i1> 79 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer 80 ret <8 x double> %4 81} 82 83define <16 x float> @stack_fold_addps_zmm(<16 x float> %a0, <16 x float> %a1) { 84; CHECK-LABEL: stack_fold_addps_zmm: 85; CHECK: # %bb.0: 86; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 87; CHECK-NEXT: #APP 88; CHECK-NEXT: nop 89; CHECK-NEXT: #NO_APP 90; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 91; CHECK-NEXT: retq 92 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 93 %2 = fadd <16 x float> %a0, %a1 94 ret <16 x float> %2 95} 96 97define <16 x float> @stack_fold_addps_zmm_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, ptr %passthru) { 98; CHECK-LABEL: stack_fold_addps_zmm_k: 99; CHECK: # %bb.0: 100; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 101; CHECK-NEXT: #APP 102; CHECK-NEXT: nop 103; CHECK-NEXT: #NO_APP 104; CHECK-NEXT: kmovw %edi, %k1 105; CHECK-NEXT: vmovaps (%rsi), %zmm2 106; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 107; CHECK-NEXT: vmovaps %zmm2, %zmm0 108; CHECK-NEXT: retq 109 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 110 %2 = fadd <16 x float> %a0, %a1 111 %3 = bitcast i16 %mask to <16 x i1> 112 %4 = load <16 x float>, ptr %passthru 113 %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4 114 ret <16 x float> %5 115} 116 117define <16 x float> @stack_fold_addps_zmm_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, ptr %passthru) { 118; CHECK-LABEL: stack_fold_addps_zmm_k_commuted: 119; CHECK: # %bb.0: 120; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 121; CHECK-NEXT: #APP 122; CHECK-NEXT: nop 123; CHECK-NEXT: #NO_APP 124; CHECK-NEXT: kmovw %edi, %k1 125; CHECK-NEXT: vmovaps (%rsi), %zmm2 126; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 127; CHECK-NEXT: vmovaps %zmm2, %zmm0 128; CHECK-NEXT: retq 129 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 130 %2 = fadd <16 x float> %a1, %a0 131 %3 = bitcast i16 %mask to <16 x i1> 132 %4 = load <16 x float>, ptr %passthru 133 %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4 134 ret <16 x float> %5 135} 136 137define <16 x float> @stack_fold_addps_zmm_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) { 138; CHECK-LABEL: stack_fold_addps_zmm_kz: 139; CHECK: # %bb.0: 140; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 141; CHECK-NEXT: #APP 142; CHECK-NEXT: nop 143; CHECK-NEXT: #NO_APP 144; CHECK-NEXT: kmovw %edi, %k1 145; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 146; CHECK-NEXT: retq 147 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 148 %2 = fadd <16 x float> %a1, %a0 149 %3 = bitcast i16 %mask to <16 x i1> 150 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer 151 ret <16 x float> %4 152} 153 154define double @stack_fold_addsd(double %a0, double %a1) { 155; CHECK-LABEL: stack_fold_addsd: 156; CHECK: # %bb.0: 157; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 158; CHECK-NEXT: #APP 159; CHECK-NEXT: nop 160; CHECK-NEXT: #NO_APP 161; CHECK-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload 162; CHECK-NEXT: retq 163 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 164 %2 = fadd double %a0, %a1 165 ret double %2 166} 167 168define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) { 169; CHECK-LABEL: stack_fold_addsd_int: 170; CHECK: # %bb.0: 171; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 172; CHECK-NEXT: #APP 173; CHECK-NEXT: nop 174; CHECK-NEXT: #NO_APP 175; CHECK-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 176; CHECK-NEXT: retq 177 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 178 %2 = extractelement <2 x double> %a0, i32 0 179 %3 = extractelement <2 x double> %a1, i32 0 180 %4 = fadd double %2, %3 181 %5 = insertelement <2 x double> %a0, double %4, i32 0 182 ret <2 x double> %5 183} 184 185define float @stack_fold_addss(float %a0, float %a1) { 186; CHECK-LABEL: stack_fold_addss: 187; CHECK: # %bb.0: 188; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 189; CHECK-NEXT: #APP 190; CHECK-NEXT: nop 191; CHECK-NEXT: #NO_APP 192; CHECK-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 193; CHECK-NEXT: retq 194 %1 = tail call <2 x i64> asm sideeffect 
"nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 195 %2 = fadd float %a0, %a1 196 ret float %2 197} 198 199define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) { 200; CHECK-LABEL: stack_fold_addss_int: 201; CHECK: # %bb.0: 202; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 203; CHECK-NEXT: #APP 204; CHECK-NEXT: nop 205; CHECK-NEXT: #NO_APP 206; CHECK-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 207; CHECK-NEXT: retq 208 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 209 %2 = extractelement <4 x float> %a0, i32 0 210 %3 = extractelement <4 x float> %a1, i32 0 211 %4 = fadd float %2, %3 212 %5 = insertelement <4 x float> %a0, float %4, i32 0 213 ret <4 x float> %5 214} 215 216define <8 x double> @stack_fold_andnpd_zmm(<8 x double> %a0, <8 x double> %a1) { 217; CHECK-LABEL: stack_fold_andnpd_zmm: 218; CHECK: # %bb.0: 219; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 220; CHECK-NEXT: #APP 221; CHECK-NEXT: nop 222; CHECK-NEXT: #NO_APP 223; CHECK-NEXT: vandnpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 224; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 225; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 226; CHECK-NEXT: retq 227 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 228 %2 = bitcast <8 x double> %a0 to <8 x i64> 229 %3 = bitcast <8 x double> %a1 to <8 x i64> 230 %4 = xor <8 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> 231 %5 = and <8 x i64> %4, %3 232 %6 = bitcast <8 x i64> %5 to <8 x double> 233 ; fadd forces execution domain 234 %7 = fadd <8 x double> %6, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0> 235 ret <8 x double> %7 236} 237 238define <16 x float> @stack_fold_andnps_zmm(<16 x float> %a0, <16 x float> %a1) { 239; CHECK-LABEL: stack_fold_andnps_zmm: 240; CHECK: # %bb.0: 241; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 242; CHECK-NEXT: #APP 243; CHECK-NEXT: nop 244; CHECK-NEXT: #NO_APP 245; CHECK-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 246; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 247; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 248; CHECK-NEXT: retq 249 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 250 %2 = bitcast <16 x float> %a0 to <16 x i32> 251 %3 = bitcast <16 x float> %a1 to <16 x i32> 252 %4 = xor <16 x i32> %2, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, 
i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> 253 %5 = and <16 x i32> %4, %3 254 %6 = bitcast <16 x i32> %5 to <16 x float> 255 ; fadd forces execution domain 256 %7 = fadd <16 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0> 257 ret <16 x float> %7 258} 259 260define <8 x double> @stack_fold_andpd_zmm(<8 x double> %a0, <8 x double> %a1) { 261; CHECK-LABEL: stack_fold_andpd_zmm: 262; CHECK: # %bb.0: 263; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 264; CHECK-NEXT: #APP 265; CHECK-NEXT: nop 266; CHECK-NEXT: #NO_APP 267; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 268; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 269; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 270; CHECK-NEXT: retq 271 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 272 %2 = bitcast <8 x double> %a0 to <8 x i64> 273 %3 = bitcast <8 x double> %a1 to <8 x i64> 274 %4 = and <8 x i64> %2, %3 275 %5 = bitcast <8 x i64> %4 to <8 x double> 276 ; fadd forces execution domain 277 %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0> 278 ret <8 x double> %6 279} 280 281define <16 x float> @stack_fold_andps_zmm(<16 x float> %a0, <16 x float> %a1) { 282; CHECK-LABEL: stack_fold_andps_zmm: 283; CHECK: # %bb.0: 284; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 285; CHECK-NEXT: #APP 286; CHECK-NEXT: nop 287; CHECK-NEXT: #NO_APP 288; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 289; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 290; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 291; CHECK-NEXT: retq 292 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 293 %2 = bitcast <16 x float> %a0 to <16 x i32> 294 %3 = bitcast <16 x float> %a1 to <16 x i32> 295 %4 = and <16 x i32> %2, %3 296 %5 = bitcast <16 x i32> %4 to <16 x float> 297 ; fadd forces execution domain 298 %6 = fadd <16 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0> 299 ret <16 x float> %6 300} 301 302define i8 @stack_fold_cmppd(<8 x double> %a0, <8 x double> %a1) { 303; CHECK-LABEL: stack_fold_cmppd: 304; CHECK: # %bb.0: 305; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 306; CHECK-NEXT: #APP 307; CHECK-NEXT: nop 308; CHECK-NEXT: #NO_APP 309; CHECK-NEXT: vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload 310; CHECK-NEXT: kmovw %k0, %eax 311; CHECK-NEXT: # kill: def $al killed $al killed $eax 312; CHECK-NEXT: vzeroupper 313; CHECK-NEXT: retq 314 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 315 %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a0, <8 x double> %a1, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4) 316 %2 = bitcast <8 x i1> %res to i8 317 ret i8 %2 318} 319declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, <8 x i1>, i32) 320 321define <8 x double> @stack_fold_cmppd_mask(<8 x double> %a0, <8 x double> %a1, ptr %a2, i8 %mask, <8 x double> %b0, <8 x double> %b1) { 322; CHECK-LABEL: stack_fold_cmppd_mask: 323; CHECK: # %bb.0: 324; CHECK-NEXT: subq $136, %rsp 325; CHECK-NEXT: .cfi_def_cfa_offset 144 326; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 327; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill 328; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 329; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 330; CHECK-NEXT: #APP 331; CHECK-NEXT: nop 332; CHECK-NEXT: #NO_APP 333; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 334; CHECK-NEXT: vaddpd (%rdi), %zmm0, %zmm0 335; CHECK-NEXT: vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload 336; CHECK-NEXT: kmovw %esi, %k1 337; CHECK-NEXT: kandb %k0, %k1, %k1 338; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 339; CHECK-NEXT: vmovupd (%rsp), %zmm1 # 64-byte Reload 340; CHECK-NEXT: vmovapd %zmm1, %zmm0 {%k1} 341; CHECK-NEXT: addq $136, %rsp 342; CHECK-NEXT: .cfi_def_cfa_offset 8 343; CHECK-NEXT: retq 344 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 345 ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load 346 %2 = load <8 x double>, ptr %a2 347 %3 = fadd <8 x double> %a1, %2 348 %4 = bitcast i8 %mask to <8 x i1> 349 %5 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %3, <8 x double> %a0, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4) 350 %6 = and <8 x i1> %4, %5 351 %7 = select <8 x i1> %6, <8 x double> %b0, <8 x double> %b1 352 ret <8 x double> %7 353} 354 355define <8 x double> @stack_fold_cmppd_mask_commuted(<8 x double> %a0, <8 x double> %a1, ptr %a2, i8 %mask, <8 x double> %b0, <8 x double> %b1) { 356; CHECK-LABEL: stack_fold_cmppd_mask_commuted: 357; CHECK: # %bb.0: 358; CHECK-NEXT: subq $136, %rsp 359; CHECK-NEXT: .cfi_def_cfa_offset 144 360; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 361; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill 362; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 363; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 364; CHECK-NEXT: #APP 365; CHECK-NEXT: nop 366; CHECK-NEXT: #NO_APP 367; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 368; CHECK-NEXT: vaddpd (%rdi), %zmm0, %zmm0 369; CHECK-NEXT: vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload 370; CHECK-NEXT: kmovw %esi, %k1 371; 
CHECK-NEXT: kandb %k0, %k1, %k1 372; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 373; CHECK-NEXT: vmovupd (%rsp), %zmm1 # 64-byte Reload 374; CHECK-NEXT: vmovapd %zmm1, %zmm0 {%k1} 375; CHECK-NEXT: addq $136, %rsp 376; CHECK-NEXT: .cfi_def_cfa_offset 8 377; CHECK-NEXT: retq 378 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 379 ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load 380 %2 = load <8 x double>, ptr %a2 381 %3 = fadd <8 x double> %a1, %2 382 %4 = bitcast i8 %mask to <8 x i1> 383 %5 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a0, <8 x double> %3, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4) 384 %6 = and <8 x i1> %4, %5 385 %7 = select <8 x i1> %6, <8 x double> %b0, <8 x double> %b1 386 ret <8 x double> %7 387} 388 389define i16 @stack_fold_cmpps(<16 x float> %a0, <16 x float> %a1) { 390; CHECK-LABEL: stack_fold_cmpps: 391; CHECK: # %bb.0: 392; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 393; CHECK-NEXT: #APP 394; CHECK-NEXT: nop 395; CHECK-NEXT: #NO_APP 396; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload 397; CHECK-NEXT: kmovw %k0, %eax 398; CHECK-NEXT: # kill: def $ax killed $ax killed $eax 399; CHECK-NEXT: vzeroupper 400; CHECK-NEXT: retq 401 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 402 %res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4) 403 %2 = bitcast <16 x i1> %res to i16 404 ret i16 %2 405} 406declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32) 407 408define <16 x float> @stack_fold_cmpps_mask(<16 x float> %a0, <16 x float> %a1, ptr %a2, i16 %mask, <16 x float> %b0, <16 x float> %b1) { 409; CHECK-LABEL: stack_fold_cmpps_mask: 410; CHECK: # %bb.0: 411; CHECK-NEXT: subq $136, %rsp 412; CHECK-NEXT: .cfi_def_cfa_offset 144 413; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 414; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill 415; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 416; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 417; CHECK-NEXT: #APP 418; CHECK-NEXT: nop 419; CHECK-NEXT: #NO_APP 420; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 421; CHECK-NEXT: vaddps (%rdi), %zmm0, %zmm0 422; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload 423; CHECK-NEXT: kmovw %esi, %k1 424; CHECK-NEXT: kandw %k0, %k1, %k1 425; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 426; CHECK-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload 427; CHECK-NEXT: vmovaps %zmm1, %zmm0 {%k1} 428; CHECK-NEXT: addq $136, %rsp 429; CHECK-NEXT: 
.cfi_def_cfa_offset 8 430; CHECK-NEXT: retq 431 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 432 ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load 433 %2 = load <16 x float>, ptr %a2 434 %3 = fadd <16 x float> %a1, %2 435 %4 = bitcast i16 %mask to <16 x i1> 436 %5 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %3, <16 x float> %a0, i32 0, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4) 437 %6 = and <16 x i1> %4, %5 438 %7 = select <16 x i1> %6, <16 x float> %b0, <16 x float> %b1 439 ret <16 x float> %7 440} 441 442define <16 x float> @stack_fold_cmpps_mask_commuted(<16 x float> %a0, <16 x float> %a1, ptr %a2, i16 %mask, <16 x float> %b0, <16 x float> %b1) { 443; CHECK-LABEL: stack_fold_cmpps_mask_commuted: 444; CHECK: # %bb.0: 445; CHECK-NEXT: subq $136, %rsp 446; CHECK-NEXT: .cfi_def_cfa_offset 144 447; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 448; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill 449; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 450; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 451; CHECK-NEXT: #APP 452; CHECK-NEXT: nop 453; CHECK-NEXT: #NO_APP 454; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 455; CHECK-NEXT: vaddps (%rdi), %zmm0, %zmm0 456; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload 457; CHECK-NEXT: kmovw %esi, %k1 458; CHECK-NEXT: kandw %k0, %k1, %k1 459; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 460; CHECK-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload 461; CHECK-NEXT: vmovaps %zmm1, %zmm0 {%k1} 462; CHECK-NEXT: addq $136, %rsp 463; CHECK-NEXT: .cfi_def_cfa_offset 8 464; CHECK-NEXT: retq 465 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 466 ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load 467 %2 = load <16 x float>, ptr %a2 468 %3 = fadd <16 x float> %a1, %2 469 %4 = bitcast i16 %mask to <16 x i1> 470 %5 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a0, <16 x float> %3, i32 0, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4) 471 %6 = and <16 x i1> %4, %5 472 %7 = select <16 x i1> %6, <16 x float> %b0, <16 x float> %b1 473 ret <16 x float> %7 474} 475 476define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) { 477; CHECK-LABEL: stack_fold_divsd_int: 478; CHECK: # %bb.0: 479; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 480; CHECK-NEXT: #APP 481; CHECK-NEXT: nop 482; CHECK-NEXT: #NO_APP 483; CHECK-NEXT: vdivsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 484; CHECK-NEXT: retq 485 %1 = tail call <2 
x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 486 %2 = extractelement <2 x double> %a0, i32 0 487 %3 = extractelement <2 x double> %a1, i32 0 488 %4 = fdiv double %2, %3 489 %5 = insertelement <2 x double> %a0, double %4, i32 0 490 ret <2 x double> %5 491} 492 493define float @stack_fold_divss(float %a0, float %a1) { 494; CHECK-LABEL: stack_fold_divss: 495; CHECK: # %bb.0: 496; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 497; CHECK-NEXT: #APP 498; CHECK-NEXT: nop 499; CHECK-NEXT: #NO_APP 500; CHECK-NEXT: vdivss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 501; CHECK-NEXT: retq 502 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 503 %2 = fdiv float %a0, %a1 504 ret float %2 505} 506 507define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) { 508; CHECK-LABEL: stack_fold_divss_int: 509; CHECK: # %bb.0: 510; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 511; CHECK-NEXT: #APP 512; CHECK-NEXT: nop 513; CHECK-NEXT: #NO_APP 514; CHECK-NEXT: vdivss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 515; CHECK-NEXT: retq 516 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 517 %2 = extractelement <4 x float> %a0, i32 0 518 %3 = extractelement <4 x float> %a1, i32 0 519 %4 = fdiv float %2, %3 520 %5 = insertelement <4 x float> %a0, float %4, i32 0 521 ret <4 x float> %5 522} 523 524define <8 x double> @stack_fold_cvtdq2pd(<8 x i32> %a0) { 525; CHECK-LABEL: stack_fold_cvtdq2pd: 526; CHECK: # %bb.0: 527; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 528; CHECK-NEXT: #APP 529; CHECK-NEXT: nop 530; CHECK-NEXT: #NO_APP 531; CHECK-NEXT: vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload 532; CHECK-NEXT: retq 533 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 534 %2 = sitofp <8 x i32> %a0 to <8 x double> 535 ret <8 x double> %2 536} 537 538define <8 x double> @stack_fold_cvtudq2pd(<8 x i32> %a0) { 539; CHECK-LABEL: stack_fold_cvtudq2pd: 540; CHECK: # %bb.0: 541; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 542; CHECK-NEXT: #APP 543; CHECK-NEXT: nop 544; CHECK-NEXT: #NO_APP 545; CHECK-NEXT: vcvtudq2pd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload 546; CHECK-NEXT: retq 547 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 548 %2 = uitofp <8 x i32> %a0 to <8 x double> 549 ret <8 x double> %2 550} 551 552define <8 x float> @stack_fold_cvtpd2ps(<8 x double> %a0) { 553; CHECK-LABEL: stack_fold_cvtpd2ps: 554; CHECK: # %bb.0: 555; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 556; CHECK-NEXT: #APP 557; CHECK-NEXT: nop 558; CHECK-NEXT: #NO_APP 559; CHECK-NEXT: vcvtpd2ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 64-byte Folded Reload 560; CHECK-NEXT: retq 561 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 562 %2 = fptrunc <8 x double> %a0 to <8 x float> 563 ret <8 x float> %2 564} 565 566define <16 x float> @stack_fold_cvtph2ps(<16 x i16> %a0) { 567; CHECK-LABEL: stack_fold_cvtph2ps: 568; CHECK: # %bb.0: 569; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 570; CHECK-NEXT: #APP 571; CHECK-NEXT: nop 572; CHECK-NEXT: #NO_APP 573; CHECK-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload 574; CHECK-NEXT: retq 575 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 576 %2 = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> undef, i16 -1, i32 4) 577 ret <16 x float> %2 578} 579declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly 580 581define <16 x i16> @stack_fold_cvtps2ph(<16 x float> %a0) { 582; CHECK-LABEL: stack_fold_cvtps2ph: 583; CHECK: # %bb.0: 584; CHECK-NEXT: vcvtps2ph $0, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 585; CHECK-NEXT: #APP 586; CHECK-NEXT: nop 587; CHECK-NEXT: #NO_APP 588; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 589; CHECK-NEXT: retq 590 %1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 0, <16 x i16> undef, i16 -1) 591 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 592 ret <16 x i16> %1 593} 594declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly 595 596define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) { 597; CHECK-LABEL: stack_fold_insertps: 598; CHECK: # %bb.0: 599; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 600; CHECK-NEXT: #APP 601; CHECK-NEXT: nop 602; CHECK-NEXT: #NO_APP 603; CHECK-NEXT: vinsertps $17, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 604; CHECK-NEXT: # xmm0 = zero,mem[0],xmm0[2,3] 605; CHECK-NEXT: retq 606 %1 = tail call <2 x 
i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 607 %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209) 608 ret <4 x float> %2 609} 610declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone 611 612define <8 x double> @stack_fold_maxpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 { 613; CHECK-LABEL: stack_fold_maxpd_zmm: 614; CHECK: # %bb.0: 615; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 616; CHECK-NEXT: #APP 617; CHECK-NEXT: nop 618; CHECK-NEXT: #NO_APP 619; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 620; CHECK-NEXT: retq 621 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 622 %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) 623 ret <8 x double> %2 624} 625declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32) nounwind readnone 626 627define <8 x double> @stack_fold_maxpd_zmm_commutable(<8 x double> %a0, <8 x double> %a1) #1 { 628; CHECK-LABEL: stack_fold_maxpd_zmm_commutable: 629; CHECK: # %bb.0: 630; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 631; CHECK-NEXT: #APP 632; CHECK-NEXT: nop 633; CHECK-NEXT: #NO_APP 634; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 635; CHECK-NEXT: retq 636 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 637 %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) 638 ret <8 x double> %2 639} 640 641define <8 x double> @stack_fold_maxpd_zmm_commutable_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, ptr %passthru) #1 { 642; CHECK-LABEL: stack_fold_maxpd_zmm_commutable_k: 643; CHECK: # %bb.0: 644; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 645; CHECK-NEXT: #APP 646; CHECK-NEXT: nop 647; CHECK-NEXT: #NO_APP 648; CHECK-NEXT: kmovw %edi, %k1 649; CHECK-NEXT: vmovapd (%rsi), %zmm2 650; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 651; CHECK-NEXT: vmovapd %zmm2, %zmm0 652; CHECK-NEXT: retq 653 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 654 %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) 655 %3 = bitcast i8 %mask to <8 x i1> 656 %4 = load <8 x double>, ptr %passthru 657 %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4 658 ret <8 x double> %5 659} 660 661define <8 x double> 
@stack_fold_maxpd_zmm_commutable_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, ptr %passthru) #1 { 662; CHECK-LABEL: stack_fold_maxpd_zmm_commutable_k_commuted: 663; CHECK: # %bb.0: 664; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 665; CHECK-NEXT: #APP 666; CHECK-NEXT: nop 667; CHECK-NEXT: #NO_APP 668; CHECK-NEXT: kmovw %edi, %k1 669; CHECK-NEXT: vmovapd (%rsi), %zmm2 670; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 671; CHECK-NEXT: vmovapd %zmm2, %zmm0 672; CHECK-NEXT: retq 673 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 674 %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a1, <8 x double> %a0, i32 4) 675 %3 = bitcast i8 %mask to <8 x i1> 676 %4 = load <8 x double>, ptr %passthru 677 %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4 678 ret <8 x double> %5 679} 680 681define <8 x double> @stack_fold_maxpd_zmm_commutable_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #1 { 682; CHECK-LABEL: stack_fold_maxpd_zmm_commutable_kz: 683; CHECK: # %bb.0: 684; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 685; CHECK-NEXT: #APP 686; CHECK-NEXT: nop 687; CHECK-NEXT: #NO_APP 688; CHECK-NEXT: kmovw %edi, %k1 689; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 690; CHECK-NEXT: retq 691 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 692 %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a1, <8 x double> %a0, i32 4) 693 %3 = bitcast i8 %mask to <8 x i1> 694 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer 695 ret <8 x double> %4 696} 697 698define <16 x float> @stack_fold_maxps_zmm(<16 x float> %a0, <16 x float> %a1) #0 { 699; CHECK-LABEL: stack_fold_maxps_zmm: 700; CHECK: # %bb.0: 701; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 702; CHECK-NEXT: #APP 703; CHECK-NEXT: nop 704; CHECK-NEXT: #NO_APP 705; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 706; CHECK-NEXT: retq 707 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 708 %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) 709 ret <16 x float> %2 710} 711declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32) nounwind readnone 712 713define <16 x float> @stack_fold_maxps_zmm_commutable(<16 x float> %a0, <16 x float> %a1) #1 { 714; CHECK-LABEL: stack_fold_maxps_zmm_commutable: 715; CHECK: # %bb.0: 716; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 717; CHECK-NEXT: #APP 718; CHECK-NEXT: nop 719; CHECK-NEXT: #NO_APP 720; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte 
Folded Reload 721; CHECK-NEXT: retq 722 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 723 %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) 724 ret <16 x float> %2 725} 726 727define <16 x float> @stack_fold_maxps_zmm_commutable_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, ptr %passthru) #1 { 728; CHECK-LABEL: stack_fold_maxps_zmm_commutable_k: 729; CHECK: # %bb.0: 730; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 731; CHECK-NEXT: #APP 732; CHECK-NEXT: nop 733; CHECK-NEXT: #NO_APP 734; CHECK-NEXT: kmovw %edi, %k1 735; CHECK-NEXT: vmovaps (%rsi), %zmm2 736; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 737; CHECK-NEXT: vmovaps %zmm2, %zmm0 738; CHECK-NEXT: retq 739 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 740 %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) 741 %3 = bitcast i16 %mask to <16 x i1> 742 %4 = load <16 x float>, ptr %passthru 743 %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4 744 ret <16 x float> %5 745} 746 747define <16 x float> @stack_fold_maxps_zmm_commutable_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, ptr %passthru) #1 { 748; CHECK-LABEL: stack_fold_maxps_zmm_commutable_k_commuted: 749; CHECK: # %bb.0: 750; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 751; CHECK-NEXT: #APP 752; CHECK-NEXT: nop 753; CHECK-NEXT: #NO_APP 754; CHECK-NEXT: kmovw %edi, %k1 755; CHECK-NEXT: vmovaps (%rsi), %zmm2 756; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 757; CHECK-NEXT: vmovaps %zmm2, %zmm0 758; CHECK-NEXT: retq 759 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 760 %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a1, <16 x float> %a0, i32 4) 761 %3 = bitcast i16 %mask to <16 x i1> 762 %4 = load <16 x float>, ptr %passthru 763 %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4 764 ret <16 x float> %5 765} 766 767define <16 x float> @stack_fold_maxps_zmm_commutable_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #1 { 768; CHECK-LABEL: stack_fold_maxps_zmm_commutable_kz: 769; CHECK: # %bb.0: 770; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 771; CHECK-NEXT: #APP 772; CHECK-NEXT: nop 773; CHECK-NEXT: #NO_APP 774; CHECK-NEXT: kmovw %edi, %k1 775; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 776; CHECK-NEXT: retq 777 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 778 %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a1, <16 x float> %a0, i32 4) 779 %3 = bitcast i16 %mask to <16 x i1> 780 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer 781 ret <16 x float> %4 782} 783 784define <8 x double> @stack_fold_minpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 { 785; CHECK-LABEL: stack_fold_minpd_zmm: 786; CHECK: # %bb.0: 787; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 788; CHECK-NEXT: #APP 789; CHECK-NEXT: nop 790; CHECK-NEXT: #NO_APP 791; CHECK-NEXT: vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 792; CHECK-NEXT: retq 793 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 794 %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) 795 ret <8 x double> %2 796} 797declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32) nounwind readnone 798 799define <8 x double> @stack_fold_minpd_zmm_commutable(<8 x double> %a0, <8 x double> %a1) #1 { 800; CHECK-LABEL: stack_fold_minpd_zmm_commutable: 801; CHECK: # %bb.0: 802; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 803; CHECK-NEXT: #APP 804; CHECK-NEXT: nop 805; CHECK-NEXT: #NO_APP 806; CHECK-NEXT: vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 807; CHECK-NEXT: retq 808 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 809 %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) 810 ret <8 x double> %2 811} 812 813define <8 x double> @stack_fold_minpd_zmm_commutable_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, ptr %passthru) #1 { 814; CHECK-LABEL: stack_fold_minpd_zmm_commutable_k: 815; CHECK: # %bb.0: 816; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 817; CHECK-NEXT: #APP 818; CHECK-NEXT: nop 819; CHECK-NEXT: #NO_APP 820; CHECK-NEXT: kmovw %edi, %k1 821; CHECK-NEXT: vmovapd (%rsi), %zmm2 822; CHECK-NEXT: vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 823; CHECK-NEXT: vmovapd %zmm2, %zmm0 824; CHECK-NEXT: retq 825 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 826 %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) 827 %3 = bitcast i8 %mask to <8 x i1> 828 %4 = load <8 x double>, ptr %passthru 829 %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4 830 ret <8 x double> %5 831} 832 833define <8 x double> 
@stack_fold_minpd_zmm_commutable_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, ptr %passthru) #1 { 834; CHECK-LABEL: stack_fold_minpd_zmm_commutable_k_commuted: 835; CHECK: # %bb.0: 836; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 837; CHECK-NEXT: #APP 838; CHECK-NEXT: nop 839; CHECK-NEXT: #NO_APP 840; CHECK-NEXT: kmovw %edi, %k1 841; CHECK-NEXT: vmovapd (%rsi), %zmm2 842; CHECK-NEXT: vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 843; CHECK-NEXT: vmovapd %zmm2, %zmm0 844; CHECK-NEXT: retq 845 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 846 %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a1, <8 x double> %a0, i32 4) 847 %3 = bitcast i8 %mask to <8 x i1> 848 %4 = load <8 x double>, ptr %passthru 849 %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4 850 ret <8 x double> %5 851} 852 853define <8 x double> @stack_fold_minpd_zmm_commutable_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #1 { 854; CHECK-LABEL: stack_fold_minpd_zmm_commutable_kz: 855; CHECK: # %bb.0: 856; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 857; CHECK-NEXT: #APP 858; CHECK-NEXT: nop 859; CHECK-NEXT: #NO_APP 860; CHECK-NEXT: kmovw %edi, %k1 861; CHECK-NEXT: vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 862; CHECK-NEXT: retq 863 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 864 %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a1, <8 x double> %a0, i32 4) 865 %3 = bitcast i8 %mask to <8 x i1> 866 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer 867 ret <8 x double> %4 868} 869 870define <16 x float> @stack_fold_minps_zmm(<16 x float> %a0, <16 x float> %a1) #0 { 871; CHECK-LABEL: stack_fold_minps_zmm: 872; CHECK: # %bb.0: 873; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 874; CHECK-NEXT: #APP 875; CHECK-NEXT: nop 876; CHECK-NEXT: #NO_APP 877; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 878; CHECK-NEXT: retq 879 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 880 %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) 881 ret <16 x float> %2 882} 883declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32) nounwind readnone 884 885define <16 x float> @stack_fold_minps_zmm_commutable(<16 x float> %a0, <16 x float> %a1) #1 { 886; CHECK-LABEL: stack_fold_minps_zmm_commutable: 887; CHECK: # %bb.0: 888; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 889; CHECK-NEXT: #APP 890; CHECK-NEXT: nop 891; CHECK-NEXT: #NO_APP 892; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte 
Folded Reload 893; CHECK-NEXT: retq 894 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 895 %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) 896 ret <16 x float> %2 897} 898 899define <16 x float> @stack_fold_minps_zmm_commutable_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, ptr %passthru) #1 { 900; CHECK-LABEL: stack_fold_minps_zmm_commutable_k: 901; CHECK: # %bb.0: 902; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 903; CHECK-NEXT: #APP 904; CHECK-NEXT: nop 905; CHECK-NEXT: #NO_APP 906; CHECK-NEXT: kmovw %edi, %k1 907; CHECK-NEXT: vmovaps (%rsi), %zmm2 908; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 909; CHECK-NEXT: vmovaps %zmm2, %zmm0 910; CHECK-NEXT: retq 911 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 912 %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) 913 %3 = bitcast i16 %mask to <16 x i1> 914 %4 = load <16 x float>, ptr %passthru 915 %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4 916 ret <16 x float> %5 917} 918 919define <16 x float> @stack_fold_minps_zmm_commutable_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, ptr %passthru) #1 { 920; CHECK-LABEL: stack_fold_minps_zmm_commutable_k_commuted: 921; CHECK: # %bb.0: 922; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 923; CHECK-NEXT: #APP 924; CHECK-NEXT: nop 925; CHECK-NEXT: #NO_APP 926; CHECK-NEXT: kmovw %edi, %k1 927; CHECK-NEXT: vmovaps (%rsi), %zmm2 928; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 929; CHECK-NEXT: vmovaps %zmm2, %zmm0 930; CHECK-NEXT: retq 931 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 932 %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a1, <16 x float> %a0, i32 4) 933 %3 = bitcast i16 %mask to <16 x i1> 934 %4 = load <16 x float>, ptr %passthru 935 %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4 936 ret <16 x float> %5 937} 938 939define <16 x float> @stack_fold_minps_zmm_commutable_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #1 { 940; CHECK-LABEL: stack_fold_minps_zmm_commutable_kz: 941; CHECK: # %bb.0: 942; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 943; CHECK-NEXT: #APP 944; CHECK-NEXT: nop 945; CHECK-NEXT: #NO_APP 946; CHECK-NEXT: kmovw %edi, %k1 947; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 948; CHECK-NEXT: retq 949 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 950 %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a1, <16 x float> %a0, i32 4) 951 %3 = bitcast i16 %mask to <16 x i1> 952 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer 953 ret <16 x float> %4 954} 955 956define <8 x double> @stack_fold_mulpd_zmm(<8 x double> %a0, <8 x double> %a1) { 957; CHECK-LABEL: stack_fold_mulpd_zmm: 958; CHECK: # %bb.0: 959; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 960; CHECK-NEXT: #APP 961; CHECK-NEXT: nop 962; CHECK-NEXT: #NO_APP 963; CHECK-NEXT: vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 964; CHECK-NEXT: retq 965 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 966 %2 = fmul <8 x double> %a0, %a1 967 ret <8 x double> %2 968} 969 970define <8 x double> @stack_fold_mulpd_zmm_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, ptr %passthru) { 971; CHECK-LABEL: stack_fold_mulpd_zmm_k: 972; CHECK: # %bb.0: 973; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 974; CHECK-NEXT: #APP 975; CHECK-NEXT: nop 976; CHECK-NEXT: #NO_APP 977; CHECK-NEXT: kmovw %edi, %k1 978; CHECK-NEXT: vmovapd (%rsi), %zmm2 979; CHECK-NEXT: vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 980; CHECK-NEXT: vmovapd %zmm2, %zmm0 981; CHECK-NEXT: retq 982 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 983 %2 = fmul <8 x double> %a0, %a1 984 %3 = bitcast i8 %mask to <8 x i1> 985 %4 = load <8 x double>, ptr %passthru 986 %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4 987 ret <8 x double> %5 988} 989 990define <8 x double> @stack_fold_mulpd_zmm_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, ptr %passthru) { 991; CHECK-LABEL: stack_fold_mulpd_zmm_k_commuted: 992; CHECK: # %bb.0: 993; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 994; CHECK-NEXT: #APP 995; CHECK-NEXT: nop 996; CHECK-NEXT: #NO_APP 997; CHECK-NEXT: kmovw %edi, %k1 998; CHECK-NEXT: vmovapd (%rsi), %zmm2 999; CHECK-NEXT: vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 1000; CHECK-NEXT: vmovapd %zmm2, %zmm0 1001; CHECK-NEXT: retq 1002 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1003 %2 = fmul <8 x double> %a1, %a0 1004 %3 = bitcast i8 %mask to <8 x i1> 1005 %4 = load <8 x double>, ptr %passthru 1006 %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4 1007 ret <8 x double> %5 1008} 1009 1010define <8 x double> @stack_fold_mulpd_zmm_kz(<8 
x double> %a0, <8 x double> %a1, i8 %mask) { 1011; CHECK-LABEL: stack_fold_mulpd_zmm_kz: 1012; CHECK: # %bb.0: 1013; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1014; CHECK-NEXT: #APP 1015; CHECK-NEXT: nop 1016; CHECK-NEXT: #NO_APP 1017; CHECK-NEXT: kmovw %edi, %k1 1018; CHECK-NEXT: vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1019; CHECK-NEXT: retq 1020 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1021 %2 = fmul <8 x double> %a1, %a0 1022 %3 = bitcast i8 %mask to <8 x i1> 1023 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer 1024 ret <8 x double> %4 1025} 1026 1027define <16 x float> @stack_fold_mulps_zmm(<16 x float> %a0, <16 x float> %a1) { 1028; CHECK-LABEL: stack_fold_mulps_zmm: 1029; CHECK: # %bb.0: 1030; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1031; CHECK-NEXT: #APP 1032; CHECK-NEXT: nop 1033; CHECK-NEXT: #NO_APP 1034; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1035; CHECK-NEXT: retq 1036 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1037 %2 = fmul <16 x float> %a0, %a1 1038 ret <16 x float> %2 1039} 1040 1041define <16 x float> @stack_fold_mulps_zmm_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, ptr %passthru) { 1042; CHECK-LABEL: stack_fold_mulps_zmm_k: 1043; CHECK: # %bb.0: 1044; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1045; CHECK-NEXT: #APP 1046; CHECK-NEXT: nop 1047; CHECK-NEXT: #NO_APP 1048; CHECK-NEXT: kmovw %edi, %k1 1049; CHECK-NEXT: vmovaps (%rsi), %zmm2 1050; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 1051; CHECK-NEXT: vmovaps %zmm2, %zmm0 1052; CHECK-NEXT: retq 1053 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1054 %2 = fmul <16 x float> %a0, %a1 1055 %3 = bitcast i16 %mask to <16 x i1> 1056 %4 = load <16 x float>, ptr %passthru 1057 %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4 1058 ret <16 x float> %5 1059} 1060 1061define <16 x float> @stack_fold_mulps_zmm_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, ptr %passthru) { 1062; CHECK-LABEL: stack_fold_mulps_zmm_k_commuted: 1063; CHECK: # %bb.0: 1064; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1065; CHECK-NEXT: #APP 1066; CHECK-NEXT: nop 1067; CHECK-NEXT: #NO_APP 1068; CHECK-NEXT: kmovw %edi, %k1 1069; CHECK-NEXT: vmovaps (%rsi), %zmm2 1070; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 1071; CHECK-NEXT: vmovaps %zmm2, %zmm0 1072; CHECK-NEXT: retq 1073 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1074 %2 = fmul <16 x float> %a1, %a0 1075 %3 = bitcast i16 %mask to <16 x i1> 1076 %4 = load <16 x float>, ptr %passthru 1077 %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4 1078 ret <16 x float> %5 1079} 1080 1081define <16 x float> @stack_fold_mulps_zmm_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) { 1082; CHECK-LABEL: stack_fold_mulps_zmm_kz: 1083; CHECK: # %bb.0: 1084; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1085; CHECK-NEXT: #APP 1086; CHECK-NEXT: nop 1087; CHECK-NEXT: #NO_APP 1088; CHECK-NEXT: kmovw %edi, %k1 1089; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1090; CHECK-NEXT: retq 1091 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1092 %2 = fmul <16 x float> %a1, %a0 1093 %3 = bitcast i16 %mask to <16 x i1> 1094 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer 1095 ret <16 x float> %4 1096} 1097 1098define double @stack_fold_mulsd(double %a0, double %a1) { 1099; CHECK-LABEL: stack_fold_mulsd: 1100; CHECK: # %bb.0: 1101; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 1102; CHECK-NEXT: #APP 1103; CHECK-NEXT: nop 1104; CHECK-NEXT: #NO_APP 1105; CHECK-NEXT: vmulsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload 1106; CHECK-NEXT: retq 1107 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1108 %2 = fmul double %a0, %a1 1109 ret double %2 1110} 1111 1112define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) { 1113; CHECK-LABEL: stack_fold_mulsd_int: 1114; CHECK: # %bb.0: 1115; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1116; CHECK-NEXT: #APP 1117; CHECK-NEXT: nop 1118; CHECK-NEXT: #NO_APP 1119; CHECK-NEXT: vmulsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1120; CHECK-NEXT: retq 1121 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1122 %2 = extractelement <2 x double> %a0, i32 0 1123 %3 = extractelement <2 x double> %a1, i32 0 1124 %4 = fmul double %2, %3 1125 %5 = insertelement <2 x double> %a0, double %4, i32 0 1126 ret <2 x double> %5 1127} 1128 1129define float @stack_fold_mulss(float %a0, float %a1) { 1130; CHECK-LABEL: stack_fold_mulss: 1131; CHECK: # %bb.0: 1132; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1133; CHECK-NEXT: #APP 1134; CHECK-NEXT: nop 1135; CHECK-NEXT: #NO_APP 1136; CHECK-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 
1137; CHECK-NEXT: retq 1138 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1139 %2 = fmul float %a0, %a1 1140 ret float %2 1141} 1142 1143define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) { 1144; CHECK-LABEL: stack_fold_mulss_int: 1145; CHECK: # %bb.0: 1146; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1147; CHECK-NEXT: #APP 1148; CHECK-NEXT: nop 1149; CHECK-NEXT: #NO_APP 1150; CHECK-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1151; CHECK-NEXT: retq 1152 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1153 %2 = extractelement <4 x float> %a0, i32 0 1154 %3 = extractelement <4 x float> %a1, i32 0 1155 %4 = fmul float %2, %3 1156 %5 = insertelement <4 x float> %a0, float %4, i32 0 1157 ret <4 x float> %5 1158} 1159 1160define <8 x double> @stack_fold_orpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 { 1161; CHECK-LABEL: stack_fold_orpd_zmm: 1162; CHECK: # %bb.0: 1163; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1164; CHECK-NEXT: #APP 1165; CHECK-NEXT: nop 1166; CHECK-NEXT: #NO_APP 1167; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1168; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1169; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 1170; CHECK-NEXT: retq 1171 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1172 %2 = bitcast <8 x double> %a0 to <8 x i64> 1173 %3 = bitcast <8 x double> %a1 to <8 x i64> 1174 %4 = or <8 x i64> %2, %3 1175 %5 = bitcast <8 x i64> %4 to <8 x double> 1176 ; fadd forces execution domain 1177 %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0> 1178 ret <8 x double> %6 1179} 1180 1181define <16 x float> @stack_fold_orps_zmm(<16 x float> %a0, <16 x float> %a1) #0 { 1182; CHECK-LABEL: stack_fold_orps_zmm: 1183; CHECK: # %bb.0: 1184; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1185; CHECK-NEXT: #APP 1186; CHECK-NEXT: nop 1187; CHECK-NEXT: #NO_APP 1188; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1189; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 1190; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 1191; CHECK-NEXT: retq 1192 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1193 %2 = bitcast <16 x float> %a0 to <16 x i32> 1194 %3 = bitcast <16 x float> %a1 to <16 x i32> 1195 %4 = or <16 x i32> %2, %3 1196 %5 = bitcast <16 x i32> %4 to <16 x float> 
1197 ; fadd forces execution domain 1198 %6 = fadd <16 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0> 1199 ret <16 x float> %6 1200} 1201 1202define <8 x double> @stack_fold_shuff64x2(<8 x double> %a, <8 x double> %b) { 1203; CHECK-LABEL: stack_fold_shuff64x2: 1204; CHECK: # %bb.0: 1205; CHECK-NEXT: pushq %rax 1206; CHECK-NEXT: .cfi_def_cfa_offset 16 1207; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1208; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1209; CHECK-NEXT: #APP 1210; CHECK-NEXT: nop 1211; CHECK-NEXT: #NO_APP 1212; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 1213; CHECK-NEXT: vshuff64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1214; CHECK-NEXT: # zmm0 = zmm0[0,1,4,5],mem[2,3,0,1] 1215; CHECK-NEXT: popq %rax 1216; CHECK-NEXT: .cfi_def_cfa_offset 8 1217; CHECK-NEXT: retq 1218 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1219 %2 = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9> 1220 ret <8 x double> %2 1221} 1222 1223define <8 x double> @stack_fold_shuff64x2_mask(<8 x double> %a, <8 x double> %b, i8 %mask, ptr %passthru) { 1224; CHECK-LABEL: stack_fold_shuff64x2_mask: 1225; CHECK: # %bb.0: 1226; CHECK-NEXT: pushq %rax 1227; CHECK-NEXT: .cfi_def_cfa_offset 16 1228; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1229; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1230; CHECK-NEXT: #APP 1231; CHECK-NEXT: nop 1232; CHECK-NEXT: #NO_APP 1233; CHECK-NEXT: kmovw %edi, %k1 1234; CHECK-NEXT: vmovapd (%rsi), %zmm1 1235; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 1236; CHECK-NEXT: vshuff64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload 1237; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1] 1238; CHECK-NEXT: vmovapd %zmm1, %zmm0 1239; CHECK-NEXT: popq %rax 1240; CHECK-NEXT: .cfi_def_cfa_offset 8 1241; CHECK-NEXT: retq 1242 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1243 %2 = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9> 1244 %3 = bitcast i8 %mask to <8 x i1> 1245 ; load needed to keep the operation from being scheduled above the asm block 1246 %4 = load <8 x double>, ptr %passthru 1247 %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4 1248 ret <8 x double> %5 1249} 1250 1251define <8 x double> @stack_fold_shuff64x2_maskz(<8 x double> %a, <8 x double> %b, i8 %mask, ptr %passthru) { 1252; CHECK-LABEL: stack_fold_shuff64x2_maskz: 1253; CHECK: # %bb.0: 1254; CHECK-NEXT: pushq %rax 1255; CHECK-NEXT: .cfi_def_cfa_offset 16 1256; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1257; CHECK-NEXT: vmovups %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1258; CHECK-NEXT: #APP 1259; CHECK-NEXT: nop 1260; CHECK-NEXT: #NO_APP 1261; CHECK-NEXT: kmovw %edi, %k1 1262; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 1263; CHECK-NEXT: vshuff64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1264; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1] 1265; CHECK-NEXT: popq %rax 1266; CHECK-NEXT: .cfi_def_cfa_offset 8 1267; CHECK-NEXT: retq 1268 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1269 %2 = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9> 1270 %3 = bitcast i8 %mask to <8 x i1> 1271 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer 1272 ret <8 x double> %4 1273} 1274 1275define <16 x float> @stack_fold_shuff32x4_mask(<16 x float> %a, <16 x float> %b, i16 %mask, ptr %passthru) { 1276; CHECK-LABEL: stack_fold_shuff32x4_mask: 1277; CHECK: # %bb.0: 1278; CHECK-NEXT: pushq %rax 1279; CHECK-NEXT: .cfi_def_cfa_offset 16 1280; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1281; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1282; CHECK-NEXT: #APP 1283; CHECK-NEXT: nop 1284; CHECK-NEXT: #NO_APP 1285; CHECK-NEXT: kmovw %edi, %k1 1286; CHECK-NEXT: vmovaps (%rsi), %zmm1 1287; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 1288; CHECK-NEXT: vshuff32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload 1289; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3] 1290; CHECK-NEXT: vmovaps %zmm1, %zmm0 1291; CHECK-NEXT: popq %rax 1292; CHECK-NEXT: .cfi_def_cfa_offset 8 1293; CHECK-NEXT: retq 1294 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1295 %2 = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19> 1296 %3 = bitcast i16 %mask to <16 x i1> 1297 ; load needed to keep the operation from being scheduled above the asm block 1298 %4 = load <16 x float>, ptr %passthru 1299 %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4 1300 ret <16 x float> %5 1301} 1302 1303define <16 x float> @stack_fold_shuff32x4_maskz(<16 x float> %a, <16 x float> %b, i16 %mask) { 1304; CHECK-LABEL: stack_fold_shuff32x4_maskz: 1305; CHECK: # %bb.0: 1306; CHECK-NEXT: pushq %rax 1307; CHECK-NEXT: .cfi_def_cfa_offset 16 1308; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1309; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1310; CHECK-NEXT: #APP 1311; CHECK-NEXT: nop 1312; CHECK-NEXT: #NO_APP 1313; CHECK-NEXT: kmovw %edi, %k1 1314; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 1315; CHECK-NEXT: vshuff32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1316; CHECK-NEXT: # zmm0 {%k1} {z} = 
zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3] 1317; CHECK-NEXT: popq %rax 1318; CHECK-NEXT: .cfi_def_cfa_offset 8 1319; CHECK-NEXT: retq 1320 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1321 %2 = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19> 1322 %3 = bitcast i16 %mask to <16 x i1> 1323 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer 1324 ret <16 x float> %4 1325} 1326 1327define <8 x double> @stack_fold_subpd_zmm(<8 x double> %a0, <8 x double> %a1) { 1328; CHECK-LABEL: stack_fold_subpd_zmm: 1329; CHECK: # %bb.0: 1330; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1331; CHECK-NEXT: #APP 1332; CHECK-NEXT: nop 1333; CHECK-NEXT: #NO_APP 1334; CHECK-NEXT: vsubpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1335; CHECK-NEXT: retq 1336 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1337 %2 = fsub <8 x double> %a0, %a1 1338 ret <8 x double> %2 1339} 1340 1341define <16 x float> @stack_fold_subps_zmm(<16 x float> %a0, <16 x float> %a1) { 1342; CHECK-LABEL: stack_fold_subps_zmm: 1343; CHECK: # %bb.0: 1344; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1345; CHECK-NEXT: #APP 1346; CHECK-NEXT: nop 1347; CHECK-NEXT: #NO_APP 1348; CHECK-NEXT: vsubps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1349; CHECK-NEXT: retq 1350 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1351 %2 = fsub <16 x float> %a0, %a1 1352 ret <16 x float> %2 1353} 1354 1355define double @stack_fold_subsd(double %a0, double %a1) { 1356; CHECK-LABEL: stack_fold_subsd: 1357; CHECK: # %bb.0: 1358; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 1359; CHECK-NEXT: #APP 1360; CHECK-NEXT: nop 1361; CHECK-NEXT: #NO_APP 1362; CHECK-NEXT: vsubsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload 1363; CHECK-NEXT: retq 1364 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1365 %2 = fsub double %a0, %a1 1366 ret double %2 1367} 1368 1369define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) { 1370; CHECK-LABEL: stack_fold_subsd_int: 1371; CHECK: # %bb.0: 1372; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1373; CHECK-NEXT: #APP 1374; CHECK-NEXT: nop 1375; CHECK-NEXT: #NO_APP 1376; CHECK-NEXT: vsubsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 
16-byte Folded Reload 1377; CHECK-NEXT: retq 1378 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1379 %2 = extractelement <2 x double> %a0, i32 0 1380 %3 = extractelement <2 x double> %a1, i32 0 1381 %4 = fsub double %2, %3 1382 %5 = insertelement <2 x double> %a0, double %4, i32 0 1383 ret <2 x double> %5 1384} 1385 1386define float @stack_fold_subss(float %a0, float %a1) { 1387; CHECK-LABEL: stack_fold_subss: 1388; CHECK: # %bb.0: 1389; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1390; CHECK-NEXT: #APP 1391; CHECK-NEXT: nop 1392; CHECK-NEXT: #NO_APP 1393; CHECK-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 1394; CHECK-NEXT: retq 1395 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1396 %2 = fsub float %a0, %a1 1397 ret float %2 1398} 1399 1400define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) { 1401; CHECK-LABEL: stack_fold_subss_int: 1402; CHECK: # %bb.0: 1403; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1404; CHECK-NEXT: #APP 1405; CHECK-NEXT: nop 1406; CHECK-NEXT: #NO_APP 1407; CHECK-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1408; CHECK-NEXT: retq 1409 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1410 %2 = extractelement <4 x float> %a0, i32 0 1411 %3 = extractelement <4 x float> %a1, i32 0 1412 %4 = fsub float %2, %3 1413 %5 = insertelement <4 x float> %a0, float %4, i32 0 1414 ret <4 x float> %5 1415} 1416 1417define <8 x double> @stack_fold_xorpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 { 1418; CHECK-LABEL: stack_fold_xorpd_zmm: 1419; CHECK: # %bb.0: 1420; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1421; CHECK-NEXT: #APP 1422; CHECK-NEXT: nop 1423; CHECK-NEXT: #NO_APP 1424; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1425; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1426; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 1427; CHECK-NEXT: retq 1428 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1429 %2 = bitcast <8 x double> %a0 to <8 x i64> 1430 %3 = bitcast <8 x double> %a1 to <8 x i64> 1431 %4 = xor <8 x i64> %2, %3 1432 %5 = bitcast <8 x i64> %4 to <8 x double> 1433 ; fadd forces execution domain 1434 %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0> 1435 ret <8 x double> %6 1436} 1437 1438define <16 x float> @stack_fold_xorps_zmm(<16 
x float> %a0, <16 x float> %a1) #0 { 1439; CHECK-LABEL: stack_fold_xorps_zmm: 1440; CHECK: # %bb.0: 1441; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1442; CHECK-NEXT: #APP 1443; CHECK-NEXT: nop 1444; CHECK-NEXT: #NO_APP 1445; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1446; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 1447; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 1448; CHECK-NEXT: retq 1449 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1450 %2 = bitcast <16 x float> %a0 to <16 x i32> 1451 %3 = bitcast <16 x float> %a1 to <16 x i32> 1452 %4 = xor <16 x i32> %2, %3 1453 %5 = bitcast <16 x i32> %4 to <16 x float> 1454 ; fadd forces execution domain 1455 %6 = fadd <16 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0> 1456 ret <16 x float> %6 1457} 1458 1459define i32 @stack_fold_extractps(<4 x float> %a0) { 1460; CHECK-LABEL: stack_fold_extractps: 1461; CHECK: # %bb.0: 1462; CHECK-NEXT: pushq %rbp 1463; CHECK-NEXT: .cfi_def_cfa_offset 16 1464; CHECK-NEXT: pushq %r15 1465; CHECK-NEXT: .cfi_def_cfa_offset 24 1466; CHECK-NEXT: pushq %r14 1467; CHECK-NEXT: .cfi_def_cfa_offset 32 1468; CHECK-NEXT: pushq %r13 1469; CHECK-NEXT: .cfi_def_cfa_offset 40 1470; CHECK-NEXT: pushq %r12 1471; CHECK-NEXT: .cfi_def_cfa_offset 48 1472; CHECK-NEXT: pushq %rbx 1473; CHECK-NEXT: .cfi_def_cfa_offset 56 1474; CHECK-NEXT: .cfi_offset %rbx, -56 1475; CHECK-NEXT: .cfi_offset %r12, -48 1476; CHECK-NEXT: .cfi_offset %r13, -40 1477; CHECK-NEXT: .cfi_offset %r14, -32 1478; CHECK-NEXT: .cfi_offset %r15, -24 1479; CHECK-NEXT: .cfi_offset %rbp, -16 1480; CHECK-NEXT: vextractps $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill 1481; CHECK-NEXT: #APP 1482; CHECK-NEXT: nop 1483; CHECK-NEXT: #NO_APP 1484; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload 1485; CHECK-NEXT: popq %rbx 1486; CHECK-NEXT: .cfi_def_cfa_offset 48 1487; CHECK-NEXT: popq %r12 1488; CHECK-NEXT: .cfi_def_cfa_offset 40 1489; CHECK-NEXT: popq %r13 1490; CHECK-NEXT: .cfi_def_cfa_offset 32 1491; CHECK-NEXT: popq %r14 1492; CHECK-NEXT: .cfi_def_cfa_offset 24 1493; CHECK-NEXT: popq %r15 1494; CHECK-NEXT: .cfi_def_cfa_offset 16 1495; CHECK-NEXT: popq %rbp 1496; CHECK-NEXT: .cfi_def_cfa_offset 8 1497; CHECK-NEXT: retq 1498 %1 = extractelement <4 x float> %a0, i32 1 1499 %2 = bitcast float %1 to i32 1500 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 1501 ret i32 %2 1502} 1503 1504define <4 x float> @stack_fold_extracti32x4(<16 x float> %a0) { 1505; CHECK-LABEL: stack_fold_extracti32x4: 1506; CHECK: # %bb.0: 1507; CHECK-NEXT: vextractf32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 1508; CHECK-NEXT: #APP 1509; CHECK-NEXT: nop 1510; CHECK-NEXT: #NO_APP 1511; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1512; CHECK-NEXT: vzeroupper 1513; CHECK-NEXT: retq 1514 %1 = shufflevector <16 x float> %a0, <16 x float> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15> 1515 %2 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1516 ret <4 x float> %1 1517} 1518 1519define <2 x double> @stack_fold_extractf64x2(<8 x double> %a0) { 1520; CHECK-LABEL: stack_fold_extractf64x2: 1521; CHECK: # %bb.0: 1522; CHECK-NEXT: vextractf32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 1523; CHECK-NEXT: #APP 1524; CHECK-NEXT: nop 1525; CHECK-NEXT: #NO_APP 1526; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1527; CHECK-NEXT: vzeroupper 1528; CHECK-NEXT: retq 1529 %1 = shufflevector <8 x double> %a0, <8 x double> undef, <2 x i32> <i32 6, i32 7> 1530 %2 = tail call <2 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1531 ret <2 x double> %1 1532} 1533 1534define <8 x float> @stack_fold_extracti32x8(<16 x float> %a0) { 1535; CHECK-LABEL: stack_fold_extracti32x8: 1536; CHECK: # %bb.0: 1537; CHECK-NEXT: vextractf64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 1538; CHECK-NEXT: #APP 1539; CHECK-NEXT: nop 1540; CHECK-NEXT: #NO_APP 1541; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1542; CHECK-NEXT: retq 1543 %1 = shufflevector <16 x float> %a0, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1544 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1545 ret <8 x float> %1 1546} 1547 1548define <4 x double> @stack_fold_extractf64x4(<8 x double> %a0) { 1549; CHECK-LABEL: stack_fold_extractf64x4: 1550; CHECK: # %bb.0: 1551; CHECK-NEXT: vextractf64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 1552; CHECK-NEXT: #APP 1553; CHECK-NEXT: nop 1554; CHECK-NEXT: #NO_APP 1555; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1556; CHECK-NEXT: retq 1557 %1 = shufflevector <8 x double> %a0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 1558 %2 = tail call <2 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1559 ret <4 x double> %1 1560} 1561 1562define <16 x float> @stack_fold_insertf32x8(<8 x float> %a0, <8 x float> %a1) { 1563; CHECK-LABEL: stack_fold_insertf32x8: 1564; CHECK: # %bb.0: 1565; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1566; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1567; CHECK-NEXT: #APP 1568; CHECK-NEXT: nop 1569; CHECK-NEXT: #NO_APP 1570; CHECK-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload 1571; CHECK-NEXT: retq 1572 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1573 %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1574 ret <16 x float> %2 1575} 1576 1577define <8 x double> @stack_fold_insertf64x4(<4 x double> %a0, <4 x double> %a1) { 1578; CHECK-LABEL: stack_fold_insertf64x4: 1579; CHECK: # %bb.0: 1580; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1581; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1582; CHECK-NEXT: #APP 1583; CHECK-NEXT: nop 1584; CHECK-NEXT: #NO_APP 1585; CHECK-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload 1586; CHECK-NEXT: retq 1587 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1588 %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1589 ret <8 x double> %2 1590} 1591 1592define <8 x double> @stack_fold_insertf64x4_mask(ptr %passthru, <4 x double> %a0, <4 x double> %a1, i8 %mask) { 1593; CHECK-LABEL: stack_fold_insertf64x4_mask: 1594; CHECK: # %bb.0: 1595; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1596; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1597; CHECK-NEXT: #APP 1598; CHECK-NEXT: nop 1599; CHECK-NEXT: #NO_APP 1600; CHECK-NEXT: kmovw %esi, %k1 1601; CHECK-NEXT: vmovapd (%rdi), %zmm2 1602; CHECK-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 32-byte Folded Reload 1603; CHECK-NEXT: vmovapd %zmm2, %zmm0 1604; CHECK-NEXT: retq 1605 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1606 %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1607 %3 = bitcast i8 %mask to <8 x i1> 1608 %4 = load <8 x double>, ptr %passthru 1609 %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4 1610 ret <8 x double> %5 1611} 1612 1613define <8 x double> @stack_fold_insertf64x4_maskz(<4 x double> %a0, <4 x double> %a1, i8 %mask) { 1614; CHECK-LABEL: stack_fold_insertf64x4_maskz: 1615; CHECK: # %bb.0: 1616; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1617; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1618; CHECK-NEXT: #APP 1619; CHECK-NEXT: nop 1620; CHECK-NEXT: #NO_APP 1621; CHECK-NEXT: kmovw %edi, %k1 1622; CHECK-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 32-byte Folded Reload 1623; CHECK-NEXT: retq 1624 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1625 %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1626 %3 = bitcast i8 %mask to <8 x i1> 1627 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer 1628 ret <8 x double> %4 1629} 1630 1631define <16 x float> @stack_fold_vpermt2ps(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) { 1632; CHECK-LABEL: stack_fold_vpermt2ps: 1633; CHECK: # %bb.0: 1634; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1635; CHECK-NEXT: #APP 1636; CHECK-NEXT: nop 1637; CHECK-NEXT: #NO_APP 1638; CHECK-NEXT: vpermt2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 1639; CHECK-NEXT: retq 1640 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1641 %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) 1642 ret <16 x float> %2 1643} 1644declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>) 1645 1646define <16 x float> @stack_fold_vpermi2ps(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2) { 1647; CHECK-LABEL: stack_fold_vpermi2ps: 1648; CHECK: # %bb.0: 1649; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1650; CHECK-NEXT: #APP 1651; CHECK-NEXT: nop 1652; CHECK-NEXT: #NO_APP 1653; CHECK-NEXT: vpermi2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 1654; CHECK-NEXT: retq 1655 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1656 %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2) 1657 ret <16 x float> %2 1658} 1659 1660define <16 x float> @stack_fold_vpermi2ps_mask(<16 x float> %x0, ptr %x1, <16 x float> %x2, i16 %mask) { 1661; CHECK-LABEL: stack_fold_vpermi2ps_mask: 1662; CHECK: # %bb.0: 1663; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1664; CHECK-NEXT: #APP 1665; CHECK-NEXT: nop 1666; CHECK-NEXT: #NO_APP 1667; CHECK-NEXT: vmovaps (%rdi), %zmm2 1668; CHECK-NEXT: kmovw %esi, %k1 1669; CHECK-NEXT: vpermi2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 1670; CHECK-NEXT: vmovaps %zmm2, %zmm0 1671; CHECK-NEXT: retq 1672 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1673 %x1b = load <16 x i32>, ptr %x1 1674 %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1b, <16 x float> %x2) 1675 %3 = bitcast <16 x i32> %x1b to <16 x float> 1676 %4 = bitcast 
i16 %mask to <16 x i1> 1677 %5 = select <16 x i1> %4, <16 x float> %2, <16 x float> %3 1678 ret <16 x float> %5 1679} 1680 1681define <16 x float> @stack_fold_vpermt2ps_mask(ptr %x0, <16 x float> %x1, <16 x float> %x2, i16 %mask) { 1682; CHECK-LABEL: stack_fold_vpermt2ps_mask: 1683; CHECK: # %bb.0: 1684; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1685; CHECK-NEXT: #APP 1686; CHECK-NEXT: nop 1687; CHECK-NEXT: #NO_APP 1688; CHECK-NEXT: vmovaps (%rdi), %zmm1 1689; CHECK-NEXT: kmovw %esi, %k1 1690; CHECK-NEXT: vpermt2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1691; CHECK-NEXT: retq 1692 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1693 %x0b = load <16 x i32>, ptr %x0 1694 %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0b, <16 x float> %x2) 1695 %3 = bitcast i16 %mask to <16 x i1> 1696 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %x1 1697 ret <16 x float> %4 1698} 1699 1700define <16 x float> @stack_fold_vpermt2ps_maskz(ptr %x0, <16 x float> %x1, <16 x float> %x2, i16 %mask) { 1701; CHECK-LABEL: stack_fold_vpermt2ps_maskz: 1702; CHECK: # %bb.0: 1703; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1704; CHECK-NEXT: #APP 1705; CHECK-NEXT: nop 1706; CHECK-NEXT: #NO_APP 1707; CHECK-NEXT: vmovaps (%rdi), %zmm1 1708; CHECK-NEXT: kmovw %esi, %k1 1709; CHECK-NEXT: vpermt2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload 1710; CHECK-NEXT: retq 1711 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1712 %x0b = load <16 x i32>, ptr %x0 1713 %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0b, <16 x float> %x2) 1714 %3 = bitcast i16 %mask to <16 x i1> 1715 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer 1716 ret <16 x float> %4 1717} 1718 1719define <8 x double> @stack_fold_vpermt2pd(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) { 1720; CHECK-LABEL: stack_fold_vpermt2pd: 1721; CHECK: # %bb.0: 1722; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1723; CHECK-NEXT: #APP 1724; CHECK-NEXT: nop 1725; CHECK-NEXT: #NO_APP 1726; CHECK-NEXT: vpermt2pd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 1727; CHECK-NEXT: retq 1728 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1729 %2 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) 1730 %3 = bitcast <8 x i64> %x1 to <8 x double> 1731 ret <8 x double> %2 1732} 1733declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>) 1734 1735define <8 x double> @stack_fold_vpermi2pd(<8 x i64> %x0, <8 x double> %x1, <8 x double> 
%x2) { 1736; CHECK-LABEL: stack_fold_vpermi2pd: 1737; CHECK: # %bb.0: 1738; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1739; CHECK-NEXT: #APP 1740; CHECK-NEXT: nop 1741; CHECK-NEXT: #NO_APP 1742; CHECK-NEXT: vpermi2pd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 1743; CHECK-NEXT: retq 1744 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1745 %2 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x2) 1746 ret <8 x double> %2 1747} 1748 1749define <8 x double> @stack_fold_permpd(<8 x double> %a0) { 1750; CHECK-LABEL: stack_fold_permpd: 1751; CHECK: # %bb.0: 1752; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1753; CHECK-NEXT: #APP 1754; CHECK-NEXT: nop 1755; CHECK-NEXT: #NO_APP 1756; CHECK-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 1757; CHECK-NEXT: # zmm0 = mem[3,2,2,3,7,6,6,7] 1758; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1759; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 1760; CHECK-NEXT: retq 1761 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1762 %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7> 1763 ; fadd forces execution domain 1764 %3 = fadd <8 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0> 1765 ret <8 x double> %3 1766} 1767 1768define <8 x double> @stack_fold_permpd_mask(ptr %passthru, <8 x double> %a0, i8 %mask) { 1769; CHECK-LABEL: stack_fold_permpd_mask: 1770; CHECK: # %bb.0: 1771; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1772; CHECK-NEXT: #APP 1773; CHECK-NEXT: nop 1774; CHECK-NEXT: #NO_APP 1775; CHECK-NEXT: kmovw %esi, %k1 1776; CHECK-NEXT: vmovapd (%rdi), %zmm0 1777; CHECK-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload 1778; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,2,3,7,6,6,7] 1779; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1780; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 1781; CHECK-NEXT: retq 1782 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1783 %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7> 1784 %3 = bitcast i8 %mask to <8 x i1> 1785 ; load needed to keep the operation from being scheduled above the asm block 1786 %4 = load <8 x double>, ptr %passthru 1787 %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4 1788 ; fadd forces execution domain 1789 %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0> 1790 ret <8 x double> %6 1791} 1792 1793define <8 x double> 
@stack_fold_permpd_maskz(<8 x double> %a0, i8 %mask) { 1794; CHECK-LABEL: stack_fold_permpd_maskz: 1795; CHECK: # %bb.0: 1796; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1797; CHECK-NEXT: #APP 1798; CHECK-NEXT: nop 1799; CHECK-NEXT: #NO_APP 1800; CHECK-NEXT: kmovw %edi, %k1 1801; CHECK-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload 1802; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,2,3,7,6,6,7] 1803; CHECK-NEXT: retq 1804 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1805 %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7> 1806 %3 = bitcast i8 %mask to <8 x i1> 1807 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer 1808 ret <8 x double> %4 1809} 1810 1811define <8 x double> @stack_fold_permpdvar(<8 x i64> %a0, <8 x double> %a1) { 1812; CHECK-LABEL: stack_fold_permpdvar: 1813; CHECK: # %bb.0: 1814; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1815; CHECK-NEXT: #APP 1816; CHECK-NEXT: nop 1817; CHECK-NEXT: #NO_APP 1818; CHECK-NEXT: vpermpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1819; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1820; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 1821; CHECK-NEXT: retq 1822 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1823 %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a1, <8 x i64> %a0) 1824 ; fadd forces execution domain 1825 %3 = fadd <8 x double> %2, zeroinitializer 1826 ret <8 x double> %3 1827} 1828declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) nounwind readonly 1829 1830define <16 x float> @stack_fold_permps(<16 x i32> %a0, <16 x float> %a1) { 1831; CHECK-LABEL: stack_fold_permps: 1832; CHECK: # %bb.0: 1833; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1834; CHECK-NEXT: #APP 1835; CHECK-NEXT: nop 1836; CHECK-NEXT: #NO_APP 1837; CHECK-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1838; CHECK-NEXT: retq 1839 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1840 %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a1, <16 x i32> %a0) 1841 ret <16 x float> %2 1842} 1843declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) nounwind readonly 1844 1845define <8 x double> @stack_fold_permilpd_zmm(<8 x double> %a0) { 1846; CHECK-LABEL: stack_fold_permilpd_zmm: 1847; CHECK: # %bb.0: 1848; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1849; CHECK-NEXT: #APP 1850; CHECK-NEXT: nop 1851; CHECK-NEXT: #NO_APP 1852; CHECK-NEXT: vpermilpd $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded 
Reload 1853; CHECK-NEXT: # zmm0 = mem[1,0,3,2,5,4,7,6] 1854; CHECK-NEXT: retq 1855 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1856 %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> 1857 ret <8 x double> %2 1858} 1859 1860define <8 x double> @stack_fold_permilpd_zmm_mask(ptr %passthru, <8 x double> %a0, i8 %mask) { 1861; CHECK-LABEL: stack_fold_permilpd_zmm_mask: 1862; CHECK: # %bb.0: 1863; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1864; CHECK-NEXT: #APP 1865; CHECK-NEXT: nop 1866; CHECK-NEXT: #NO_APP 1867; CHECK-NEXT: kmovw %esi, %k1 1868; CHECK-NEXT: vmovapd (%rdi), %zmm1 1869; CHECK-NEXT: vpermilpd $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload 1870; CHECK-NEXT: # zmm1 {%k1} = mem[1,0,3,2,5,4,7,6] 1871; CHECK-NEXT: vmovapd %zmm1, %zmm0 1872; CHECK-NEXT: retq 1873 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1874 %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> 1875 %3 = bitcast i8 %mask to <8 x i1> 1876 ; load needed to keep the operation from being scheduled above the asm block 1877 %4 = load <8 x double>, ptr %passthru 1878 %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4 1879 ret <8 x double> %5 1880} 1881 1882define <8 x double> @stack_fold_permilpd_zmm_maskz(<8 x double> %a0, i8 %mask) { 1883; CHECK-LABEL: stack_fold_permilpd_zmm_maskz: 1884; CHECK: # %bb.0: 1885; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1886; CHECK-NEXT: #APP 1887; CHECK-NEXT: nop 1888; CHECK-NEXT: #NO_APP 1889; CHECK-NEXT: kmovw %edi, %k1 1890; CHECK-NEXT: vpermilpd $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload 1891; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,0,3,2,5,4,7,6] 1892; CHECK-NEXT: retq 1893 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1894 %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> 1895 %3 = bitcast i8 %mask to <8 x i1> 1896 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer 1897 ret <8 x double> %4 1898} 1899 1900define <8 x double> @stack_fold_permilpdvar_zmm(<8 x double> %a0, <8 x i64> %a1) { 1901; CHECK-LABEL: stack_fold_permilpdvar_zmm: 1902; CHECK: # %bb.0: 1903; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1904; CHECK-NEXT: #APP 1905; CHECK-NEXT: nop 1906; CHECK-NEXT: #NO_APP 1907; CHECK-NEXT: vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1908; CHECK-NEXT: retq 1909 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1910 %2 = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %a1) 1911 ret <8 x double> %2 1912} 1913declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>) nounwind readnone 1914 1915define <8 x double> @stack_fold_permilpdvar_zmm_mask(ptr %passthru, <8 x double> %a0, <8 x i64> %a1, i8 %mask) { 1916; CHECK-LABEL: stack_fold_permilpdvar_zmm_mask: 1917; CHECK: # %bb.0: 1918; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1919; CHECK-NEXT: #APP 1920; CHECK-NEXT: nop 1921; CHECK-NEXT: #NO_APP 1922; CHECK-NEXT: kmovw %esi, %k1 1923; CHECK-NEXT: vmovapd (%rdi), %zmm2 1924; CHECK-NEXT: vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 1925; CHECK-NEXT: vmovapd %zmm2, %zmm0 1926; CHECK-NEXT: retq 1927 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1928 %2 = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %a1) 1929 %3 = bitcast i8 %mask to <8 x i1> 1930 ; load needed to keep the operation from being scheduled above the asm block 1931 %4 = load <8 x double>, ptr %passthru 1932 %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4 1933 ret <8 x double> %5 1934} 1935 1936define <8 x double> @stack_fold_permilpdvar_zmm_maskz(<8 x double> %a0, <8 x i64> %a1, i8 %mask) { 1937; CHECK-LABEL: stack_fold_permilpdvar_zmm_maskz: 1938; CHECK: # %bb.0: 1939; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1940; CHECK-NEXT: #APP 1941; CHECK-NEXT: nop 1942; CHECK-NEXT: #NO_APP 1943; CHECK-NEXT: kmovw %edi, %k1 1944; CHECK-NEXT: vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1945; CHECK-NEXT: retq 1946 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1947 %2 = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %a1) 1948 %3 = bitcast i8 %mask to <8 x i1> 1949 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer 1950 ret <8 x double> %4 1951} 1952 1953define <16 x float> @stack_fold_permilps_zmm(<16 x float> %a0) { 1954; CHECK-LABEL: stack_fold_permilps_zmm: 1955; CHECK: # %bb.0: 1956; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1957; CHECK-NEXT: #APP 1958; CHECK-NEXT: nop 1959; CHECK-NEXT: #NO_APP 1960; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 1961; CHECK-NEXT: # zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1962; CHECK-NEXT: retq 1963 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1964 %2 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> 1965 ret <16 x float> %2 1966} 1967 1968define <16 x float> @stack_fold_permilps_zmm_mask(ptr %passthru, <16 x float> %a0, i16 %mask) { 1969; CHECK-LABEL: stack_fold_permilps_zmm_mask: 1970; CHECK: # %bb.0: 1971; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1972; CHECK-NEXT: #APP 1973; CHECK-NEXT: nop 1974; CHECK-NEXT: #NO_APP 1975; CHECK-NEXT: kmovw %esi, %k1 1976; CHECK-NEXT: vmovaps (%rdi), %zmm1 1977; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload 1978; CHECK-NEXT: # zmm1 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1979; CHECK-NEXT: vmovaps %zmm1, %zmm0 1980; CHECK-NEXT: retq 1981 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1982 %2 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> 1983 %3 = bitcast i16 %mask to <16 x i1> 1984 ; load needed to keep the operation from being scheduled above the asm block 1985 %4 = load <16 x float>, ptr %passthru 1986 %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4 1987 ret <16 x float> %5 1988} 1989 1990define <16 x float> @stack_fold_permilps_zmm_maskz(<16 x float> %a0, i16 %mask) { 1991; CHECK-LABEL: stack_fold_permilps_zmm_maskz: 1992; CHECK: # %bb.0: 1993; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1994; CHECK-NEXT: #APP 1995; CHECK-NEXT: nop 1996; CHECK-NEXT: #NO_APP 1997; CHECK-NEXT: kmovw %edi, %k1 1998; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload 1999; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2000; CHECK-NEXT: retq 2001 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2002 %2 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> 2003 %3 = bitcast i16 %mask to <16 x i1> 2004 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer 2005 ret <16 x float> %4 2006} 2007 2008define <16 x float> @stack_fold_permilpsvar_zmm(<16 x float> %a0, <16 x i32> %a1) { 2009; CHECK-LABEL: stack_fold_permilpsvar_zmm: 2010; CHECK: # %bb.0: 2011; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2012; CHECK-NEXT: #APP 2013; CHECK-NEXT: nop 2014; CHECK-NEXT: #NO_APP 2015; CHECK-NEXT: vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 2016; 
CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1)
  ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>) nounwind readnone

define <16 x float> @stack_fold_permilpsvar_zmm_mask(ptr %passthru, <16 x float> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_permilpsvar_zmm_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1)
  %3 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <16 x float>, ptr %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_permilpsvar_zmm_maskz(<16 x float> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_permilpsvar_zmm_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

attributes #0 = { "unsafe-fp-math"="false" }
attributes #1 = { "unsafe-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }