1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq,+avx512vbmi,+avx512cd,+avx512vpopcntdq < %s | FileCheck %s 3 4target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" 5target triple = "x86_64-unknown-unknown" 6 7; Stack reload folding tests. 8; 9; By including a nop call with sideeffects we can force a partial register spill of the 10; relevant registers and check that the reload is correctly folded into the instruction. 11 12define <8 x i32> @stack_fold_valignd_ymm(<8 x i32> %a, <8 x i32> %b) { 13; CHECK-LABEL: stack_fold_valignd_ymm: 14; CHECK: # %bb.0: 15; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17; CHECK-NEXT: #APP 18; CHECK-NEXT: nop 19; CHECK-NEXT: #NO_APP 20; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 21; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 22; CHECK-NEXT: # ymm0 = mem[1,2,3,4,5,6,7],ymm0[0] 23; CHECK-NEXT: retq 24 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 25 %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 26 ret <8 x i32> %2 27} 28 29define <8 x i32> @stack_fold_valignd_ymm_mask(<8 x i32> %a, <8 x i32> %b, ptr %passthru, i8 %mask) { 30; CHECK-LABEL: stack_fold_valignd_ymm_mask: 31; CHECK: # %bb.0: 32; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 33; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 34; CHECK-NEXT: #APP 35; CHECK-NEXT: nop 36; CHECK-NEXT: #NO_APP 37; CHECK-NEXT: kmovd %esi, %k1 38; CHECK-NEXT: vmovdqa (%rdi), %ymm1 39; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 40; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 {%k1} # 32-byte Folded Reload 41; CHECK-NEXT: # ymm1 {%k1} = mem[1,2,3,4,5,6,7],ymm0[0] 42; CHECK-NEXT: vmovdqa %ymm1, %ymm0 43; CHECK-NEXT: retq 44 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 45 %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 46 %3 = bitcast i8 %mask to <8 x i1> 47 %4 = load <8 x i32>, ptr %passthru 48 %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %4 49 ret <8 x i32> %5 50} 51 52define <8 x i32> @stack_fold_valignd_ymm_maskz(<8 x i32> %a, <8 x i32> %b, i8 %mask) { 53; CHECK-LABEL: stack_fold_valignd_ymm_maskz: 54; CHECK: # %bb.0: 55; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 56; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 57; CHECK-NEXT: #APP 58; CHECK-NEXT: nop 59; CHECK-NEXT: #NO_APP 60; CHECK-NEXT: kmovd %edi, %k1 61; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 62; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 63; 
CHECK-NEXT: # ymm0 {%k1} {z} = mem[1,2,3,4,5,6,7],ymm0[0] 64; CHECK-NEXT: retq 65 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 66 %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 67 %3 = bitcast i8 %mask to <8 x i1> 68 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer 69 ret <8 x i32> %4 70} 71 72define <4 x i64> @stack_fold_valignq_ymm(<4 x i64> %a, <4 x i64> %b) { 73; CHECK-LABEL: stack_fold_valignq_ymm: 74; CHECK: # %bb.0: 75; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 76; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 77; CHECK-NEXT: #APP 78; CHECK-NEXT: nop 79; CHECK-NEXT: #NO_APP 80; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 81; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 82; CHECK-NEXT: # ymm0 = mem[1,2,3],ymm0[0] 83; CHECK-NEXT: retq 84 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 85 %2 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4> 86 ret <4 x i64> %2 87} 88 89define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) { 90; CHECK-LABEL: stack_fold_pavgb: 91; CHECK: # %bb.0: 92; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 93; CHECK-NEXT: #APP 94; CHECK-NEXT: nop 95; CHECK-NEXT: #NO_APP 96; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 97; CHECK-NEXT: retq 98 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 99 %2 = zext <16 x i8> %a0 to <16 x i16> 100 %3 = zext <16 x i8> %a1 to <16 x i16> 101 %4 = add <16 x i16> %2, %3 102 %5 = add <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 103 %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 104 %7 = trunc <16 x i16> %6 to <16 x i8> 105 ret <16 x i8> %7 106} 107 108define <32 x i8> @stack_fold_pavgb_ymm(<32 x i8> %a0, <32 x i8> %a1) { 109; CHECK-LABEL: stack_fold_pavgb_ymm: 110; CHECK: # %bb.0: 111; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 112; CHECK-NEXT: #APP 113; CHECK-NEXT: nop 114; CHECK-NEXT: #NO_APP 115; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 116; CHECK-NEXT: retq 117 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 118 %2 = zext <32 x i8> %a0 to <32 x i16> 119 %3 = zext <32 x i8> %a1 to <32 x i16> 120 %4 = add <32 x i16> %2, %3 121 %5 = add <32 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 122 %6 = lshr <32 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 123 %7 = trunc <32 x i16> %6 to <32 x i8> 124 ret <32 x i8> %7 125} 126 127define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) { 128; CHECK-LABEL: stack_fold_pavgw: 129; CHECK: # %bb.0: 130; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 131; CHECK-NEXT: #APP 132; CHECK-NEXT: nop 133; CHECK-NEXT: #NO_APP 134; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 135; CHECK-NEXT: retq 136 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 137 %2 = zext <8 x i16> %a0 to <8 x i32> 138 %3 = zext <8 x i16> %a1 to <8 x i32> 139 %4 = add <8 x i32> %2, %3 140 %5 = add <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 141 %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 142 %7 = trunc <8 x i32> %6 to <8 x i16> 143 ret <8 x i16> %7 144} 145 146define <16 x i16> @stack_fold_pavgw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 147; CHECK-LABEL: stack_fold_pavgw_ymm: 148; CHECK: # %bb.0: 149; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 150; CHECK-NEXT: #APP 151; CHECK-NEXT: nop 152; CHECK-NEXT: #NO_APP 153; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 154; CHECK-NEXT: retq 155 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 156 %2 = zext <16 x i16> %a0 to <16 x i32> 157 %3 = zext <16 x i16> %a1 to <16 x i32> 158 %4 = add <16 x i32> %2, %3 159 %5 = add <16 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 160 %6 = lshr <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 161 %7 = trunc <16 x i32> %6 to <16 x i16> 162 ret <16 x i16> %7 163} 164 165define <4 x i32> @stack_fold_vpconflictd(<4 x i32> %a0) { 166; CHECK-LABEL: stack_fold_vpconflictd: 167; CHECK: # %bb.0: 168; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 169; CHECK-NEXT: #APP 170; CHECK-NEXT: nop 171; CHECK-NEXT: #NO_APP 172; CHECK-NEXT: vpconflictd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Folded Reload 173; CHECK-NEXT: retq 174 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 175 %2 = call <4 x i32> @llvm.x86.avx512.conflict.d.128(<4 x i32> %a0) 176 ret <4 x i32> %2 177} 178 179define <8 x i32> @stack_fold_vpconflictd_ymm(<8 x i32> %a0) { 180; CHECK-LABEL: stack_fold_vpconflictd_ymm: 181; CHECK: # %bb.0: 182; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 183; CHECK-NEXT: #APP 184; CHECK-NEXT: nop 185; CHECK-NEXT: #NO_APP 186; CHECK-NEXT: vpconflictd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 187; CHECK-NEXT: retq 188 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 189 %2 = call <8 x i32> @llvm.x86.avx512.conflict.d.256(<8 x i32> %a0) 190 ret <8 x i32> %2 191} 192 193define <2 x i64> @stack_fold_vpconflictq(<2 x i64> %a0) { 194; CHECK-LABEL: stack_fold_vpconflictq: 195; CHECK: # %bb.0: 196; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 197; CHECK-NEXT: #APP 198; CHECK-NEXT: nop 199; CHECK-NEXT: #NO_APP 200; CHECK-NEXT: vpconflictq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 201; CHECK-NEXT: retq 202 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 203 %2 = call <2 x i64> @llvm.x86.avx512.conflict.q.128(<2 x i64> %a0) 204 ret <2 x i64> %2 205} 206 207define <4 x i64> @stack_fold_vpconflictq_ymm(<4 x i64> %a0) { 208; CHECK-LABEL: stack_fold_vpconflictq_ymm: 209; CHECK: # %bb.0: 210; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 211; CHECK-NEXT: #APP 212; CHECK-NEXT: nop 213; CHECK-NEXT: #NO_APP 214; CHECK-NEXT: vpconflictq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 215; CHECK-NEXT: retq 216 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 217 %2 = call <4 x i64> @llvm.x86.avx512.conflict.q.256(<4 x i64> %a0) 218 ret <4 x i64> %2 219} 220 221define <4 x i32> @stack_fold_extracti32x4(<8 x i16> %a0, <8 x i32> %a1) { 222; CHECK-LABEL: stack_fold_extracti32x4: 223; CHECK: # %bb.0: 224; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 225; CHECK-NEXT: vextracti128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 226; CHECK-NEXT: #APP 227; CHECK-NEXT: nop 228; CHECK-NEXT: #NO_APP 229; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 230; CHECK-NEXT: vzeroupper 231; CHECK-NEXT: retq 232 ; zext forces execution domain 233 %1 = zext <8 x 
i16> %a0 to <8 x i32> 234 %2 = shufflevector <8 x i32> %1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 235 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 236 ret <4 x i32> %2 237} 238 239define <2 x i64> @stack_fold_extracti64x2(<4 x i32> %a0, <4 x i64> %a1) { 240; CHECK-LABEL: stack_fold_extracti64x2: 241; CHECK: # %bb.0: 242; CHECK-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 243; CHECK-NEXT: vextracti128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 244; CHECK-NEXT: #APP 245; CHECK-NEXT: nop 246; CHECK-NEXT: #NO_APP 247; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 248; CHECK-NEXT: vzeroupper 249; CHECK-NEXT: retq 250 ; zext forces execution domain 251 %1 = zext <4 x i32> %a0 to <4 x i64> 252 %2 = shufflevector <4 x i64> %1, <4 x i64> %a1, <2 x i32> <i32 2, i32 3> 253 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 254 ret <2 x i64> %2 255} 256 257define <8 x i32> @stack_fold_inserti32x4(<4 x i32> %a0, <4 x i32> %a1) { 258; CHECK-LABEL: stack_fold_inserti32x4: 259; CHECK: # %bb.0: 260; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 261; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 262; CHECK-NEXT: #APP 263; CHECK-NEXT: nop 264; CHECK-NEXT: #NO_APP 265; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 266; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 267; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0 268; CHECK-NEXT: retq 269 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 270 %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 271 ; add forces execution domain 272 %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 273 ret <8 x i32> %3 274} 275 276define <4 x i64> @stack_fold_inserti64x2(<2 x i64> %a0, <2 x i64> %a1) { 277; CHECK-LABEL: stack_fold_inserti64x2: 278; CHECK: # %bb.0: 279; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 280; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 281; CHECK-NEXT: #APP 282; CHECK-NEXT: nop 283; CHECK-NEXT: #NO_APP 284; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 285; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 286; CHECK-NEXT: vpsubq %ymm1, %ymm0, %ymm0 287; CHECK-NEXT: retq 288 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 289 %2 = 
shufflevector <2 x i64> %a0, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 290 ; add forces execution domain 291 %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1> 292 ret <4 x i64> %3 293} 294 295define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) { 296; CHECK-LABEL: stack_fold_pabsb: 297; CHECK: # %bb.0: 298; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 299; CHECK-NEXT: #APP 300; CHECK-NEXT: nop 301; CHECK-NEXT: #NO_APP 302; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 303; CHECK-NEXT: retq 304 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 305 %2 = icmp sgt <16 x i8> %a0, zeroinitializer 306 %3 = sub <16 x i8> zeroinitializer, %a0 307 %4 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %3 308 ret <16 x i8> %4 309} 310 311define <32 x i8> @stack_fold_pabsb_ymm(<32 x i8> %a0) { 312; CHECK-LABEL: stack_fold_pabsb_ymm: 313; CHECK: # %bb.0: 314; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 315; CHECK-NEXT: #APP 316; CHECK-NEXT: nop 317; CHECK-NEXT: #NO_APP 318; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 319; CHECK-NEXT: retq 320 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 321 %2 = icmp sgt <32 x i8> %a0, zeroinitializer 322 %3 = sub <32 x i8> zeroinitializer, %a0 323 %4 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %3 324 ret <32 x i8> %4 325} 326 327define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) { 328; CHECK-LABEL: stack_fold_pabsd: 329; CHECK: # %bb.0: 330; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 331; CHECK-NEXT: #APP 332; CHECK-NEXT: nop 333; CHECK-NEXT: #NO_APP 334; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 335; CHECK-NEXT: retq 336 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 337 %2 = icmp sgt <4 x i32> %a0, zeroinitializer 338 %3 = sub <4 x i32> zeroinitializer, %a0 339 %4 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %3 340 ret <4 x i32> %4 341} 342 343define <8 x i32> @stack_fold_pabsd_ymm(<8 x i32> %a0) { 344; CHECK-LABEL: stack_fold_pabsd_ymm: 345; CHECK: # %bb.0: 346; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 347; CHECK-NEXT: #APP 348; CHECK-NEXT: nop 349; CHECK-NEXT: #NO_APP 350; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 351; CHECK-NEXT: retq 352 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 353 %2 = icmp sgt <8 x 
i32> %a0, zeroinitializer 354 %3 = sub <8 x i32> zeroinitializer, %a0 355 %4 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %3 356 ret <8 x i32> %4 357} 358 359define <2 x i64> @stack_fold_pabsq(<2 x i64> %a0) { 360; CHECK-LABEL: stack_fold_pabsq: 361; CHECK: # %bb.0: 362; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 363; CHECK-NEXT: #APP 364; CHECK-NEXT: nop 365; CHECK-NEXT: #NO_APP 366; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 367; CHECK-NEXT: retq 368 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 369 %2 = icmp sgt <2 x i64> %a0, zeroinitializer 370 %3 = sub <2 x i64> zeroinitializer, %a0 371 %4 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %3 372 ret <2 x i64> %4 373} 374 375define <4 x i64> @stack_fold_pabsq_ymm(<4 x i64> %a0) { 376; CHECK-LABEL: stack_fold_pabsq_ymm: 377; CHECK: # %bb.0: 378; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 379; CHECK-NEXT: #APP 380; CHECK-NEXT: nop 381; CHECK-NEXT: #NO_APP 382; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 383; CHECK-NEXT: retq 384 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 385 %2 = icmp sgt <4 x i64> %a0, zeroinitializer 386 %3 = sub <4 x i64> zeroinitializer, %a0 387 %4 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %3 388 ret <4 x i64> %4 389} 390 391define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) { 392; CHECK-LABEL: stack_fold_pabsw: 393; CHECK: # %bb.0: 394; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 395; CHECK-NEXT: #APP 396; CHECK-NEXT: nop 397; CHECK-NEXT: #NO_APP 398; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 399; CHECK-NEXT: retq 400 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 401 %2 = icmp sgt <8 x i16> %a0, zeroinitializer 402 %3 = sub <8 x i16> zeroinitializer, %a0 403 %4 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %3 404 ret <8 x i16> %4 405} 406 407define <16 x i16> @stack_fold_pabsw_ymm(<16 x i16> %a0) { 408; CHECK-LABEL: stack_fold_pabsw_ymm: 409; CHECK: # %bb.0: 410; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 411; CHECK-NEXT: #APP 412; CHECK-NEXT: nop 413; CHECK-NEXT: #NO_APP 414; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 415; CHECK-NEXT: retq 416 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 417 %2 = icmp sgt <16 x i16> %a0, zeroinitializer 418 %3 = sub <16 x 
i16> zeroinitializer, %a0 419 %4 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %3 420 ret <16 x i16> %4 421} 422 423define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) { 424; CHECK-LABEL: stack_fold_packssdw: 425; CHECK: # %bb.0: 426; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 427; CHECK-NEXT: #APP 428; CHECK-NEXT: nop 429; CHECK-NEXT: #NO_APP 430; CHECK-NEXT: vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 431; CHECK-NEXT: retq 432 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 433 %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) 434 ret <8 x i16> %2 435} 436declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone 437 438define <16 x i16> @stack_fold_packssdw_ymm(<8 x i32> %a0, <8 x i32> %a1) { 439; CHECK-LABEL: stack_fold_packssdw_ymm: 440; CHECK: # %bb.0: 441; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 442; CHECK-NEXT: #APP 443; CHECK-NEXT: nop 444; CHECK-NEXT: #NO_APP 445; CHECK-NEXT: vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 446; CHECK-NEXT: retq 447 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 448 %2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1) 449 ret <16 x i16> %2 450} 451declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone 452 453define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) { 454; CHECK-LABEL: stack_fold_packsswb: 455; CHECK: # %bb.0: 456; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 457; CHECK-NEXT: #APP 458; CHECK-NEXT: nop 459; CHECK-NEXT: #NO_APP 460; CHECK-NEXT: vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 461; CHECK-NEXT: retq 462 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 463 %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) 464 ret <16 x i8> %2 465} 466declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone 467 468define <32 x i8> @stack_fold_packsswb_ymm(<16 x i16> %a0, <16 x i16> %a1) { 469; CHECK-LABEL: stack_fold_packsswb_ymm: 470; CHECK: # %bb.0: 471; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 472; CHECK-NEXT: #APP 473; CHECK-NEXT: nop 474; CHECK-NEXT: #NO_APP 475; CHECK-NEXT: vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 476; CHECK-NEXT: retq 477 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 478 %2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1) 479 ret <32 x i8> %2 480} 481declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone 482 483define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) { 484; CHECK-LABEL: stack_fold_packusdw: 485; CHECK: # %bb.0: 486; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 487; CHECK-NEXT: #APP 488; CHECK-NEXT: nop 489; CHECK-NEXT: #NO_APP 490; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 491; CHECK-NEXT: retq 492 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 493 %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) 494 ret <8 x i16> %2 495} 496declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone 497 498define <16 x i16> @stack_fold_packusdw_ymm(<8 x i32> %a0, <8 x i32> %a1) { 499; CHECK-LABEL: stack_fold_packusdw_ymm: 500; CHECK: # %bb.0: 501; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 502; CHECK-NEXT: #APP 503; CHECK-NEXT: nop 504; CHECK-NEXT: #NO_APP 505; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 506; CHECK-NEXT: retq 507 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 508 %2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) 509 ret <16 x i16> %2 510} 511declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone 512 513define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) { 514; CHECK-LABEL: stack_fold_packuswb: 515; CHECK: # %bb.0: 516; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 517; CHECK-NEXT: #APP 518; CHECK-NEXT: nop 519; CHECK-NEXT: #NO_APP 520; CHECK-NEXT: vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 521; CHECK-NEXT: retq 522 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 523 %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) 524 ret <16 x i8> %2 525} 526declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone 527 528define <32 x i8> @stack_fold_packuswb_ymm(<16 x i16> %a0, <16 x i16> %a1) { 529; CHECK-LABEL: stack_fold_packuswb_ymm: 530; CHECK: # %bb.0: 531; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 532; CHECK-NEXT: #APP 533; CHECK-NEXT: nop 534; CHECK-NEXT: #NO_APP 535; CHECK-NEXT: vpackuswb 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 536; CHECK-NEXT: retq 537 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 538 %2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1) 539 ret <32 x i8> %2 540} 541declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone 542 543define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) { 544; CHECK-LABEL: stack_fold_paddb: 545; CHECK: # %bb.0: 546; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 547; CHECK-NEXT: #APP 548; CHECK-NEXT: nop 549; CHECK-NEXT: #NO_APP 550; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 551; CHECK-NEXT: retq 552 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 553 %2 = add <16 x i8> %a0, %a1 554 ret <16 x i8> %2 555} 556 557define <16 x i8> @stack_fold_paddb_mask(<16 x i8> %a0, <16 x i8> %a1, ptr %a2, i16 %mask) { 558; CHECK-LABEL: stack_fold_paddb_mask: 559; CHECK: # %bb.0: 560; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 561; CHECK-NEXT: #APP 562; CHECK-NEXT: nop 563; CHECK-NEXT: #NO_APP 564; CHECK-NEXT: kmovd %esi, %k1 565; CHECK-NEXT: vmovdqa (%rdi), %xmm2 566; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 567; CHECK-NEXT: vmovdqa %xmm2, %xmm0 568; CHECK-NEXT: retq 569 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 570 %2 = add <16 x i8> %a0, %a1 571 %3 = bitcast i16 %mask to <16 x i1> 572 ; load needed to keep the operation from being scheduled about the asm block 573 %4 = load <16 x i8>, ptr %a2 574 %5 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %4 575 ret <16 x i8> %5 576} 577 578define <16 x i8> @stack_fold_paddb_maskz(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) { 579; CHECK-LABEL: stack_fold_paddb_maskz: 580; CHECK: # %bb.0: 581; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 582; CHECK-NEXT: #APP 583; CHECK-NEXT: nop 584; CHECK-NEXT: #NO_APP 585; CHECK-NEXT: kmovd %edi, %k1 586; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload 587; CHECK-NEXT: retq 588 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 589 %2 = add <16 x i8> %a0, %a1 590 %3 = bitcast i16 %mask to <16 x i1> 591 %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> zeroinitializer 592 ret <16 x i8> %4 593} 594 595define <32 x i8> @stack_fold_paddb_ymm(<32 x i8> %a0, <32 x i8> %a1) { 596; CHECK-LABEL: 
stack_fold_paddb_ymm: 597; CHECK: # %bb.0: 598; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 599; CHECK-NEXT: #APP 600; CHECK-NEXT: nop 601; CHECK-NEXT: #NO_APP 602; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 603; CHECK-NEXT: retq 604 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 605 %2 = add <32 x i8> %a0, %a1 606 ret <32 x i8> %2 607} 608 609define <32 x i8> @stack_fold_paddb_mask_ymm(<32 x i8> %a0, <32 x i8> %a1, ptr %a2, i32 %mask) { 610; CHECK-LABEL: stack_fold_paddb_mask_ymm: 611; CHECK: # %bb.0: 612; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 613; CHECK-NEXT: #APP 614; CHECK-NEXT: nop 615; CHECK-NEXT: #NO_APP 616; CHECK-NEXT: kmovd %esi, %k1 617; CHECK-NEXT: vmovdqa (%rdi), %ymm2 618; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 619; CHECK-NEXT: vmovdqa %ymm2, %ymm0 620; CHECK-NEXT: retq 621 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 622 %2 = add <32 x i8> %a0, %a1 623 %3 = bitcast i32 %mask to <32 x i1> 624 ; load needed to keep the operation from being scheduled about the asm block 625 %4 = load <32 x i8>, ptr %a2 626 %5 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %4 627 ret <32 x i8> %5 628} 629 630define <32 x i8> @stack_fold_paddb_maskz_ymm(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { 631; CHECK-LABEL: stack_fold_paddb_maskz_ymm: 632; CHECK: # %bb.0: 633; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 634; CHECK-NEXT: #APP 635; CHECK-NEXT: nop 636; CHECK-NEXT: #NO_APP 637; CHECK-NEXT: kmovd %edi, %k1 638; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 639; CHECK-NEXT: retq 640 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 641 %2 = add <32 x i8> %a0, %a1 642 %3 = bitcast i32 %mask to <32 x i1> 643 %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer 644 ret <32 x i8> %4 645} 646 647define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) { 648; CHECK-LABEL: stack_fold_paddd: 649; CHECK: # %bb.0: 650; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 651; CHECK-NEXT: #APP 652; CHECK-NEXT: nop 653; CHECK-NEXT: #NO_APP 654; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 655; CHECK-NEXT: retq 656 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 657 %2 = add <4 x i32> %a0, %a1 658 ret <4 x i32> %2 659} 660 661define <8 x 
i32> @stack_fold_paddd_ymm(<8 x i32> %a0, <8 x i32> %a1) { 662; CHECK-LABEL: stack_fold_paddd_ymm: 663; CHECK: # %bb.0: 664; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 665; CHECK-NEXT: #APP 666; CHECK-NEXT: nop 667; CHECK-NEXT: #NO_APP 668; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 669; CHECK-NEXT: retq 670 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 671 %2 = add <8 x i32> %a0, %a1 672 ret <8 x i32> %2 673} 674 675define <2 x i64> @stack_fold_paddq(<2 x i64> %a0, <2 x i64> %a1) { 676; CHECK-LABEL: stack_fold_paddq: 677; CHECK: # %bb.0: 678; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 679; CHECK-NEXT: #APP 680; CHECK-NEXT: nop 681; CHECK-NEXT: #NO_APP 682; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 683; CHECK-NEXT: retq 684 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 685 %2 = add <2 x i64> %a0, %a1 686 ret <2 x i64> %2 687} 688 689define <4 x i64> @stack_fold_paddq_ymm(<4 x i64> %a0, <4 x i64> %a1) { 690; CHECK-LABEL: stack_fold_paddq_ymm: 691; CHECK: # %bb.0: 692; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 693; CHECK-NEXT: #APP 694; CHECK-NEXT: nop 695; CHECK-NEXT: #NO_APP 696; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 697; CHECK-NEXT: retq 698 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 699 %2 = add <4 x i64> %a0, %a1 700 ret <4 x i64> %2 701} 702 703define <16 x i8> @stack_fold_paddsb(<16 x i8> %a0, <16 x i8> %a1) { 704; CHECK-LABEL: stack_fold_paddsb: 705; CHECK: # %bb.0: 706; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 707; CHECK-NEXT: #APP 708; CHECK-NEXT: nop 709; CHECK-NEXT: #NO_APP 710; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 711; CHECK-NEXT: retq 712 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 713 %2 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1) 714 ret <16 x i8> %2 715} 716declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone 717 718define <32 x i8> @stack_fold_paddsb_ymm(<32 x i8> %a0, <32 x i8> %a1) { 719; CHECK-LABEL: stack_fold_paddsb_ymm: 720; CHECK: # %bb.0: 721; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 722; CHECK-NEXT: #APP 723; CHECK-NEXT: nop 724; CHECK-NEXT: #NO_APP 725; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte 
Folded Reload 726; CHECK-NEXT: retq 727 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 728 %2 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1) 729 ret <32 x i8> %2 730} 731declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone 732 733define <8 x i16> @stack_fold_paddsw(<8 x i16> %a0, <8 x i16> %a1) { 734; CHECK-LABEL: stack_fold_paddsw: 735; CHECK: # %bb.0: 736; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 737; CHECK-NEXT: #APP 738; CHECK-NEXT: nop 739; CHECK-NEXT: #NO_APP 740; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 741; CHECK-NEXT: retq 742 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 743 %2 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1) 744 ret <8 x i16> %2 745} 746declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 747 748define <16 x i16> @stack_fold_paddsw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 749; CHECK-LABEL: stack_fold_paddsw_ymm: 750; CHECK: # %bb.0: 751; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 752; CHECK-NEXT: #APP 753; CHECK-NEXT: nop 754; CHECK-NEXT: #NO_APP 755; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 756; CHECK-NEXT: retq 757 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 758 %2 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1) 759 ret <16 x i16> %2 760} 761declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone 762 763define <16 x i8> @stack_fold_paddusb(<16 x i8> %a0, <16 x i8> %a1) { 764; CHECK-LABEL: stack_fold_paddusb: 765; CHECK: # %bb.0: 766; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 767; CHECK-NEXT: #APP 768; CHECK-NEXT: nop 769; CHECK-NEXT: #NO_APP 770; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 771; CHECK-NEXT: retq 772 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 773 %2 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1) 774 ret <16 x i8> %2 775} 776declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone 777 778define <32 x i8> @stack_fold_paddusb_ymm(<32 x i8> %a0, <32 x i8> %a1) { 779; CHECK-LABEL: stack_fold_paddusb_ymm: 780; CHECK: # %bb.0: 781; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 782; CHECK-NEXT: #APP 783; CHECK-NEXT: nop 784; CHECK-NEXT: #NO_APP 785; 
CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 786; CHECK-NEXT: retq 787 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 788 %2 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1) 789 ret <32 x i8> %2 790} 791declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone 792 793define <8 x i16> @stack_fold_paddusw(<8 x i16> %a0, <8 x i16> %a1) { 794; CHECK-LABEL: stack_fold_paddusw: 795; CHECK: # %bb.0: 796; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 797; CHECK-NEXT: #APP 798; CHECK-NEXT: nop 799; CHECK-NEXT: #NO_APP 800; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 801; CHECK-NEXT: retq 802 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 803 %2 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1) 804 ret <8 x i16> %2 805} 806declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 807 808define <16 x i16> @stack_fold_paddusw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 809; CHECK-LABEL: stack_fold_paddusw_ymm: 810; CHECK: # %bb.0: 811; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 812; CHECK-NEXT: #APP 813; CHECK-NEXT: nop 814; CHECK-NEXT: #NO_APP 815; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 816; CHECK-NEXT: retq 817 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 818 %2 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1) 819 ret <16 x i16> %2 820} 821declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone 822 823define <8 x i16> @stack_fold_paddw(<8 x i16> %a0, <8 x i16> %a1) { 824; CHECK-LABEL: stack_fold_paddw: 825; CHECK: # %bb.0: 826; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 827; CHECK-NEXT: #APP 828; CHECK-NEXT: nop 829; CHECK-NEXT: #NO_APP 830; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 831; CHECK-NEXT: retq 832 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 833 %2 = add <8 x i16> %a0, %a1 834 ret <8 x i16> %2 835} 836 837define <16 x i16> @stack_fold_paddw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 838; CHECK-LABEL: stack_fold_paddw_ymm: 839; CHECK: # %bb.0: 840; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 841; CHECK-NEXT: #APP 842; CHECK-NEXT: nop 843; CHECK-NEXT: #NO_APP 844; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 
32-byte Folded Reload 845; CHECK-NEXT: retq 846 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 847 %2 = add <16 x i16> %a0, %a1 848 ret <16 x i16> %2 849} 850 851define <32 x i8> @stack_fold_palignr(<32 x i8> %a0, <32 x i8> %a1) { 852; CHECK-LABEL: stack_fold_palignr: 853; CHECK: # %bb.0: 854; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 855; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 856; CHECK-NEXT: #APP 857; CHECK-NEXT: nop 858; CHECK-NEXT: #NO_APP 859; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 860; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 861; CHECK-NEXT: # ymm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] 862; CHECK-NEXT: retq 863 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 864 %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48> 865 ret <32 x i8> %2 866} 867 868define <32 x i8> @stack_fold_palignr_mask(<32 x i8> %a0, <32 x i8> %a1, ptr %passthru, i32 %mask) { 869; CHECK-LABEL: stack_fold_palignr_mask: 870; CHECK: # %bb.0: 871; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 872; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 873; CHECK-NEXT: #APP 874; CHECK-NEXT: nop 875; CHECK-NEXT: #NO_APP 876; CHECK-NEXT: kmovd %esi, %k1 877; CHECK-NEXT: vmovdqa (%rdi), %ymm1 878; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 879; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 {%k1} # 32-byte Folded Reload 880; CHECK-NEXT: # ymm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] 881; CHECK-NEXT: vmovdqa %ymm1, %ymm0 882; CHECK-NEXT: retq 883 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 884 %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48> 885 %3 = bitcast i32 %mask to <32 x i1> 886 %4 = load <32 x i8>, ptr %passthru 887 %5 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %4 888 ret <32 x i8> %5 889} 890 891define <32 x i8> @stack_fold_palignr_maskz(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { 892; CHECK-LABEL: 
stack_fold_palignr_maskz: 893; CHECK: # %bb.0: 894; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 895; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 896; CHECK-NEXT: #APP 897; CHECK-NEXT: nop 898; CHECK-NEXT: #NO_APP 899; CHECK-NEXT: kmovd %edi, %k1 900; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 901; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 902; CHECK-NEXT: # ymm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] 903; CHECK-NEXT: retq 904 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 905 %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48> 906 %3 = bitcast i32 %mask to <32 x i1> 907 %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer 908 ret <32 x i8> %4 909} 910 911define i16 @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) { 912; CHECK-LABEL: stack_fold_pcmpeqb: 913; CHECK: # %bb.0: 914; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 915; CHECK-NEXT: #APP 916; CHECK-NEXT: nop 917; CHECK-NEXT: #NO_APP 918; CHECK-NEXT: vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload 919; CHECK-NEXT: kmovd %k0, %eax 920; CHECK-NEXT: # kill: def $ax killed $ax killed $eax 921; CHECK-NEXT: retq 922 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 923 %2 = icmp eq <16 x i8> %a0, %a1 924 %3 = bitcast <16 x i1> %2 to i16 925 ret i16 %3 926} 927 928define i8 @stack_fold_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1) { 929; CHECK-LABEL: stack_fold_pcmpeqd: 930; CHECK: # %bb.0: 931; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 932; CHECK-NEXT: #APP 933; CHECK-NEXT: nop 934; CHECK-NEXT: #NO_APP 935; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload 936; CHECK-NEXT: kmovd %k0, %eax 937; CHECK-NEXT: # kill: def $al killed $al killed $eax 938; CHECK-NEXT: retq 939 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 940 %2 = icmp eq <4 x i32> %a0, %a1 941 %3 = shufflevector <4 x i1> %2, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 942 %4 = bitcast <8 x i1> %3 to i8 943 ret i8 %4 944} 945 946define i8 @stack_fold_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1) { 947; CHECK-LABEL: stack_fold_pcmpeqq: 948; CHECK: # %bb.0: 949; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 950; CHECK-NEXT: #APP 951; 
CHECK-NEXT: nop 952; CHECK-NEXT: #NO_APP 953; CHECK-NEXT: vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload 954; CHECK-NEXT: kmovd %k0, %eax 955; CHECK-NEXT: # kill: def $al killed $al killed $eax 956; CHECK-NEXT: retq 957 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 958 %2 = icmp eq <2 x i64> %a0, %a1 959 %3 = shufflevector <2 x i1> %2, <2 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> 960 %4 = bitcast <8 x i1> %3 to i8 961 ret i8 %4 962} 963 964define i8 @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) { 965; CHECK-LABEL: stack_fold_pcmpeqw: 966; CHECK: # %bb.0: 967; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 968; CHECK-NEXT: #APP 969; CHECK-NEXT: nop 970; CHECK-NEXT: #NO_APP 971; CHECK-NEXT: vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload 972; CHECK-NEXT: kmovd %k0, %eax 973; CHECK-NEXT: # kill: def $al killed $al killed $eax 974; CHECK-NEXT: retq 975 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 976 %2 = icmp eq <8 x i16> %a0, %a1 977 %3 = bitcast <8 x i1> %2 to i8 978 ret i8 %3 979} 980 981define <32 x i8> @stack_fold_permbvar(<32 x i8> %a0, <32 x i8> %a1) { 982; CHECK-LABEL: stack_fold_permbvar: 983; CHECK: # %bb.0: 984; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 985; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 986; CHECK-NEXT: #APP 987; CHECK-NEXT: nop 988; CHECK-NEXT: #NO_APP 989; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 990; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 991; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 992; CHECK-NEXT: vpsubb %ymm1, %ymm0, %ymm0 993; CHECK-NEXT: retq 994 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 995 %2 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a1, <32 x i8> %a0) 996 ; add forces execution domain 997 %3 = add <32 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 998 ret <32 x i8> %3 999} 1000declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>) nounwind readonly 1001 1002define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) { 1003; CHECK-LABEL: stack_fold_permd: 1004; CHECK: # %bb.0: 1005; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1006; CHECK-NEXT: #APP 1007; CHECK-NEXT: nop 1008; CHECK-NEXT: #NO_APP 1009; CHECK-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1010; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1011; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0 1012; 
define <32 x i8> @stack_fold_permbvar(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_permbvar:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT:    vpermb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a1, <32 x i8> %a0)
  ; add forces execution domain
  %3 = add <32 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <32 x i8> %3
}
declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>) nounwind readonly

define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_permd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly

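; For the vpermi2* tests the index vector (%x0) is the intrinsic's middle
; operand and arrives in %xmm0/%ymm0, the register tied to the destination,
; so instruction selection emits the index-destructive 'i' form.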
define <16 x i8> @stack_fold_vpermi2b(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) {
; CHECK-LABEL: stack_fold_vpermi2b:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermi2b {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x1, <16 x i8> %x0, <16 x i8> %x2)
  ret <16 x i8> %2
}

define <32 x i8> @stack_fold_vpermi2b_ymm(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) {
; CHECK-LABEL: stack_fold_vpermi2b_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermi2b {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x1, <32 x i8> %x0, <32 x i8> %x2)
  ret <32 x i8> %2
}

define <4 x i32> @stack_fold_vpermi2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
; CHECK-LABEL: stack_fold_vpermi2d:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2)
  ret <4 x i32> %2
}

define <8 x i32> @stack_fold_vpermi2d_ymm(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
; CHECK-LABEL: stack_fold_vpermi2d_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2)
  ret <8 x i32> %2
}

define <2 x i64> @stack_fold_vpermi2q(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
; CHECK-LABEL: stack_fold_vpermi2q:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2)
  ret <2 x i64> %2
}

define <4 x i64> @stack_fold_vpermi2q_ymm(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) {
; CHECK-LABEL: stack_fold_vpermi2q_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2)
  ret <4 x i64> %2
}

define <8 x i16> @stack_fold_vpermi2w(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) {
; CHECK-LABEL: stack_fold_vpermi2w:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermi2w {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2)
  ret <8 x i16> %2
}

define <16 x i16> @stack_fold_vpermi2w_ymm(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) {
; CHECK-LABEL: stack_fold_vpermi2w_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermi2w {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2)
  ret <16 x i16> %2
}

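; The vpermq immediate $235 (0b11101011) encodes the qword selectors
; [3,2,2,3], matching the shufflevector mask in the IR below.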
define <4 x i64> @stack_fold_permq(<4 x i64> %a0) {
; CHECK-LABEL: stack_fold_permq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[3,2,2,3]
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
  ; add forces execution domain
  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %3
}

define <4 x i64> @stack_fold_permqvar(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_permqvar:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT:    vpermq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a1, <4 x i64> %a0)
  ; add forces execution domain
  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %3
}
declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) nounwind readonly

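; The vpermt2* tests call the same vpermi2var intrinsics with the first two
; operands swapped, so the tied register holds table data rather than indices
; and codegen selects the table-destructive 't' form instead.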
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1183 %2 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) 1184 ret <16 x i8> %2 1185} 1186declare <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>) 1187 1188define <32 x i8> @stack_fold_vpermt2b_ymm(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { 1189; CHECK-LABEL: stack_fold_vpermt2b_ymm: 1190; CHECK: # %bb.0: 1191; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1192; CHECK-NEXT: #APP 1193; CHECK-NEXT: nop 1194; CHECK-NEXT: #NO_APP 1195; CHECK-NEXT: vpermt2b {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1196; CHECK-NEXT: retq 1197 %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1198 %2 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) 1199 ret <32 x i8> %2 1200} 1201declare <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>) 1202 1203define <4 x i32> @stack_fold_vpermt2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { 1204; CHECK-LABEL: stack_fold_vpermt2d: 1205; CHECK: # %bb.0: 1206; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1207; CHECK-NEXT: #APP 1208; CHECK-NEXT: nop 1209; CHECK-NEXT: #NO_APP 1210; CHECK-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1211; CHECK-NEXT: retq 1212 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1213 %2 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) 1214 ret <4 x i32> %2 1215} 1216declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>) 1217 1218define <8 x i32> @stack_fold_vpermt2d_ymm(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { 1219; CHECK-LABEL: stack_fold_vpermt2d_ymm: 1220; CHECK: # %bb.0: 1221; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1222; CHECK-NEXT: #APP 1223; CHECK-NEXT: nop 1224; CHECK-NEXT: #NO_APP 1225; CHECK-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1226; CHECK-NEXT: retq 1227 %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1228 %2 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) 1229 ret <8 x i32> %2 1230} 1231declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>) 1232 1233define <2 x i64> @stack_fold_vpermt2q(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { 1234; CHECK-LABEL: stack_fold_vpermt2q: 1235; CHECK: # %bb.0: 1236; 
define <4 x i64> @stack_fold_vpermt2q_ymm(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) {
; CHECK-LABEL: stack_fold_vpermt2q_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
  ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>)

define <8 x i16> @stack_fold_vpermt2w(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) {
; CHECK-LABEL: stack_fold_vpermt2w:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermt2w {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>)

define <16 x i16> @stack_fold_vpermt2w_ymm(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) {
; CHECK-LABEL: stack_fold_vpermt2w_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermt2w {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>)

define <16 x i16> @stack_fold_permwvar(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_permwvar:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT:    vpermw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a1, <16 x i16> %a0)
  ; add forces execution domain
  %3 = add <16 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %3
}
declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>) nounwind readonly

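; vplzcnt* lowers llvm.ctlz with the zero-is-poison flag set to false; the
; instruction itself is well defined for a zero input, so no extra zero
; check is needed around the folded load.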
define <4 x i32> @stack_fold_vplzcntd(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_vplzcntd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vplzcntd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a0, i1 false)
  ret <4 x i32> %2
}

define <8 x i32> @stack_fold_vplzcntd_ymm(<8 x i32> %a0) {
; CHECK-LABEL: stack_fold_vplzcntd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vplzcntd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a0, i1 false)
  ret <8 x i32> %2
}

define <2 x i64> @stack_fold_vplzcntq(<2 x i64> %a0) {
; CHECK-LABEL: stack_fold_vplzcntq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a0, i1 false)
  ret <2 x i64> %2
}

define <4 x i64> @stack_fold_vplzcntq_ymm(<4 x i64> %a0) {
; CHECK-LABEL: stack_fold_vplzcntq_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a0, i1 false)
  ret <4 x i64> %2
}

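; vpmaddubsw multiplies unsigned bytes from the first operand by the
; corresponding signed bytes from the second, then adds adjacent products
; into words with signed saturation.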
sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1352 %2 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a0, i1 false) 1353 ret <2 x i64> %2 1354} 1355 1356define <4 x i64> @stack_fold_vplzcntq_ymm(<4 x i64> %a0) { 1357; CHECK-LABEL: stack_fold_vplzcntq_ymm: 1358; CHECK: # %bb.0: 1359; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1360; CHECK-NEXT: #APP 1361; CHECK-NEXT: nop 1362; CHECK-NEXT: #NO_APP 1363; CHECK-NEXT: vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 1364; CHECK-NEXT: retq 1365 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1366 %2 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a0, i1 false) 1367 ret <4 x i64> %2 1368} 1369 1370define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) { 1371; CHECK-LABEL: stack_fold_pmaddubsw: 1372; CHECK: # %bb.0: 1373; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1374; CHECK-NEXT: #APP 1375; CHECK-NEXT: nop 1376; CHECK-NEXT: #NO_APP 1377; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1378; CHECK-NEXT: retq 1379 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1380 %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) 1381 ret <8 x i16> %2 1382} 1383declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone 1384 1385define <8 x i16> @stack_fold_pmaddubsw_mask(ptr %passthru, <16 x i8> %a0, <16 x i8> %a1, i8 %mask) { 1386; CHECK-LABEL: stack_fold_pmaddubsw_mask: 1387; CHECK: # %bb.0: 1388; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1389; CHECK-NEXT: #APP 1390; CHECK-NEXT: nop 1391; CHECK-NEXT: #NO_APP 1392; CHECK-NEXT: kmovd %esi, %k1 1393; CHECK-NEXT: vmovdqa (%rdi), %xmm2 1394; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 1395; CHECK-NEXT: vmovdqa %xmm2, %xmm0 1396; CHECK-NEXT: retq 1397 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1398 %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) 1399 %3 = bitcast i8 %mask to <8 x i1> 1400 ; load needed to keep the operation from being scheduled about the asm block 1401 %4 = load <8 x i16>, ptr %passthru 1402 %5 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %4 1403 ret <8 x i16> %5 1404} 1405 1406define <8 x i16> @stack_fold_pmaddubsw_maskz(<16 x i8> %a0, <16 x i8> %a1, i8 %mask) { 1407; CHECK-LABEL: stack_fold_pmaddubsw_maskz: 1408; CHECK: # %bb.0: 1409; 
define <16 x i16> @stack_fold_pmaddubsw_ymm_mask(ptr %passthru, <32 x i8> %a0, <32 x i8> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_pmaddubsw_ymm_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
  %3 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <16 x i16>, ptr %passthru
  %5 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %4
  ret <16 x i16> %5
}

define <16 x i16> @stack_fold_pmaddubsw_ymm_maskz(<32 x i8> %a0, <32 x i8> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_pmaddubsw_ymm_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
  ret <16 x i16> %4
}

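; vpmaddwd multiplies corresponding signed words and adds adjacent pairs of
; products into doublewords.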
"nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1470 %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1) 1471 %3 = bitcast i16 %mask to <16 x i1> 1472 %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer 1473 ret <16 x i16> %4 1474} 1475 1476define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) { 1477; CHECK-LABEL: stack_fold_pmaddwd: 1478; CHECK: # %bb.0: 1479; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1480; CHECK-NEXT: #APP 1481; CHECK-NEXT: nop 1482; CHECK-NEXT: #NO_APP 1483; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1484; CHECK-NEXT: retq 1485 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1486 %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) 1487 ret <4 x i32> %2 1488} 1489declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone 1490 1491define <4 x i32> @stack_fold_pmaddwd_mask(ptr %passthru, <8 x i16> %a0, <8 x i16> %a1, i8 %mask) { 1492; CHECK-LABEL: stack_fold_pmaddwd_mask: 1493; CHECK: # %bb.0: 1494; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1495; CHECK-NEXT: #APP 1496; CHECK-NEXT: nop 1497; CHECK-NEXT: #NO_APP 1498; CHECK-NEXT: kmovd %esi, %k1 1499; CHECK-NEXT: vmovdqa (%rdi), %xmm2 1500; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 1501; CHECK-NEXT: vmovdqa %xmm2, %xmm0 1502; CHECK-NEXT: retq 1503 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1504 %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) 1505 %3 = bitcast i8 %mask to <8 x i1> 1506 %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1507 ; load needed to keep the operation from being scheduled about the asm block 1508 %5 = load <4 x i32>, ptr %passthru 1509 %6 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> %5 1510 ret <4 x i32> %6 1511} 1512 1513define <4 x i32> @stack_fold_pmaddwd_maskz(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) { 1514; CHECK-LABEL: stack_fold_pmaddwd_maskz: 1515; CHECK: # %bb.0: 1516; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1517; CHECK-NEXT: #APP 1518; CHECK-NEXT: nop 1519; CHECK-NEXT: #NO_APP 1520; CHECK-NEXT: kmovd %edi, %k1 1521; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload 1522; CHECK-NEXT: retq 1523 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1524 %2 = call <4 x i32> 
define <8 x i32> @stack_fold_pmaddwd_ymm(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaddwd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone

define <8 x i32> @stack_fold_pmaddwd_ymm_mask(ptr %passthru, <16 x i16> %a0, <16 x i16> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_pmaddwd_ymm_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
  %3 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <8 x i32>, ptr %passthru
  %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %4
  ret <8 x i32> %5
}

define <8 x i32> @stack_fold_pmaddwd_ymm_maskz(<16 x i16> %a0, <16 x i16> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_pmaddwd_ymm_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
  ret <8 x i32> %4
}

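; The max/min tests below express each operation as an icmp+select pair,
; which the backend is expected to match to a single folded vpmax*/vpmin*.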
define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pmaxsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <16 x i8> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
  ret <16 x i8> %3
}

define <32 x i8> @stack_fold_pmaxsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pmaxsb_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <32 x i8> %a0, %a1
  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
  ret <32 x i8> %3
}

define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmaxsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <4 x i32> %a0, %a1
  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
  ret <4 x i32> %3
}

define <8 x i32> @stack_fold_pmaxsd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmaxsd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <8 x i32> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
  ret <8 x i32> %3
}

define <2 x i64> @stack_fold_pmaxsq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_pmaxsq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <2 x i64> %a0, %a1
  %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1
  ret <2 x i64> %3
}

define <4 x i64> @stack_fold_pmaxsq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_pmaxsq_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <4 x i64> %a0, %a1
  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1
  ret <4 x i64> %3
}

define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaxsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <8 x i16> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
  ret <8 x i16> %3
}

define <16 x i16> @stack_fold_pmaxsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaxsw_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <16 x i16> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
  ret <16 x i16> %3
}

define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pmaxub:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp ugt <16 x i8> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
  ret <16 x i8> %3
}

define <32 x i8> @stack_fold_pmaxub_ymm(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pmaxub_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp ugt <32 x i8> %a0, %a1
  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
  ret <32 x i8> %3
}

define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmaxud:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp ugt <4 x i32> %a0, %a1
  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
  ret <4 x i32> %3
}

define <8 x i32> @stack_fold_pmaxud_ymm(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmaxud_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp ugt <8 x i32> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
  ret <8 x i32> %3
}

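; 64-bit unsigned max/min have no SSE/AVX2 encoding; vpmaxuq and vpminuq are
; AVX-512 instructions, and the 128/256-bit forms here rely on AVX512VL.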
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1774 %2 = icmp ugt <2 x i64> %a0, %a1 1775 %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1 1776 ret <2 x i64> %3 1777} 1778 1779define <2 x i64> @stack_fold_pmaxuq_mask(ptr %passthru, <2 x i64> %a0, <2 x i64> %a1, i8 %mask) { 1780; CHECK-LABEL: stack_fold_pmaxuq_mask: 1781; CHECK: # %bb.0: 1782; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1783; CHECK-NEXT: #APP 1784; CHECK-NEXT: nop 1785; CHECK-NEXT: #NO_APP 1786; CHECK-NEXT: vmovdqa (%rdi), %xmm2 1787; CHECK-NEXT: kmovd %esi, %k1 1788; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 1789; CHECK-NEXT: vmovdqa %xmm2, %xmm0 1790; CHECK-NEXT: retq 1791 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1792 %2 = load <2 x i64>, ptr %passthru 1793 %3 = icmp ugt <2 x i64> %a0, %a1 1794 %4 = select <2 x i1> %3, <2 x i64> %a0, <2 x i64> %a1 1795 %5 = bitcast i8 %mask to <8 x i1> 1796 %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1> 1797 %6 = select <2 x i1> %extract, <2 x i64> %4, <2 x i64> %2 1798 ret <2 x i64> %6 1799} 1800 1801define <2 x i64> @stack_fold_pmaxuq_maskz(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { 1802; CHECK-LABEL: stack_fold_pmaxuq_maskz: 1803; CHECK: # %bb.0: 1804; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1805; CHECK-NEXT: #APP 1806; CHECK-NEXT: nop 1807; CHECK-NEXT: #NO_APP 1808; CHECK-NEXT: kmovd %edi, %k1 1809; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload 1810; CHECK-NEXT: retq 1811 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1812 %2 = icmp ugt <2 x i64> %a0, %a1 1813 %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1 1814 %4 = bitcast i8 %mask to <8 x i1> 1815 %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <2 x i32> <i32 0, i32 1> 1816 %5 = select <2 x i1> %extract, <2 x i64> %3, <2 x i64> zeroinitializer 1817 ret <2 x i64> %5 1818} 1819 1820define <4 x i64> @stack_fold_pmaxuq_ymm(<4 x i64> %a0, <4 x i64> %a1) { 1821; CHECK-LABEL: stack_fold_pmaxuq_ymm: 1822; CHECK: # %bb.0: 1823; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1824; CHECK-NEXT: #APP 1825; CHECK-NEXT: nop 1826; CHECK-NEXT: #NO_APP 1827; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1828; CHECK-NEXT: retq 1829 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1830 %2 = icmp ugt <4 x i64> %a0, %a1 1831 %3 = select <4 x i1> %2, <4 x i64> %a0, 
define <4 x i64> @stack_fold_pmaxuq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_pmaxuq_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp ugt <4 x i64> %a0, %a1
  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1
  ret <4 x i64> %3
}

define <4 x i64> @stack_fold_pmaxuq_ymm_mask(ptr %passthru, <4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_pmaxuq_ymm_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <4 x i64>, ptr %passthru
  %3 = icmp ugt <4 x i64> %a0, %a1
  %4 = select <4 x i1> %3, <4 x i64> %a0, <4 x i64> %a1
  %5 = bitcast i8 %mask to <8 x i1>
  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %6 = select <4 x i1> %extract, <4 x i64> %4, <4 x i64> %2
  ret <4 x i64> %6
}

define <4 x i64> @stack_fold_pmaxuq_ymm_maskz(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_pmaxuq_ymm_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp ugt <4 x i64> %a0, %a1
  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1
  %4 = bitcast i8 %mask to <8 x i1>
  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract, <4 x i64> %3, <4 x i64> zeroinitializer
  ret <4 x i64> %5
}

define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaxuw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp ugt <8 x i16> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
  ret <8 x i16> %3
}

define <16 x i16> @stack_fold_pmaxuw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaxuw_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp ugt <16 x i16> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
  ret <16 x i16> %3
}

define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pminsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp slt <16 x i8> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
  ret <16 x i8> %3
}

define <32 x i8> @stack_fold_pminsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pminsb_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp slt <32 x i8> %a0, %a1
  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
  ret <32 x i8> %3
}

define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pminsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp slt <4 x i32> %a0, %a1
  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
  ret <4 x i32> %3
}

define <8 x i32> @stack_fold_pminsd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pminsd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp slt <8 x i32> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
  ret <8 x i32> %3
}

"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2021 %2 = icmp slt <16 x i16> %a0, %a1 2022 %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1 2023 ret <16 x i16> %3 2024} 2025 2026define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) { 2027; CHECK-LABEL: stack_fold_pminub: 2028; CHECK: # %bb.0: 2029; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2030; CHECK-NEXT: #APP 2031; CHECK-NEXT: nop 2032; CHECK-NEXT: #NO_APP 2033; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2034; CHECK-NEXT: retq 2035 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2036 %2 = icmp ult <16 x i8> %a0, %a1 2037 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1 2038 ret <16 x i8> %3 2039} 2040 2041define <32 x i8> @stack_fold_pminub_ymm(<32 x i8> %a0, <32 x i8> %a1) { 2042; CHECK-LABEL: stack_fold_pminub_ymm: 2043; CHECK: # %bb.0: 2044; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2045; CHECK-NEXT: #APP 2046; CHECK-NEXT: nop 2047; CHECK-NEXT: #NO_APP 2048; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2049; CHECK-NEXT: retq 2050 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2051 %2 = icmp ult <32 x i8> %a0, %a1 2052 %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1 2053 ret <32 x i8> %3 2054} 2055 2056define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) { 2057; CHECK-LABEL: stack_fold_pminud: 2058; CHECK: # %bb.0: 2059; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2060; CHECK-NEXT: #APP 2061; CHECK-NEXT: nop 2062; CHECK-NEXT: #NO_APP 2063; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2064; CHECK-NEXT: retq 2065 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2066 %2 = icmp ult <4 x i32> %a0, %a1 2067 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1 2068 ret <4 x i32> %3 2069} 2070 2071define <8 x i32> @stack_fold_pminud_ymm(<8 x i32> %a0, <8 x i32> %a1) { 2072; CHECK-LABEL: stack_fold_pminud_ymm: 2073; CHECK: # %bb.0: 2074; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2075; CHECK-NEXT: #APP 2076; CHECK-NEXT: nop 2077; CHECK-NEXT: #NO_APP 2078; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2079; CHECK-NEXT: retq 2080 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2081 %2 = icmp ult <8 x i32> %a0, %a1 2082 %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1 2083 ret <8 x i32> %3 2084} 2085 2086define <2 x i64> @stack_fold_pminuq(<2 x i64> %a0, <2 x i64> %a1) { 2087; CHECK-LABEL: stack_fold_pminuq: 2088; CHECK: # %bb.0: 2089; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2090; CHECK-NEXT: #APP 2091; CHECK-NEXT: nop 2092; CHECK-NEXT: #NO_APP 2093; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2094; CHECK-NEXT: retq 2095 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2096 %2 = icmp ult <2 x i64> %a0, %a1 2097 %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1 2098 ret <2 x i64> %3 2099} 2100 2101define <4 x i64> @stack_fold_pminuq_ymm(<4 x i64> %a0, <4 x i64> %a1) { 2102; CHECK-LABEL: stack_fold_pminuq_ymm: 2103; CHECK: # %bb.0: 2104; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2105; CHECK-NEXT: #APP 2106; CHECK-NEXT: nop 2107; CHECK-NEXT: #NO_APP 2108; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2109; CHECK-NEXT: retq 2110 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2111 %2 = icmp ult <4 x i64> %a0, %a1 2112 %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1 2113 ret <4 x i64> %3 2114} 2115 2116define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) { 2117; CHECK-LABEL: stack_fold_pminuw: 2118; CHECK: # %bb.0: 2119; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2120; CHECK-NEXT: #APP 2121; CHECK-NEXT: nop 2122; CHECK-NEXT: #NO_APP 2123; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2124; CHECK-NEXT: retq 2125 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2126 %2 = icmp ult <8 x i16> %a0, %a1 2127 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1 2128 ret <8 x i16> %3 2129} 2130 2131define <16 x i16> @stack_fold_pminuw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 2132; CHECK-LABEL: stack_fold_pminuw_ymm: 2133; CHECK: # %bb.0: 2134; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2135; CHECK-NEXT: #APP 2136; CHECK-NEXT: nop 2137; CHECK-NEXT: #NO_APP 2138; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2139; CHECK-NEXT: retq 2140 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2141 %2 = icmp ult <16 x i16> %a0, %a1 2142 %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1 2143 ret <16 x i16> %3 2144} 2145 2146define <8 x i16> @stack_fold_vpmovdw(<8 x i32> %a0) { 2147; CHECK-LABEL: stack_fold_vpmovdw: 2148; CHECK: # %bb.0: 2149; CHECK-NEXT: vpmovdw %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 2150; CHECK-NEXT: #APP 2151; CHECK-NEXT: nop 2152; CHECK-NEXT: #NO_APP 2153; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2154; CHECK-NEXT: vzeroupper 2155; CHECK-NEXT: retq 2156 %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1) 2157 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2158 ret <8 x i16> %1 2159} 2160declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8) 2161 2162define <4 x i32> @stack_fold_vpmovqd(<4 x i64> %a0) { 2163; CHECK-LABEL: stack_fold_vpmovqd: 2164; CHECK: # %bb.0: 2165; CHECK-NEXT: vpmovqd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 2166; CHECK-NEXT: #APP 2167; CHECK-NEXT: nop 2168; CHECK-NEXT: #NO_APP 2169; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2170; CHECK-NEXT: vzeroupper 2171; CHECK-NEXT: retq 2172 %1 = trunc <4 x i64> %a0 to <4 x i32> 2173 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2174 ret <4 x i32> %1 2175} 2176declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8) 2177 2178define <16 x i8> @stack_fold_vpmovwb(<16 x i16> %a0) { 2179; CHECK-LABEL: stack_fold_vpmovwb: 2180; CHECK: # %bb.0: 2181; CHECK-NEXT: vpmovwb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 2182; CHECK-NEXT: #APP 2183; CHECK-NEXT: nop 2184; CHECK-NEXT: #NO_APP 2185; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2186; CHECK-NEXT: vzeroupper 2187; CHECK-NEXT: retq 2188 %1 = trunc <16 x i16> %a0 to <16 x i8> 2189 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2190 ret <16 x i8> %1 2191} 2192declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16) 2193 2194define <8 x i16> @stack_fold_vpmovsdw(<8 x i32> %a0) { 2195; CHECK-LABEL: stack_fold_vpmovsdw: 2196; CHECK: # %bb.0: 2197; CHECK-NEXT: vpmovsdw %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 2198; CHECK-NEXT: #APP 2199; CHECK-NEXT: nop 2200; CHECK-NEXT: #NO_APP 2201; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2202; CHECK-NEXT: vzeroupper 2203; CHECK-NEXT: retq 
2204 %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1) 2205 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2206 ret <8 x i16> %1 2207} 2208declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32>, <8 x i16>, i8) 2209 2210define <4 x i32> @stack_fold_vpmovsqd(<4 x i64> %a0) { 2211; CHECK-LABEL: stack_fold_vpmovsqd: 2212; CHECK: # %bb.0: 2213; CHECK-NEXT: vpmovsqd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 2214; CHECK-NEXT: #APP 2215; CHECK-NEXT: nop 2216; CHECK-NEXT: #NO_APP 2217; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2218; CHECK-NEXT: vzeroupper 2219; CHECK-NEXT: retq 2220 %1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1) 2221 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2222 ret <4 x i32> %1 2223} 2224declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8) 2225 2226define <16 x i8> @stack_fold_vpmovswb(<16 x i16> %a0) { 2227; CHECK-LABEL: stack_fold_vpmovswb: 2228; CHECK: # %bb.0: 2229; CHECK-NEXT: vpmovswb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 2230; CHECK-NEXT: #APP 2231; CHECK-NEXT: nop 2232; CHECK-NEXT: #NO_APP 2233; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2234; CHECK-NEXT: vzeroupper 2235; CHECK-NEXT: retq 2236 %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1) 2237 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2238 ret <16 x i8> %1 2239} 2240declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16) 2241 2242define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) { 2243; CHECK-LABEL: stack_fold_pmovsxbd: 2244; CHECK: # %bb.0: 2245; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2246; CHECK-NEXT: #APP 2247; CHECK-NEXT: nop 2248; CHECK-NEXT: #NO_APP 2249; CHECK-NEXT: vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2250; CHECK-NEXT: retq 2251 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2252 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2253 %3 = sext <4 x i8> %2 to <4 x i32> 2254 ret <4 x i32> %3 2255} 2256 2257define <8 x i32> @stack_fold_pmovsxbd_ymm(<16 x i8> %a0) { 2258; CHECK-LABEL: stack_fold_pmovsxbd_ymm: 2259; CHECK: # %bb.0: 2260; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2261; 
CHECK-NEXT: #APP 2262; CHECK-NEXT: nop 2263; CHECK-NEXT: #NO_APP 2264; CHECK-NEXT: vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2265; CHECK-NEXT: retq 2266 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2267 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2268 %3 = sext <8 x i8> %2 to <8 x i32> 2269 ret <8 x i32> %3 2270} 2271 2272define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) { 2273; CHECK-LABEL: stack_fold_pmovsxbq: 2274; CHECK: # %bb.0: 2275; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2276; CHECK-NEXT: #APP 2277; CHECK-NEXT: nop 2278; CHECK-NEXT: #NO_APP 2279; CHECK-NEXT: vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2280; CHECK-NEXT: retq 2281 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2282 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1> 2283 %3 = sext <2 x i8> %2 to <2 x i64> 2284 ret <2 x i64> %3 2285} 2286 2287define <4 x i64> @stack_fold_pmovsxbq_ymm(<16 x i8> %a0) { 2288; CHECK-LABEL: stack_fold_pmovsxbq_ymm: 2289; CHECK: # %bb.0: 2290; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2291; CHECK-NEXT: #APP 2292; CHECK-NEXT: nop 2293; CHECK-NEXT: #NO_APP 2294; CHECK-NEXT: vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2295; CHECK-NEXT: retq 2296 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2297 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2298 %3 = sext <4 x i8> %2 to <4 x i64> 2299 ret <4 x i64> %3 2300} 2301 2302define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) { 2303; CHECK-LABEL: stack_fold_pmovsxbw: 2304; CHECK: # %bb.0: 2305; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2306; CHECK-NEXT: #APP 2307; CHECK-NEXT: nop 2308; CHECK-NEXT: #NO_APP 2309; CHECK-NEXT: vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2310; CHECK-NEXT: retq 2311 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2312 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2313 %3 = sext <8 x i8> %2 to <8 x i16> 2314 ret <8 x i16> %3 2315} 2316 2317define <16 x i16> @stack_fold_pmovsxbw_ymm(<16 x i8> %a0) { 2318; CHECK-LABEL: stack_fold_pmovsxbw_ymm: 2319; CHECK: # %bb.0: 2320; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill 2321; CHECK-NEXT: #APP 2322; CHECK-NEXT: nop 2323; CHECK-NEXT: #NO_APP 2324; CHECK-NEXT: vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2325; CHECK-NEXT: retq 2326 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2327 %2 = sext <16 x i8> %a0 to <16 x i16> 2328 ret <16 x i16> %2 2329} 2330 2331define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) { 2332; CHECK-LABEL: stack_fold_pmovsxdq: 2333; CHECK: # %bb.0: 2334; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2335; CHECK-NEXT: #APP 2336; CHECK-NEXT: nop 2337; CHECK-NEXT: #NO_APP 2338; CHECK-NEXT: vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2339; CHECK-NEXT: retq 2340 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2341 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1> 2342 %3 = sext <2 x i32> %2 to <2 x i64> 2343 ret <2 x i64> %3 2344} 2345 2346define <4 x i64> @stack_fold_pmovsxdq_ymm(<4 x i32> %a0) { 2347; CHECK-LABEL: stack_fold_pmovsxdq_ymm: 2348; CHECK: # %bb.0: 2349; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2350; CHECK-NEXT: #APP 2351; CHECK-NEXT: nop 2352; CHECK-NEXT: #NO_APP 2353; CHECK-NEXT: vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2354; CHECK-NEXT: retq 2355 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2356 %2 = sext <4 x i32> %a0 to <4 x i64> 2357 ret <4 x i64> %2 2358} 2359 2360define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) { 2361; CHECK-LABEL: stack_fold_pmovsxwd: 2362; CHECK: # %bb.0: 2363; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2364; CHECK-NEXT: #APP 2365; CHECK-NEXT: nop 2366; CHECK-NEXT: #NO_APP 2367; CHECK-NEXT: vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2368; CHECK-NEXT: retq 2369 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2370 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2371 %3 = sext <4 x i16> %2 to <4 x i32> 2372 ret <4 x i32> %3 2373} 2374 2375define <8 x i32> @stack_fold_pmovsxwd_ymm(<8 x i16> %a0) { 2376; CHECK-LABEL: stack_fold_pmovsxwd_ymm: 2377; CHECK: # %bb.0: 2378; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2379; CHECK-NEXT: #APP 2380; CHECK-NEXT: nop 2381; CHECK-NEXT: #NO_APP 2382; CHECK-NEXT: vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2383; CHECK-NEXT: retq 2384 %1 = tail call <2 x 
i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2385 %2 = sext <8 x i16> %a0 to <8 x i32> 2386 ret <8 x i32> %2 2387} 2388 2389define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) { 2390; CHECK-LABEL: stack_fold_pmovsxwq: 2391; CHECK: # %bb.0: 2392; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2393; CHECK-NEXT: #APP 2394; CHECK-NEXT: nop 2395; CHECK-NEXT: #NO_APP 2396; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2397; CHECK-NEXT: retq 2398 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2399 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1> 2400 %3 = sext <2 x i16> %2 to <2 x i64> 2401 ret <2 x i64> %3 2402} 2403 2404define <4 x i64> @stack_fold_pmovsxwq_ymm(<8 x i16> %a0) { 2405; CHECK-LABEL: stack_fold_pmovsxwq_ymm: 2406; CHECK: # %bb.0: 2407; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2408; CHECK-NEXT: #APP 2409; CHECK-NEXT: nop 2410; CHECK-NEXT: #NO_APP 2411; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2412; CHECK-NEXT: retq 2413 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2414 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2415 %3 = sext <4 x i16> %2 to <4 x i64> 2416 ret <4 x i64> %3 2417} 2418 2419define <8 x i16> @stack_fold_vpmovusdw(<8 x i32> %a0) { 2420; CHECK-LABEL: stack_fold_vpmovusdw: 2421; CHECK: # %bb.0: 2422; CHECK-NEXT: vpmovusdw %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 2423; CHECK-NEXT: #APP 2424; CHECK-NEXT: nop 2425; CHECK-NEXT: #NO_APP 2426; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2427; CHECK-NEXT: vzeroupper 2428; CHECK-NEXT: retq 2429 %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1) 2430 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2431 ret <8 x i16> %1 2432} 2433declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32>, <8 x i16>, i8) 2434 2435define <4 x i32> @stack_fold_vpmovusqd(<4 x i64> %a0) { 2436; CHECK-LABEL: stack_fold_vpmovusqd: 2437; CHECK: # %bb.0: 2438; CHECK-NEXT: vpmovusqd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 2439; CHECK-NEXT: #APP 2440; CHECK-NEXT: nop 2441; CHECK-NEXT: #NO_APP 2442; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2443; CHECK-NEXT: vzeroupper 2444; CHECK-NEXT: retq 2445 %1 = call <4 x i32> 
@llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1) 2446 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2447 ret <4 x i32> %1 2448} 2449declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8) 2450 2451define <16 x i8> @stack_fold_vpmovuswb(<16 x i16> %a0) { 2452; CHECK-LABEL: stack_fold_vpmovuswb: 2453; CHECK: # %bb.0: 2454; CHECK-NEXT: vpmovuswb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 2455; CHECK-NEXT: #APP 2456; CHECK-NEXT: nop 2457; CHECK-NEXT: #NO_APP 2458; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2459; CHECK-NEXT: vzeroupper 2460; CHECK-NEXT: retq 2461 %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1) 2462 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2463 ret <16 x i8> %1 2464} 2465declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16) 2466 2467define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) { 2468; CHECK-LABEL: stack_fold_pmovzxbd: 2469; CHECK: # %bb.0: 2470; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2471; CHECK-NEXT: #APP 2472; CHECK-NEXT: nop 2473; CHECK-NEXT: #NO_APP 2474; CHECK-NEXT: vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2475; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2476; CHECK-NEXT: retq 2477 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2478 %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 1, i32 19, i32 20, i32 21, i32 2, i32 22, i32 23, i32 24, i32 3, i32 25, i32 26, i32 27> 2479 %3 = bitcast <16 x i8> %2 to <4 x i32> 2480 ret <4 x i32> %3 2481} 2482 2483define <8 x i32> @stack_fold_pmovzxbd_ymm(<16 x i8> %a0) { 2484; CHECK-LABEL: stack_fold_pmovzxbd_ymm: 2485; CHECK: # %bb.0: 2486; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2487; CHECK-NEXT: #APP 2488; CHECK-NEXT: nop 2489; CHECK-NEXT: #NO_APP 2490; CHECK-NEXT: vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2491; CHECK-NEXT: # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 2492; CHECK-NEXT: retq 2493 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2494 %2 = shufflevector <16 x i8> 
%a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2495 %3 = zext <8 x i8> %2 to <8 x i32> 2496 ret <8 x i32> %3 2497} 2498 2499define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) { 2500; CHECK-LABEL: stack_fold_pmovzxbq: 2501; CHECK: # %bb.0: 2502; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2503; CHECK-NEXT: #APP 2504; CHECK-NEXT: nop 2505; CHECK-NEXT: #NO_APP 2506; CHECK-NEXT: vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2507; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero 2508; CHECK-NEXT: retq 2509 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2510 %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 1, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28> 2511 %3 = bitcast <16 x i8> %2 to <2 x i64> 2512 ret <2 x i64> %3 2513} 2514 2515define <4 x i64> @stack_fold_pmovzxbq_ymm(<16 x i8> %a0) { 2516; CHECK-LABEL: stack_fold_pmovzxbq_ymm: 2517; CHECK: # %bb.0: 2518; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2519; CHECK-NEXT: #APP 2520; CHECK-NEXT: nop 2521; CHECK-NEXT: #NO_APP 2522; CHECK-NEXT: vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2523; CHECK-NEXT: # ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero 2524; CHECK-NEXT: retq 2525 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2526 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2527 %3 = zext <4 x i8> %2 to <4 x i64> 2528 ret <4 x i64> %3 2529} 2530 2531define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) { 2532; CHECK-LABEL: stack_fold_pmovzxbw: 2533; CHECK: # %bb.0: 2534; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2535; CHECK-NEXT: #APP 2536; CHECK-NEXT: nop 2537; CHECK-NEXT: #NO_APP 2538; CHECK-NEXT: vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2539; CHECK-NEXT: # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2540; CHECK-NEXT: retq 2541 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2542 %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> 2543 %3 = bitcast <16 x i8> %2 to <8 x i16> 2544 ret <8 x i16> %3 2545} 2546 2547define <16 x i16> @stack_fold_pmovzxbw_ymm(<16 x i8> %a0) { 2548; CHECK-LABEL: 
stack_fold_pmovzxbw_ymm: 2549; CHECK: # %bb.0: 2550; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2551; CHECK-NEXT: #APP 2552; CHECK-NEXT: nop 2553; CHECK-NEXT: #NO_APP 2554; CHECK-NEXT: vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2555; CHECK-NEXT: # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 2556; CHECK-NEXT: retq 2557 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2558 %2 = zext <16 x i8> %a0 to <16 x i16> 2559 ret <16 x i16> %2 2560} 2561 2562define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) { 2563; CHECK-LABEL: stack_fold_pmovzxdq: 2564; CHECK: # %bb.0: 2565; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2566; CHECK-NEXT: #APP 2567; CHECK-NEXT: nop 2568; CHECK-NEXT: #NO_APP 2569; CHECK-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2570; CHECK-NEXT: # xmm0 = mem[0],zero,mem[1],zero 2571; CHECK-NEXT: retq 2572 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2573 %2 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 5> 2574 %3 = bitcast <4 x i32> %2 to <2 x i64> 2575 ret <2 x i64> %3 2576} 2577 2578define <4 x i64> @stack_fold_pmovzxdq_ymm(<4 x i32> %a0) { 2579; CHECK-LABEL: stack_fold_pmovzxdq_ymm: 2580; CHECK: # %bb.0: 2581; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2582; CHECK-NEXT: #APP 2583; CHECK-NEXT: nop 2584; CHECK-NEXT: #NO_APP 2585; CHECK-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2586; CHECK-NEXT: # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2587; CHECK-NEXT: retq 2588 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2589 %2 = zext <4 x i32> %a0 to <4 x i64> 2590 ret <4 x i64> %2 2591} 2592 2593define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) { 2594; CHECK-LABEL: stack_fold_pmovzxwd: 2595; CHECK: # %bb.0: 2596; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2597; CHECK-NEXT: #APP 2598; CHECK-NEXT: nop 2599; CHECK-NEXT: #NO_APP 2600; CHECK-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2601; CHECK-NEXT: # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2602; CHECK-NEXT: retq 2603 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2604 %2 = 
shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> 2605 %3 = bitcast <8 x i16> %2 to <4 x i32> 2606 ret <4 x i32> %3 2607} 2608 2609define <8 x i32> @stack_fold_pmovzxwd_ymm(<8 x i16> %a0) { 2610; CHECK-LABEL: stack_fold_pmovzxwd_ymm: 2611; CHECK: # %bb.0: 2612; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2613; CHECK-NEXT: #APP 2614; CHECK-NEXT: nop 2615; CHECK-NEXT: #NO_APP 2616; CHECK-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2617; CHECK-NEXT: # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2618; CHECK-NEXT: retq 2619 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2620 %2 = zext <8 x i16> %a0 to <8 x i32> 2621 ret <8 x i32> %2 2622} 2623 2624define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) { 2625; CHECK-LABEL: stack_fold_pmovzxwq: 2626; CHECK: # %bb.0: 2627; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2628; CHECK-NEXT: #APP 2629; CHECK-NEXT: nop 2630; CHECK-NEXT: #NO_APP 2631; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2632; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero 2633; CHECK-NEXT: retq 2634 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2635 %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 1, i32 11, i32 12, i32 13> 2636 %3 = bitcast <8 x i16> %2 to <2 x i64> 2637 ret <2 x i64> %3 2638} 2639 2640define <4 x i64> @stack_fold_pmovzxwq_ymm(<8 x i16> %a0) { 2641; CHECK-LABEL: stack_fold_pmovzxwq_ymm: 2642; CHECK: # %bb.0: 2643; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2644; CHECK-NEXT: #APP 2645; CHECK-NEXT: nop 2646; CHECK-NEXT: #NO_APP 2647; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2648; CHECK-NEXT: # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2649; CHECK-NEXT: retq 2650 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2651 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2652 %3 = zext <4 x i16> %2 to <4 x i64> 2653 ret <4 x i64> %3 2654} 2655 2656define <4 x i64> @stack_fold_pmovzxwq_maskz_ymm(<8 x i16> %a0, i8 %mask) { 2657; CHECK-LABEL: stack_fold_pmovzxwq_maskz_ymm: 2658; CHECK: # %bb.0: 2659; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2660; CHECK-NEXT: #APP 2661; CHECK-NEXT: nop 2662; CHECK-NEXT: #NO_APP 2663; CHECK-NEXT: kmovd %edi, %k1 2664; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 16-byte Folded Reload 2665; 
CHECK-NEXT: # ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2666; CHECK-NEXT: retq 2667 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2668 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2669 %3 = zext <4 x i16> %2 to <4 x i64> 2670 %4 = bitcast i8 %mask to <8 x i1> 2671 %5 = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2672 %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> zeroinitializer 2673 ret <4 x i64> %6 2674} 2675 2676define <4 x i64> @stack_fold_pmovzxwq_mask_ymm(<4 x i64> %passthru, <8 x i16> %a0, i8 %mask) { 2677; CHECK-LABEL: stack_fold_pmovzxwq_mask_ymm: 2678; CHECK: # %bb.0: 2679; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2680; CHECK-NEXT: #APP 2681; CHECK-NEXT: nop 2682; CHECK-NEXT: #NO_APP 2683; CHECK-NEXT: kmovd %edi, %k1 2684; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 16-byte Folded Reload 2685; CHECK-NEXT: # ymm0 {%k1} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2686; CHECK-NEXT: retq 2687 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2688 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2689 %3 = zext <4 x i16> %2 to <4 x i64> 2690 %4 = bitcast i8 %mask to <8 x i1> 2691 %5 = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2692 %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %passthru 2693 ret <4 x i64> %6 2694} 2695 2696define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) { 2697; CHECK-LABEL: stack_fold_pmuldq: 2698; CHECK: # %bb.0: 2699; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2700; CHECK-NEXT: #APP 2701; CHECK-NEXT: nop 2702; CHECK-NEXT: #NO_APP 2703; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2704; CHECK-NEXT: retq 2705 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2706 %2 = bitcast <4 x i32> %a0 to <2 x i64> 2707 %3 = bitcast <4 x i32> %a1 to <2 x i64> 2708 %4 = shl <2 x i64> %2, <i64 32, i64 32> 2709 %5 = ashr <2 x i64> %4, <i64 32, i64 32> 2710 %6 = shl <2 x i64> %3, <i64 32, i64 32> 2711 %7 = ashr <2 x i64> %6, <i64 32, i64 32> 2712 %8 = mul <2 x i64> %5, %7 2713 ret <2 x i64> %8 2714} 2715 2716define <4 x i64> @stack_fold_pmuldq_ymm(<8 x i32> %a0, <8 x i32> %a1) { 2717; CHECK-LABEL: stack_fold_pmuldq_ymm: 2718; CHECK: # %bb.0: 2719; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2720; CHECK-NEXT: #APP 2721; CHECK-NEXT: nop 2722; CHECK-NEXT: #NO_APP 2723; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded 
Reload 2724; CHECK-NEXT: retq 2725 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2726 %2 = bitcast <8 x i32> %a0 to <4 x i64> 2727 %3 = bitcast <8 x i32> %a1 to <4 x i64> 2728 %4 = shl <4 x i64> %2, <i64 32, i64 32, i64 32, i64 32> 2729 %5 = ashr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32> 2730 %6 = shl <4 x i64> %3, <i64 32, i64 32, i64 32, i64 32> 2731 %7 = ashr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32> 2732 %8 = mul <4 x i64> %5, %7 2733 ret <4 x i64> %8 2734} 2735 2736define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) { 2737; CHECK-LABEL: stack_fold_pmuludq: 2738; CHECK: # %bb.0: 2739; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2740; CHECK-NEXT: #APP 2741; CHECK-NEXT: nop 2742; CHECK-NEXT: #NO_APP 2743; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2744; CHECK-NEXT: retq 2745 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2746 %2 = bitcast <4 x i32> %a0 to <2 x i64> 2747 %3 = bitcast <4 x i32> %a1 to <2 x i64> 2748 %4 = and <2 x i64> %2, <i64 4294967295, i64 4294967295> 2749 %5 = and <2 x i64> %3, <i64 4294967295, i64 4294967295> 2750 %6 = mul <2 x i64> %4, %5 2751 ret <2 x i64> %6 2752} 2753 2754define <4 x i64> @stack_fold_pmuludq_ymm(<8 x i32> %a0, <8 x i32> %a1) { 2755; CHECK-LABEL: stack_fold_pmuludq_ymm: 2756; CHECK: # %bb.0: 2757; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2758; CHECK-NEXT: #APP 2759; CHECK-NEXT: nop 2760; CHECK-NEXT: #NO_APP 2761; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2762; CHECK-NEXT: retq 2763 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2764 %2 = bitcast <8 x i32> %a0 to <4 x i64> 2765 %3 = bitcast <8 x i32> %a1 to <4 x i64> 2766 %4 = and <4 x i64> %2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 2767 %5 = and <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 2768 %6 = mul <4 x i64> %4, %5 2769 ret <4 x i64> %6 2770} 2771 2772define <4 x i64> @stack_fold_pmuludq_ymm_mask(ptr %passthru, <8 x i32> %a0, <8 x i32> %a1, i8 %mask) { 2773; CHECK-LABEL: stack_fold_pmuludq_ymm_mask: 2774; CHECK: # %bb.0: 2775; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2776; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2777; CHECK-NEXT: #APP 2778; CHECK-NEXT: nop 2779; CHECK-NEXT: #NO_APP 2780; CHECK-NEXT: kmovd %esi, %k1 2781; CHECK-NEXT: vmovdqa (%rdi), %ymm1 2782; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2783; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 {%k1} # 32-byte Folded Reload 2784; CHECK-NEXT: vmovdqa %ymm1, %ymm0 2785; CHECK-NEXT: retq 2786 %1 = tail call <2 
x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2787 %2 = bitcast <8 x i32> %a0 to <4 x i64> 2788 %3 = bitcast <8 x i32> %a1 to <4 x i64> 2789 %4 = and <4 x i64> %2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 2790 %5 = and <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 2791 %6 = mul <4 x i64> %4, %5 2792 %7 = bitcast i8 %mask to <8 x i1> 2793 %8 = shufflevector <8 x i1> %7, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2794 %9 = load <4 x i64>, ptr %passthru 2795 %10 = select <4 x i1> %8, <4 x i64> %6, <4 x i64> %9 2796 ret <4 x i64> %10 2797} 2798 2799define <4 x i64> @stack_fold_pmuludq_ymm_maskz(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { 2800; CHECK-LABEL: stack_fold_pmuludq_ymm_maskz: 2801; CHECK: # %bb.0: 2802; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2803; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2804; CHECK-NEXT: #APP 2805; CHECK-NEXT: nop 2806; CHECK-NEXT: #NO_APP 2807; CHECK-NEXT: kmovd %edi, %k1 2808; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2809; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 2810; CHECK-NEXT: retq 2811 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2812 %2 = bitcast <8 x i32> %a0 to <4 x i64> 2813 %3 = bitcast <8 x i32> %a1 to <4 x i64> 2814 %4 = and <4 x i64> %2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 2815 %5 = and <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 2816 %6 = mul <4 x i64> %4, %5 2817 %7 = bitcast i8 %mask to <8 x i1> 2818 %8 = shufflevector <8 x i1> %7, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2819 %9 = select <4 x i1> %8, <4 x i64> %6, <4 x i64> zeroinitializer 2820 ret <4 x i64> %9 2821} 2822 2823define <4 x i32> @stack_fold_vpopcntd(<4 x i32> %a0) { 2824; CHECK-LABEL: stack_fold_vpopcntd: 2825; CHECK: # %bb.0: 2826; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2827; CHECK-NEXT: #APP 2828; CHECK-NEXT: nop 2829; CHECK-NEXT: #NO_APP 2830; CHECK-NEXT: vpopcntd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2831; CHECK-NEXT: retq 2832 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2833 %2 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a0) 2834 ret <4 x i32> %2 2835} 2836declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readonly 2837 2838define <8 x i32> @stack_fold_vpopcntd_ymm(<8 x i32> %a0) { 2839; CHECK-LABEL: stack_fold_vpopcntd_ymm: 2840; CHECK: # %bb.0: 2841; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2842; CHECK-NEXT: #APP 2843; CHECK-NEXT: nop 2844; CHECK-NEXT: #NO_APP 2845; CHECK-NEXT: vpopcntd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 2846; CHECK-NEXT: retq 2847 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2848 %2 = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a0) 2849 ret <8 x i32> %2 2850} 2851declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readonly 2852 2853define <2 x i64> @stack_fold_vpopcntq(<2 x i64> %a0) { 2854; CHECK-LABEL: stack_fold_vpopcntq: 2855; CHECK: # %bb.0: 2856; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2857; CHECK-NEXT: #APP 2858; CHECK-NEXT: nop 2859; CHECK-NEXT: #NO_APP 2860; CHECK-NEXT: vpopcntq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2861; CHECK-NEXT: retq 2862 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2863 %2 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a0) 2864 ret <2 x i64> %2 2865} 2866declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone 2867 2868define <4 x i64> @stack_fold_vpopcntq_ymm(<4 x i64> %a0) { 2869; CHECK-LABEL: stack_fold_vpopcntq_ymm: 2870; CHECK: # %bb.0: 2871; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2872; CHECK-NEXT: #APP 2873; CHECK-NEXT: nop 2874; CHECK-NEXT: #NO_APP 2875; CHECK-NEXT: vpopcntq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 2876; CHECK-NEXT: retq 2877 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2878 %2 = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a0) 2879 ret <4 x i64> %2 2880} 2881declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone 2882 2883define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) { 2884; CHECK-LABEL: stack_fold_psadbw: 2885; CHECK: # %bb.0: 2886; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2887; CHECK-NEXT: #APP 2888; CHECK-NEXT: nop 2889; CHECK-NEXT: #NO_APP 2890; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2891; CHECK-NEXT: retq 2892 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2893 %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) 2894 ret <2 x i64> %2 2895} 2896declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone 2897 2898define <2 x i64> @stack_fold_psadbw_commute(<16 x i8> %a0, <16 x i8> %a1) { 2899; CHECK-LABEL: stack_fold_psadbw_commute: 2900; CHECK: # %bb.0: 2901; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2902; CHECK-NEXT: #APP 2903; CHECK-NEXT: nop 2904; CHECK-NEXT: #NO_APP 2905; 
CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2906; CHECK-NEXT: retq 2907 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2908 %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a1, <16 x i8> %a0) 2909 ret <2 x i64> %2 2910} 2911 2912define <4 x i64> @stack_fold_psadbw_ymm(<32 x i8> %a0, <32 x i8> %a1) { 2913; CHECK-LABEL: stack_fold_psadbw_ymm: 2914; CHECK: # %bb.0: 2915; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2916; CHECK-NEXT: #APP 2917; CHECK-NEXT: nop 2918; CHECK-NEXT: #NO_APP 2919; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2920; CHECK-NEXT: retq 2921 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2922 %2 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1) 2923 ret <4 x i64> %2 2924} 2925declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone 2926 2927define <4 x i64> @stack_fold_psadbw_ymm_commute(<32 x i8> %a0, <32 x i8> %a1) { 2928; CHECK-LABEL: stack_fold_psadbw_ymm_commute: 2929; CHECK: # %bb.0: 2930; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2931; CHECK-NEXT: #APP 2932; CHECK-NEXT: nop 2933; CHECK-NEXT: #NO_APP 2934; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2935; CHECK-NEXT: retq 2936 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2937 %2 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a1, <32 x i8> %a0) 2938 ret <4 x i64> %2 2939} 2940 2941define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) { 2942; CHECK-LABEL: stack_fold_pshufb: 2943; CHECK: # %bb.0: 2944; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2945; CHECK-NEXT: #APP 2946; CHECK-NEXT: nop 2947; CHECK-NEXT: #NO_APP 2948; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2949; CHECK-NEXT: retq 2950 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2951 %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) 2952 ret <16 x i8> %2 2953} 2954declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone 2955 2956define <16 x i8> @stack_fold_pshufb_mask(ptr %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) { 2957; CHECK-LABEL: stack_fold_pshufb_mask: 2958; CHECK: # %bb.0: 2959; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2960; CHECK-NEXT: #APP 2961; 
CHECK-NEXT: nop 2962; CHECK-NEXT: #NO_APP 2963; CHECK-NEXT: vmovdqa (%rdi), %xmm2 2964; CHECK-NEXT: kmovd %esi, %k1 2965; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 2966; CHECK-NEXT: vmovdqa %xmm2, %xmm0 2967; CHECK-NEXT: retq 2968 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2969 %2 = load <16 x i8>, ptr %passthru 2970 %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) 2971 %4 = bitcast i16 %mask to <16 x i1> 2972 %5 = select <16 x i1> %4, <16 x i8> %3, <16 x i8> %2 2973 ret <16 x i8> %5 2974} 2975 2976define <16 x i8> @stack_fold_pshufb_maskz(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) { 2977; CHECK-LABEL: stack_fold_pshufb_maskz: 2978; CHECK: # %bb.0: 2979; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2980; CHECK-NEXT: #APP 2981; CHECK-NEXT: nop 2982; CHECK-NEXT: #NO_APP 2983; CHECK-NEXT: kmovd %edi, %k1 2984; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload 2985; CHECK-NEXT: retq 2986 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2987 %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) 2988 %3 = bitcast i16 %mask to <16 x i1> 2989 %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> zeroinitializer 2990 ret <16 x i8> %4 2991} 2992 2993define <32 x i8> @stack_fold_pshufb_ymm(<32 x i8> %a0, <32 x i8> %a1) { 2994; CHECK-LABEL: stack_fold_pshufb_ymm: 2995; CHECK: # %bb.0: 2996; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2997; CHECK-NEXT: #APP 2998; CHECK-NEXT: nop 2999; CHECK-NEXT: #NO_APP 3000; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3001; CHECK-NEXT: retq 3002 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3003 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1) 3004 ret <32 x i8> %2 3005} 3006declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) 3007 3008define <32 x i8> @stack_fold_pshufb_ymm_mask(ptr %passthru, <32 x i8> %a0, <32 x i8> %a1, i32 %mask) { 3009; CHECK-LABEL: stack_fold_pshufb_ymm_mask: 3010; CHECK: # %bb.0: 3011; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3012; CHECK-NEXT: #APP 3013; CHECK-NEXT: nop 3014; CHECK-NEXT: #NO_APP 3015; CHECK-NEXT: vmovdqa (%rdi), %ymm2 3016; CHECK-NEXT: kmovd %esi, %k1 3017; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 3018; CHECK-NEXT: vmovdqa %ymm2, %ymm0 3019; CHECK-NEXT: retq 3020 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3021 %2 = load <32 x i8>, ptr %passthru 3022 %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1) 3023 %4 = bitcast i32 %mask to <32 x i1> 3024 %5 = select <32 x i1> %4, <32 x i8> %3, <32 x i8> %2 3025 ret <32 x i8> %5 3026} 3027 3028define <32 x i8> @stack_fold_pshufb_ymm_maskz(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { 3029; CHECK-LABEL: stack_fold_pshufb_ymm_maskz: 3030; CHECK: # %bb.0: 3031; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3032; CHECK-NEXT: #APP 3033; CHECK-NEXT: nop 3034; CHECK-NEXT: #NO_APP 3035; CHECK-NEXT: kmovd %edi, %k1 3036; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 3037; CHECK-NEXT: retq 3038 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3039 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1) 3040 %3 = bitcast i32 %mask to <32 x i1> 3041 %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer 3042 ret <32 x i8> %4 3043} 3044 3045define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) { 3046; CHECK-LABEL: stack_fold_pshufd: 3047; CHECK: # %bb.0: 3048; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3049; CHECK-NEXT: #APP 3050; CHECK-NEXT: nop 3051; CHECK-NEXT: #NO_APP 3052; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3053; CHECK-NEXT: # xmm0 = mem[3,2,1,0] 3054; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 3055; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 3056; CHECK-NEXT: retq 3057 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3058 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 3059 %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1> 3060 ret <4 x i32> %3 3061} 3062 3063define <4 x i32> @stack_fold_pshufd_mask(<4 x i32> %passthru, <4 x i32> %a0, i8 %mask) { 3064; CHECK-LABEL: stack_fold_pshufd_mask: 3065; CHECK: # %bb.0: 3066; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3067; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3068; CHECK-NEXT: #APP 3069; CHECK-NEXT: nop 3070; CHECK-NEXT: #NO_APP 3071; CHECK-NEXT: kmovd %edi, %k1 3072; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3073; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} # 16-byte Folded Reload 3074; CHECK-NEXT: # xmm0 {%k1} = mem[3,2,1,0] 3075; CHECK-NEXT: retq 3076 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3077 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 3078 %3 = bitcast i8 %mask to <8 x i1> 3079 %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3080 %5 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> %passthru 3081 ret <4 x i32> %5 3082} 3083 3084define <4 x i32> @stack_fold_pshufd_maskz(<4 x i32> %a0, i8 %mask) { 3085; CHECK-LABEL: stack_fold_pshufd_maskz: 3086; CHECK: # %bb.0: 3087; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3088; CHECK-NEXT: #APP 3089; CHECK-NEXT: nop 3090; CHECK-NEXT: #NO_APP 3091; CHECK-NEXT: kmovd %edi, %k1 3092; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload 3093; CHECK-NEXT: # xmm0 {%k1} {z} = mem[3,2,1,0] 3094; CHECK-NEXT: retq 3095 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3096 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 3097 %3 = bitcast i8 %mask to <8 x i1> 3098 %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3099 %5 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer 3100 ret <4 x i32> %5 3101} 3102 3103define <8 x i32> @stack_fold_pshufd_ymm(<8 x i32> %a0) { 3104; CHECK-LABEL: stack_fold_pshufd_ymm: 3105; CHECK: # %bb.0: 3106; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3107; CHECK-NEXT: #APP 3108; CHECK-NEXT: nop 3109; CHECK-NEXT: #NO_APP 3110; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 3111; CHECK-NEXT: # ymm0 = mem[3,2,1,0,7,6,5,4] 3112; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 3113; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0 3114; CHECK-NEXT: retq 3115 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3116 %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> 3117 %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 3118 ret <8 x i32> %3 3119} 3120 3121define <8 x i32> @stack_fold_pshufd_ymm_mask(<8 x i32> %passthru, <8 x i32> %a0, i8 %mask) { 3122; CHECK-LABEL: stack_fold_pshufd_ymm_mask: 3123; CHECK: # %bb.0: 3124; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3125; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3126; CHECK-NEXT: #APP 3127; CHECK-NEXT: nop 3128; CHECK-NEXT: #NO_APP 3129; CHECK-NEXT: kmovd %edi, %k1 3130; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3131; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 32-byte Folded Reload 3132; CHECK-NEXT: # ymm0 {%k1} = mem[3,2,1,0,7,6,5,4] 3133; CHECK-NEXT: retq 3134 %1 = tail call <2 x 
i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3135 %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> 3136 %3 = bitcast i8 %mask to <8 x i1> 3137 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %passthru 3138 ret <8 x i32> %4 3139} 3140 3141define <8 x i32> @stack_fold_pshufd_ymm_maskz(<8 x i32> %a0, i8 %mask) { 3142; CHECK-LABEL: stack_fold_pshufd_ymm_maskz: 3143; CHECK: # %bb.0: 3144; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3145; CHECK-NEXT: #APP 3146; CHECK-NEXT: nop 3147; CHECK-NEXT: #NO_APP 3148; CHECK-NEXT: kmovd %edi, %k1 3149; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload 3150; CHECK-NEXT: # ymm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4] 3151; CHECK-NEXT: retq 3152 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3153 %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> 3154 %3 = bitcast i8 %mask to <8 x i1> 3155 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer 3156 ret <8 x i32> %4 3157} 3158 3159define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) { 3160; CHECK-LABEL: stack_fold_pshufhw: 3161; CHECK: # %bb.0: 3162; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3163; CHECK-NEXT: #APP 3164; CHECK-NEXT: nop 3165; CHECK-NEXT: #NO_APP 3166; CHECK-NEXT: vpshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3167; CHECK-NEXT: # xmm0 = mem[0,1,2,3,7,6,4,4] 3168; CHECK-NEXT: retq 3169 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3170 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4> 3171 ret <8 x i16> %2 3172} 3173 3174define <8 x i16> @stack_fold_pshufhw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) { 3175; CHECK-LABEL: stack_fold_pshufhw_mask: 3176; CHECK: # %bb.0: 3177; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3178; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3179; CHECK-NEXT: #APP 3180; CHECK-NEXT: nop 3181; CHECK-NEXT: #NO_APP 3182; CHECK-NEXT: kmovd %edi, %k1 3183; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3184; CHECK-NEXT: vpshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} # 16-byte Folded Reload 3185; CHECK-NEXT: # xmm0 {%k1} = mem[0,1,2,3,7,6,4,4] 3186; CHECK-NEXT: retq 3187 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3188 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4> 3189 %3 = bitcast i8 %mask to <8 x i1> 3190 %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %passthru 3191 ret <8 x i16> %4 3192} 3193 3194define <8 x i16> @stack_fold_pshufhw_maskz(<8 x i16> %a0, i8 %mask) { 3195; CHECK-LABEL: stack_fold_pshufhw_maskz: 3196; CHECK: # %bb.0: 3197; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3198; CHECK-NEXT: #APP 3199; CHECK-NEXT: nop 3200; CHECK-NEXT: #NO_APP 3201; CHECK-NEXT: kmovd %edi, %k1 3202; CHECK-NEXT: vpshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload 3203; CHECK-NEXT: # xmm0 {%k1} {z} = mem[0,1,2,3,7,6,4,4] 3204; CHECK-NEXT: retq 3205 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3206 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4> 3207 %3 = bitcast i8 %mask to <8 x i1> 3208 %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer 3209 ret <8 x i16> %4 3210} 3211 3212define <16 x i16> @stack_fold_pshufhw_ymm(<16 x i16> %a0) { 3213; CHECK-LABEL: stack_fold_pshufhw_ymm: 3214; CHECK: # %bb.0: 3215; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3216; CHECK-NEXT: #APP 3217; CHECK-NEXT: nop 3218; CHECK-NEXT: #NO_APP 3219; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 3220; CHECK-NEXT: # ymm0 = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] 3221; CHECK-NEXT: retq 3222 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3223 %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12> 3224 ret <16 x i16> %2 3225} 3226 3227define <16 x i16> @stack_fold_pshufhw_ymm_mask(<16 x i16> %passthru, <16 x i16> %a0, i16 %mask) { 3228; CHECK-LABEL: stack_fold_pshufhw_ymm_mask: 3229; CHECK: # %bb.0: 3230; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3231; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3232; CHECK-NEXT: #APP 3233; CHECK-NEXT: nop 3234; CHECK-NEXT: #NO_APP 3235; CHECK-NEXT: kmovd %edi, %k1 3236; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3237; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 32-byte Folded Reload 3238; CHECK-NEXT: # ymm0 {%k1} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] 3239; CHECK-NEXT: retq 3240 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3241 %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12> 3242 %3 = bitcast i16 %mask to <16 x i1> 3243 %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %passthru 3244 ret <16 x i16> %4 3245} 3246 3247define <16 x i16> @stack_fold_pshufhw_ymm_maskz(<16 x i16> %a0, i16 %mask) { 3248; CHECK-LABEL: stack_fold_pshufhw_ymm_maskz: 3249; CHECK: # %bb.0: 3250; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3251; CHECK-NEXT: #APP 3252; CHECK-NEXT: nop 3253; CHECK-NEXT: #NO_APP 3254; CHECK-NEXT: kmovd %edi, %k1 3255; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload 3256; CHECK-NEXT: # ymm0 {%k1} {z} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] 3257; CHECK-NEXT: retq 3258 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3259 %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12> 3260 %3 = bitcast i16 %mask to <16 x i1> 3261 %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer 3262 ret <16 x i16> %4 3263} 3264 3265define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) { 3266; CHECK-LABEL: stack_fold_pshuflw: 3267; CHECK: # %bb.0: 3268; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3269; CHECK-NEXT: #APP 3270; CHECK-NEXT: nop 3271; CHECK-NEXT: #NO_APP 3272; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3273; CHECK-NEXT: # xmm0 = mem[3,2,1,0,4,5,6,7] 3274; CHECK-NEXT: retq 3275 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3276 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> 3277 ret <8 x i16> %2 3278} 3279 3280define <8 x i16> @stack_fold_pshuflw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) { 3281; CHECK-LABEL: stack_fold_pshuflw_mask: 3282; CHECK: # %bb.0: 3283; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3284; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3285; CHECK-NEXT: #APP 3286; CHECK-NEXT: nop 3287; CHECK-NEXT: #NO_APP 3288; CHECK-NEXT: kmovd %edi, %k1 3289; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3290; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} # 16-byte Folded Reload 3291; CHECK-NEXT: # xmm0 {%k1} = mem[3,2,1,0,4,5,6,7] 3292; CHECK-NEXT: retq 3293 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3294 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> 3295 %3 = bitcast i8 %mask to <8 x i1> 3296 %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %passthru 3297 ret <8 x i16> %4 3298} 3299 3300define <8 x i16> @stack_fold_pshuflw_maskz(<8 x i16> %a0, i8 %mask) { 3301; CHECK-LABEL: stack_fold_pshuflw_maskz: 3302; CHECK: # %bb.0: 3303; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3304; CHECK-NEXT: #APP 3305; CHECK-NEXT: nop 3306; CHECK-NEXT: #NO_APP 3307; CHECK-NEXT: kmovd %edi, %k1 3308; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload 3309; CHECK-NEXT: # xmm0 {%k1} {z} = mem[3,2,1,0,4,5,6,7] 3310; CHECK-NEXT: retq 3311 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3312 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> 3313 %3 = bitcast i8 %mask to <8 x i1> 3314 %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer 3315 ret <8 x i16> %4 3316} 3317 3318define <16 x i16> @stack_fold_pshuflw_ymm(<16 x i16> %a0) { 3319; CHECK-LABEL: stack_fold_pshuflw_ymm: 3320; CHECK: # %bb.0: 3321; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3322; CHECK-NEXT: #APP 3323; CHECK-NEXT: nop 3324; CHECK-NEXT: #NO_APP 3325; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 3326; CHECK-NEXT: # ymm0 = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] 3327; CHECK-NEXT: retq 3328 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3329 %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15> 3330 ret <16 x i16> %2 3331} 3332 3333define <16 x i16> @stack_fold_pshuflw_ymm_mask(<16 x i16> %passthru, <16 x i16> %a0, i16 %mask) { 3334; CHECK-LABEL: stack_fold_pshuflw_ymm_mask: 3335; CHECK: # %bb.0: 3336; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3337; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3338; CHECK-NEXT: #APP 3339; CHECK-NEXT: nop 3340; CHECK-NEXT: #NO_APP 3341; CHECK-NEXT: kmovd %edi, %k1 3342; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3343; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 32-byte Folded Reload 3344; CHECK-NEXT: # ymm0 {%k1} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] 3345; CHECK-NEXT: retq 3346 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3347 %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15> 3348 %3 = bitcast i16 %mask to <16 x i1> 3349 %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %passthru 3350 ret <16 x i16> %4 3351} 3352 3353define <16 x i16> @stack_fold_pshuflw_ymm_maskz(<16 x i16> %a0, i16 %mask) { 3354; CHECK-LABEL: stack_fold_pshuflw_ymm_maskz: 3355; CHECK: # %bb.0: 3356; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3357; CHECK-NEXT: #APP 3358; CHECK-NEXT: nop 3359; CHECK-NEXT: #NO_APP 3360; CHECK-NEXT: kmovd %edi, %k1 3361; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload 3362; CHECK-NEXT: # ymm0 {%k1} {z} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] 3363; CHECK-NEXT: retq 3364 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3365 %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15> 3366 %3 = bitcast i16 %mask to <16 x i1> 3367 %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer 3368 ret <16 x i16> %4 3369} 3370 3371define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) { 3372; CHECK-LABEL: stack_fold_pslld: 3373; CHECK: # %bb.0: 3374; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3375; CHECK-NEXT: #APP 3376; CHECK-NEXT: nop 3377; CHECK-NEXT: #NO_APP 3378; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3379; CHECK-NEXT: retq 3380 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3381 %2 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1) 3382 ret <4 x i32> %2 3383} 3384declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone 3385 3386define <8 x i32> @stack_fold_pslld_ymm(<8 x i32> %a0, <4 x i32> %a1) { 3387; CHECK-LABEL: stack_fold_pslld_ymm: 3388; CHECK: # %bb.0: 3389; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3390; CHECK-NEXT: #APP 3391; CHECK-NEXT: nop 3392; CHECK-NEXT: #NO_APP 3393; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 3394; CHECK-NEXT: retq 3395 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3396 %2 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1) 3397 ret <8 x i32> %2 
3398} 3399declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone 3400 3401define <16 x i8> @stack_fold_pslldq(<16 x i8> %a) { 3402; CHECK-LABEL: stack_fold_pslldq: 3403; CHECK: # %bb.0: 3404; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3405; CHECK-NEXT: #APP 3406; CHECK-NEXT: nop 3407; CHECK-NEXT: #NO_APP 3408; CHECK-NEXT: vpslldq $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3409; CHECK-NEXT: # xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[0,1,2,3] 3410; CHECK-NEXT: retq 3411 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3412 %2 = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 17, i32 18, i32 19> 3413 ret <16 x i8> %2 3414} 3415 3416define <32 x i8> @stack_fold_pslldq_ymm(<32 x i8> %a) { 3417; CHECK-LABEL: stack_fold_pslldq_ymm: 3418; CHECK: # %bb.0: 3419; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3420; CHECK-NEXT: #APP 3421; CHECK-NEXT: nop 3422; CHECK-NEXT: #NO_APP 3423; CHECK-NEXT: vpslldq $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 3424; CHECK-NEXT: # ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[16] 3425; CHECK-NEXT: retq 3426 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3427 %2 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 32, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 48> 3428 ret <32 x i8> %2 3429} 3430 3431define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) { 3432; CHECK-LABEL: stack_fold_psllq: 3433; CHECK: # %bb.0: 3434; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3435; CHECK-NEXT: #APP 3436; CHECK-NEXT: nop 3437; CHECK-NEXT: #NO_APP 3438; CHECK-NEXT: vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3439; CHECK-NEXT: retq 3440 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3441 %2 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) 3442 ret <2 x i64> %2 3443} 3444declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone 3445 3446define <4 x i64> @stack_fold_psllq_ymm(<4 x i64> %a0, <2 x i64> %a1) { 3447; CHECK-LABEL: stack_fold_psllq_ymm: 3448; CHECK: # %bb.0: 3449; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3450; CHECK-NEXT: #APP 3451; CHECK-NEXT: nop 3452; 
CHECK-NEXT: #NO_APP 3453; CHECK-NEXT: vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 3454; CHECK-NEXT: retq 3455 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3456 %2 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1) 3457 ret <4 x i64> %2 3458} 3459declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone 3460 3461define <4 x i32> @stack_fold_psllvd(<4 x i32> %a0, <4 x i32> %a1) { 3462; CHECK-LABEL: stack_fold_psllvd: 3463; CHECK: # %bb.0: 3464; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3465; CHECK-NEXT: #APP 3466; CHECK-NEXT: nop 3467; CHECK-NEXT: #NO_APP 3468; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3469; CHECK-NEXT: retq 3470 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3471 %2 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1) 3472 ret <4 x i32> %2 3473} 3474declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone 3475 3476define <8 x i32> @stack_fold_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1) { 3477; CHECK-LABEL: stack_fold_psllvd_ymm: 3478; CHECK: # %bb.0: 3479; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3480; CHECK-NEXT: #APP 3481; CHECK-NEXT: nop 3482; CHECK-NEXT: #NO_APP 3483; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3484; CHECK-NEXT: retq 3485 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3486 %2 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1) 3487 ret <8 x i32> %2 3488} 3489declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone 3490 3491define <2 x i64> @stack_fold_psllvq(<2 x i64> %a0, <2 x i64> %a1) { 3492; CHECK-LABEL: stack_fold_psllvq: 3493; CHECK: # %bb.0: 3494; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3495; CHECK-NEXT: #APP 3496; CHECK-NEXT: nop 3497; CHECK-NEXT: #NO_APP 3498; CHECK-NEXT: vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3499; CHECK-NEXT: retq 3500 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3501 %2 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1) 3502 ret <2 x i64> %2 3503} 3504declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone 3505 3506define <4 x i64> @stack_fold_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1) { 3507; CHECK-LABEL: stack_fold_psllvq_ymm: 3508; CHECK: # 
%bb.0: 3509; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3510; CHECK-NEXT: #APP 3511; CHECK-NEXT: nop 3512; CHECK-NEXT: #NO_APP 3513; CHECK-NEXT: vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3514; CHECK-NEXT: retq 3515 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3516 %2 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1) 3517 ret <4 x i64> %2 3518} 3519declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone 3520 3521define <8 x i16> @stack_fold_psllvw(<8 x i16> %a0, <8 x i16> %a1) { 3522; CHECK-LABEL: stack_fold_psllvw: 3523; CHECK: # %bb.0: 3524; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3525; CHECK-NEXT: #APP 3526; CHECK-NEXT: nop 3527; CHECK-NEXT: #NO_APP 3528; CHECK-NEXT: vpsllvw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3529; CHECK-NEXT: retq 3530 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3531 %2 = call <8 x i16> @llvm.x86.avx512.psllv.w.128(<8 x i16> %a0, <8 x i16> %a1) 3532 ret <8 x i16> %2 3533} 3534declare <8 x i16> @llvm.x86.avx512.psllv.w.128(<8 x i16>, <8 x i16>) nounwind readnone 3535 3536define <16 x i16> @stack_fold_psllvw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 3537; CHECK-LABEL: stack_fold_psllvw_ymm: 3538; CHECK: # %bb.0: 3539; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3540; CHECK-NEXT: #APP 3541; CHECK-NEXT: nop 3542; CHECK-NEXT: #NO_APP 3543; CHECK-NEXT: vpsllvw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3544; CHECK-NEXT: retq 3545 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3546 %2 = call <16 x i16> @llvm.x86.avx512.psllv.w.256(<16 x i16> %a0, <16 x i16> %a1) 3547 ret <16 x i16> %2 3548} 3549declare <16 x i16> @llvm.x86.avx512.psllv.w.256(<16 x i16>, <16 x i16>) nounwind readnone 3550 3551define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) { 3552; CHECK-LABEL: stack_fold_psllw: 3553; CHECK: # %bb.0: 3554; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3555; CHECK-NEXT: #APP 3556; CHECK-NEXT: nop 3557; CHECK-NEXT: #NO_APP 3558; CHECK-NEXT: vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3559; CHECK-NEXT: retq 3560 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3561 %2 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) 3562 ret <8 x i16> %2 3563} 3564declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x 
i16>) nounwind readnone 3565 3566define <16 x i16> @stack_fold_psllw_ymm(<16 x i16> %a0, <8 x i16> %a1) { 3567; CHECK-LABEL: stack_fold_psllw_ymm: 3568; CHECK: # %bb.0: 3569; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3570; CHECK-NEXT: #APP 3571; CHECK-NEXT: nop 3572; CHECK-NEXT: #NO_APP 3573; CHECK-NEXT: vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 3574; CHECK-NEXT: retq 3575 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3576 %2 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1) 3577 ret <16 x i16> %2 3578} 3579declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone 3580 3581define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) { 3582; CHECK-LABEL: stack_fold_psrad: 3583; CHECK: # %bb.0: 3584; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3585; CHECK-NEXT: #APP 3586; CHECK-NEXT: nop 3587; CHECK-NEXT: #NO_APP 3588; CHECK-NEXT: vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3589; CHECK-NEXT: retq 3590 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3591 %2 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) 3592 ret <4 x i32> %2 3593} 3594declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone 3595 3596define <8 x i32> @stack_fold_psrad_ymm(<8 x i32> %a0, <4 x i32> %a1) { 3597; CHECK-LABEL: stack_fold_psrad_ymm: 3598; CHECK: # %bb.0: 3599; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3600; CHECK-NEXT: #APP 3601; CHECK-NEXT: nop 3602; CHECK-NEXT: #NO_APP 3603; CHECK-NEXT: vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 3604; CHECK-NEXT: retq 3605 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3606 %2 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1) 3607 ret <8 x i32> %2 3608} 3609declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone 3610 3611define <2 x i64> @stack_fold_psraq(<2 x i64> %a0, <2 x i64> %a1) { 3612; CHECK-LABEL: stack_fold_psraq: 3613; CHECK: # %bb.0: 3614; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3615; CHECK-NEXT: #APP 3616; CHECK-NEXT: nop 3617; CHECK-NEXT: #NO_APP 3618; CHECK-NEXT: vpsraq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3619; CHECK-NEXT: retq 3620 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3621 %2 = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 
x i64> %a0, <2 x i64> %a1) 3622 ret <2 x i64> %2 3623} 3624declare <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64>, <2 x i64>) nounwind readnone 3625 3626define <4 x i64> @stack_fold_psraq_ymm(<4 x i64> %a0, <2 x i64> %a1) { 3627; CHECK-LABEL: stack_fold_psraq_ymm: 3628; CHECK: # %bb.0: 3629; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3630; CHECK-NEXT: #APP 3631; CHECK-NEXT: nop 3632; CHECK-NEXT: #NO_APP 3633; CHECK-NEXT: vpsraq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 3634; CHECK-NEXT: retq 3635 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3636 %2 = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) 3637 ret <4 x i64> %2 3638} 3639declare <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64>, <2 x i64>) nounwind readnone 3640 3641define <4 x i32> @stack_fold_psravd(<4 x i32> %a0, <4 x i32> %a1) { 3642; CHECK-LABEL: stack_fold_psravd: 3643; CHECK: # %bb.0: 3644; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3645; CHECK-NEXT: #APP 3646; CHECK-NEXT: nop 3647; CHECK-NEXT: #NO_APP 3648; CHECK-NEXT: vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3649; CHECK-NEXT: retq 3650 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3651 %2 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1) 3652 ret <4 x i32> %2 3653} 3654declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone 3655 3656define <8 x i32> @stack_fold_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1) { 3657; CHECK-LABEL: stack_fold_psravd_ymm: 3658; CHECK: # %bb.0: 3659; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3660; CHECK-NEXT: #APP 3661; CHECK-NEXT: nop 3662; CHECK-NEXT: #NO_APP 3663; CHECK-NEXT: vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3664; CHECK-NEXT: retq 3665 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3666 %2 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1) 3667 ret <8 x i32> %2 3668} 3669declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone 3670 3671define <2 x i64> @stack_fold_psravq(<2 x i64> %a0, <2 x i64> %a1) { 3672; CHECK-LABEL: stack_fold_psravq: 3673; CHECK: # %bb.0: 3674; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3675; CHECK-NEXT: #APP 3676; CHECK-NEXT: nop 3677; CHECK-NEXT: #NO_APP 3678; CHECK-NEXT: vpsravq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3679; CHECK-NEXT: retq 3680 %1 = tail call <2 x i64> asm sideeffect "nop",
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3681 %2 = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1) 3682 ret <2 x i64> %2 3683} 3684declare <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64>, <2 x i64>) nounwind readnone 3685 3686define <4 x i64> @stack_fold_psravq_ymm(<4 x i64> %a0, <4 x i64> %a1) { 3687; CHECK-LABEL: stack_fold_psravq_ymm: 3688; CHECK: # %bb.0: 3689; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3690; CHECK-NEXT: #APP 3691; CHECK-NEXT: nop 3692; CHECK-NEXT: #NO_APP 3693; CHECK-NEXT: vpsravq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3694; CHECK-NEXT: retq 3695 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3696 %2 = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1) 3697 ret <4 x i64> %2 3698} 3699declare <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64>, <4 x i64>) nounwind readnone 3700 3701define <8 x i16> @stack_fold_psravw(<8 x i16> %a0, <8 x i16> %a1) { 3702; CHECK-LABEL: stack_fold_psravw: 3703; CHECK: # %bb.0: 3704; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3705; CHECK-NEXT: #APP 3706; CHECK-NEXT: nop 3707; CHECK-NEXT: #NO_APP 3708; CHECK-NEXT: vpsravw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3709; CHECK-NEXT: retq 3710 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3711 %2 = call <8 x i16> @llvm.x86.avx512.psrav.w.128(<8 x i16> %a0, <8 x i16> %a1) 3712 ret <8 x i16> %2 3713} 3714declare <8 x i16> @llvm.x86.avx512.psrav.w.128(<8 x i16>, <8 x i16>) nounwind readnone 3715 3716define <16 x i16> @stack_fold_psravw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 3717; CHECK-LABEL: stack_fold_psravw_ymm: 3718; CHECK: # %bb.0: 3719; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3720; CHECK-NEXT: #APP 3721; CHECK-NEXT: nop 3722; CHECK-NEXT: #NO_APP 3723; CHECK-NEXT: vpsravw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3724; CHECK-NEXT: retq 3725 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3726 %2 = call <16 x i16> @llvm.x86.avx512.psrav.w.256(<16 x i16> %a0, <16 x i16> %a1) 3727 ret <16 x i16> %2 3728} 3729declare <16 x i16> @llvm.x86.avx512.psrav.w.256(<16 x i16>, <16 x i16>) nounwind readnone 3730 3731define <8 x i16> @stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) { 3732; CHECK-LABEL: stack_fold_psraw: 3733; CHECK: # %bb.0: 3734; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3735; CHECK-NEXT: #APP 3736; CHECK-NEXT: nop 3737; 
CHECK-NEXT: #NO_APP 3738; CHECK-NEXT: vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3739; CHECK-NEXT: retq 3740 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3741 %2 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) 3742 ret <8 x i16> %2 3743} 3744declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone 3745 3746define <16 x i16> @stack_fold_psraw_ymm(<16 x i16> %a0, <8 x i16> %a1) { 3747; CHECK-LABEL: stack_fold_psraw_ymm: 3748; CHECK: # %bb.0: 3749; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3750; CHECK-NEXT: #APP 3751; CHECK-NEXT: nop 3752; CHECK-NEXT: #NO_APP 3753; CHECK-NEXT: vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 3754; CHECK-NEXT: retq 3755 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3756 %2 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1) 3757 ret <16 x i16> %2 3758} 3759declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone 3760 3761define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) { 3762; CHECK-LABEL: stack_fold_psrld: 3763; CHECK: # %bb.0: 3764; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3765; CHECK-NEXT: #APP 3766; CHECK-NEXT: nop 3767; CHECK-NEXT: #NO_APP 3768; CHECK-NEXT: vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3769; CHECK-NEXT: retq 3770 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3771 %2 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) 3772 ret <4 x i32> %2 3773} 3774declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone 3775 3776define <8 x i32> @stack_fold_psrld_ymm(<8 x i32> %a0, <4 x i32> %a1) { 3777; CHECK-LABEL: stack_fold_psrld_ymm: 3778; CHECK: # %bb.0: 3779; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3780; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3781; CHECK-NEXT: #APP 3782; CHECK-NEXT: nop 3783; CHECK-NEXT: #NO_APP 3784; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3785; CHECK-NEXT: vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 3786; CHECK-NEXT: retq 3787 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3788 %2 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1) 3789 ret <8 x i32> %2 3790} 3791declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind 
readnone 3792 3793define <16 x i8> @stack_fold_psrldq(<16 x i8> %a) { 3794; CHECK-LABEL: stack_fold_psrldq: 3795; CHECK: # %bb.0: 3796; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3797; CHECK-NEXT: #APP 3798; CHECK-NEXT: nop 3799; CHECK-NEXT: #NO_APP 3800; CHECK-NEXT: vpsrldq $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3801; CHECK-NEXT: # xmm0 = mem[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3802; CHECK-NEXT: retq 3803 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3804 %2 = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 29, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> 3805 ret <16 x i8> %2 3806} 3807 3808define <32 x i8> @stack_fold_psrldq_ymm(<32 x i8> %a) { 3809; CHECK-LABEL: stack_fold_psrldq_ymm: 3810; CHECK: # %bb.0: 3811; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3812; CHECK-NEXT: #APP 3813; CHECK-NEXT: nop 3814; CHECK-NEXT: #NO_APP 3815; CHECK-NEXT: vpsrldq $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 3816; CHECK-NEXT: # ymm0 = mem[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3817; CHECK-NEXT: retq 3818 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3819 %2 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 63, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> 3820 ret <32 x i8> %2 3821} 3822 3823define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) { 3824; CHECK-LABEL: stack_fold_psrlq: 3825; CHECK: # %bb.0: 3826; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3827; CHECK-NEXT: #APP 3828; CHECK-NEXT: nop 3829; CHECK-NEXT: #NO_APP 3830; CHECK-NEXT: vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3831; CHECK-NEXT: retq 3832 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3833 %2 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) 3834 ret <2 x i64> %2 3835} 3836declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone 3837 3838define <4 x i64> @stack_fold_psrlq_ymm(<4 x i64> %a0, <2 x i64> %a1) { 3839; CHECK-LABEL: stack_fold_psrlq_ymm: 3840; CHECK: # %bb.0: 3841; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3842; CHECK-NEXT: #APP 3843; CHECK-NEXT: nop 3844; CHECK-NEXT: #NO_APP 3845; CHECK-NEXT: vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0,
%ymm0 # 16-byte Folded Reload 3846; CHECK-NEXT: retq 3847 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3848 %2 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1) 3849 ret <4 x i64> %2 3850} 3851declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone 3852 3853define <4 x i32> @stack_fold_psrlvd(<4 x i32> %a0, <4 x i32> %a1) { 3854; CHECK-LABEL: stack_fold_psrlvd: 3855; CHECK: # %bb.0: 3856; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3857; CHECK-NEXT: #APP 3858; CHECK-NEXT: nop 3859; CHECK-NEXT: #NO_APP 3860; CHECK-NEXT: vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3861; CHECK-NEXT: retq 3862 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3863 %2 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1) 3864 ret <4 x i32> %2 3865} 3866declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone 3867 3868define <8 x i32> @stack_fold_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1) { 3869; CHECK-LABEL: stack_fold_psrlvd_ymm: 3870; CHECK: # %bb.0: 3871; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3872; CHECK-NEXT: #APP 3873; CHECK-NEXT: nop 3874; CHECK-NEXT: #NO_APP 3875; CHECK-NEXT: vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3876; CHECK-NEXT: retq 3877 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3878 %2 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1) 3879 ret <8 x i32> %2 3880} 3881declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone 3882 3883define <2 x i64> @stack_fold_psrlvq(<2 x i64> %a0, <2 x i64> %a1) { 3884; CHECK-LABEL: stack_fold_psrlvq: 3885; CHECK: # %bb.0: 3886; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3887; CHECK-NEXT: #APP 3888; CHECK-NEXT: nop 3889; CHECK-NEXT: #NO_APP 3890; CHECK-NEXT: vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3891; CHECK-NEXT: retq 3892 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3893 %2 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1) 3894 ret <2 x i64> %2 3895} 3896declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone 3897 3898define <4 x i64> @stack_fold_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1) { 3899; CHECK-LABEL: stack_fold_psrlvq_ymm: 3900; CHECK: # %bb.0: 3901; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
3902; CHECK-NEXT: #APP 3903; CHECK-NEXT: nop 3904; CHECK-NEXT: #NO_APP 3905; CHECK-NEXT: vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3906; CHECK-NEXT: retq 3907 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3908 %2 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1) 3909 ret <4 x i64> %2 3910} 3911declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone 3912 3913define <8 x i16> @stack_fold_psrlvw(<8 x i16> %a0, <8 x i16> %a1) { 3914; CHECK-LABEL: stack_fold_psrlvw: 3915; CHECK: # %bb.0: 3916; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3917; CHECK-NEXT: #APP 3918; CHECK-NEXT: nop 3919; CHECK-NEXT: #NO_APP 3920; CHECK-NEXT: vpsrlvw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3921; CHECK-NEXT: retq 3922 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3923 %2 = call <8 x i16> @llvm.x86.avx512.psrlv.w.128(<8 x i16> %a0, <8 x i16> %a1) 3924 ret <8 x i16> %2 3925} 3926declare <8 x i16> @llvm.x86.avx512.psrlv.w.128(<8 x i16>, <8 x i16>) nounwind readnone 3927 3928define <16 x i16> @stack_fold_psrlvw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 3929; CHECK-LABEL: stack_fold_psrlvw_ymm: 3930; CHECK: # %bb.0: 3931; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3932; CHECK-NEXT: #APP 3933; CHECK-NEXT: nop 3934; CHECK-NEXT: #NO_APP 3935; CHECK-NEXT: vpsrlvw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3936; CHECK-NEXT: retq 3937 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3938 %2 = call <16 x i16> @llvm.x86.avx512.psrlv.w.256(<16 x i16> %a0, <16 x i16> %a1) 3939 ret <16 x i16> %2 3940} 3941declare <16 x i16> @llvm.x86.avx512.psrlv.w.256(<16 x i16>, <16 x i16>) nounwind readnone 3942 3943define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) { 3944; CHECK-LABEL: stack_fold_psrlw: 3945; CHECK: # %bb.0: 3946; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3947; CHECK-NEXT: #APP 3948; CHECK-NEXT: nop 3949; CHECK-NEXT: #NO_APP 3950; CHECK-NEXT: vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3951; CHECK-NEXT: retq 3952 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3953 %2 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) 3954 ret <8 x i16> %2 3955} 3956declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone 3957 3958define <16 x i16> @stack_fold_psrlw_ymm(<16 x i16> 
%a0, <8 x i16> %a1) { 3959; CHECK-LABEL: stack_fold_psrlw_ymm: 3960; CHECK: # %bb.0: 3961; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3962; CHECK-NEXT: #APP 3963; CHECK-NEXT: nop 3964; CHECK-NEXT: #NO_APP 3965; CHECK-NEXT: vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 3966; CHECK-NEXT: retq 3967 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3968 %2 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1) 3969 ret <16 x i16> %2 3970} 3971declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone 3972 3973define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) { 3974; CHECK-LABEL: stack_fold_psubb: 3975; CHECK: # %bb.0: 3976; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3977; CHECK-NEXT: #APP 3978; CHECK-NEXT: nop 3979; CHECK-NEXT: #NO_APP 3980; CHECK-NEXT: vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3981; CHECK-NEXT: retq 3982 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3983 %2 = sub <16 x i8> %a0, %a1 3984 ret <16 x i8> %2 3985} 3986 3987define <32 x i8> @stack_fold_psubb_ymm(<32 x i8> %a0, <32 x i8> %a1) { 3988; CHECK-LABEL: stack_fold_psubb_ymm: 3989; CHECK: # %bb.0: 3990; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3991; CHECK-NEXT: #APP 3992; CHECK-NEXT: nop 3993; CHECK-NEXT: #NO_APP 3994; CHECK-NEXT: vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3995; CHECK-NEXT: retq 3996 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3997 %2 = sub <32 x i8> %a0, %a1 3998 ret <32 x i8> %2 3999} 4000 4001define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) { 4002; CHECK-LABEL: stack_fold_psubd: 4003; CHECK: # %bb.0: 4004; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4005; CHECK-NEXT: #APP 4006; CHECK-NEXT: nop 4007; CHECK-NEXT: #NO_APP 4008; CHECK-NEXT: vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4009; CHECK-NEXT: retq 4010 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4011 %2 = sub <4 x i32> %a0, %a1 4012 ret <4 x i32> %2 4013} 4014 4015define <8 x i32> @stack_fold_psubd_ymm(<8 x i32> %a0, <8 x i32> %a1) { 4016; CHECK-LABEL: stack_fold_psubd_ymm: 4017; CHECK: # %bb.0: 4018; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4019; CHECK-NEXT: #APP 4020; CHECK-NEXT: nop 4021; CHECK-NEXT: #NO_APP 4022; CHECK-NEXT: vpsubd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4023; CHECK-NEXT: retq 4024 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4025 %2 = sub <8 x i32> %a0, %a1 4026 ret <8 x i32> %2 4027} 4028 4029define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) { 4030; CHECK-LABEL: stack_fold_psubq: 4031; CHECK: # %bb.0: 4032; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4033; CHECK-NEXT: #APP 4034; CHECK-NEXT: nop 4035; CHECK-NEXT: #NO_APP 4036; CHECK-NEXT: vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4037; CHECK-NEXT: retq 4038 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4039 %2 = sub <2 x i64> %a0, %a1 4040 ret <2 x i64> %2 4041} 4042 4043define <4 x i64> @stack_fold_psubq_ymm(<4 x i64> %a0, <4 x i64> %a1) { 4044; CHECK-LABEL: stack_fold_psubq_ymm: 4045; CHECK: # %bb.0: 4046; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4047; CHECK-NEXT: #APP 4048; CHECK-NEXT: nop 4049; CHECK-NEXT: #NO_APP 4050; CHECK-NEXT: vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4051; CHECK-NEXT: retq 4052 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4053 %2 = sub <4 x i64> %a0, %a1 4054 ret <4 x i64> %2 4055} 4056 4057define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) { 4058; CHECK-LABEL: stack_fold_psubsb: 4059; CHECK: # %bb.0: 4060; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4061; CHECK-NEXT: #APP 4062; CHECK-NEXT: nop 4063; CHECK-NEXT: #NO_APP 4064; CHECK-NEXT: vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4065; CHECK-NEXT: retq 4066 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4067 %2 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1) 4068 ret <16 x i8> %2 4069} 4070declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone 4071 4072define <32 x i8> @stack_fold_psubsb_ymm(<32 x i8> %a0, <32 x i8> %a1) { 4073; CHECK-LABEL: stack_fold_psubsb_ymm: 4074; CHECK: # %bb.0: 4075; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4076; CHECK-NEXT: #APP 4077; CHECK-NEXT: nop 4078; CHECK-NEXT: #NO_APP 4079; CHECK-NEXT: vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4080; CHECK-NEXT: retq 4081 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4082 %2 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1) 4083 ret <32 x i8> %2 4084} 4085declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone 4086 4087define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) { 4088; CHECK-LABEL: stack_fold_psubsw: 4089; CHECK: # %bb.0: 4090; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4091; CHECK-NEXT: #APP 4092; CHECK-NEXT: nop 4093; CHECK-NEXT: #NO_APP 4094; CHECK-NEXT: vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4095; CHECK-NEXT: retq 4096 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4097 %2 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1) 4098 ret <8 x i16> %2 4099} 4100declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 4101 4102define <16 x i16> @stack_fold_psubsw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 4103; CHECK-LABEL: stack_fold_psubsw_ymm: 4104; CHECK: # %bb.0: 4105; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4106; CHECK-NEXT: #APP 4107; CHECK-NEXT: nop 4108; CHECK-NEXT: #NO_APP 4109; CHECK-NEXT: vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4110; CHECK-NEXT: retq 4111 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4112 %2 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1) 4113 ret <16 x i16> %2 4114} 4115declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone 4116 4117define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) { 4118; CHECK-LABEL: stack_fold_psubusb: 4119; CHECK: # %bb.0: 4120; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4121; CHECK-NEXT: #APP 4122; CHECK-NEXT: nop 4123; CHECK-NEXT: #NO_APP 4124; CHECK-NEXT: vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4125; CHECK-NEXT: retq 4126 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4127 %2 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1) 4128 ret <16 x i8> %2 4129} 4130declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone 4131 4132define <32 x i8> @stack_fold_psubusb_ymm(<32 x i8> %a0, <32 x i8> %a1) { 4133; CHECK-LABEL: stack_fold_psubusb_ymm: 4134; CHECK: # %bb.0: 4135; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4136; CHECK-NEXT: #APP 4137; CHECK-NEXT: nop 4138; CHECK-NEXT: #NO_APP 4139; CHECK-NEXT: vpsubusb 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4140; CHECK-NEXT: retq 4141 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4142 %2 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1) 4143 ret <32 x i8> %2 4144} 4145declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone 4146 4147define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) { 4148; CHECK-LABEL: stack_fold_psubusw: 4149; CHECK: # %bb.0: 4150; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4151; CHECK-NEXT: #APP 4152; CHECK-NEXT: nop 4153; CHECK-NEXT: #NO_APP 4154; CHECK-NEXT: vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4155; CHECK-NEXT: retq 4156 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4157 %2 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1) 4158 ret <8 x i16> %2 4159} 4160declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 4161 4162define <16 x i16> @stack_fold_psubusw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 4163; CHECK-LABEL: stack_fold_psubusw_ymm: 4164; CHECK: # %bb.0: 4165; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4166; CHECK-NEXT: #APP 4167; CHECK-NEXT: nop 4168; CHECK-NEXT: #NO_APP 4169; CHECK-NEXT: vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4170; CHECK-NEXT: retq 4171 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4172 %2 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1) 4173 ret <16 x i16> %2 4174} 4175declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone 4176 4177define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) { 4178; CHECK-LABEL: stack_fold_psubw: 4179; CHECK: # %bb.0: 4180; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4181; CHECK-NEXT: #APP 4182; CHECK-NEXT: nop 4183; CHECK-NEXT: #NO_APP 4184; CHECK-NEXT: vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4185; CHECK-NEXT: retq 4186 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4187 %2 = sub <8 x i16> %a0, %a1 4188 ret <8 x i16> %2 4189} 4190 4191define <16 x i16> @stack_fold_psubw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 4192; CHECK-LABEL: stack_fold_psubw_ymm: 4193; CHECK: # %bb.0: 4194; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4195; CHECK-NEXT: #APP 4196; CHECK-NEXT: nop 4197; CHECK-NEXT: #NO_APP 4198; CHECK-NEXT: vpsubw 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4199; CHECK-NEXT: retq 4200 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4201 %2 = sub <16 x i16> %a0, %a1 4202 ret <16 x i16> %2 4203} 4204 4205define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) { 4206; CHECK-LABEL: stack_fold_punpckhbw: 4207; CHECK: # %bb.0: 4208; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4209; CHECK-NEXT: #APP 4210; CHECK-NEXT: nop 4211; CHECK-NEXT: #NO_APP 4212; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4213; CHECK-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] 4214; CHECK-NEXT: retq 4215 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4216 %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> 4217 ret <16 x i8> %2 4218} 4219 4220define <16 x i8> @stack_fold_punpckhbw_mask(ptr %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) { 4221; CHECK-LABEL: stack_fold_punpckhbw_mask: 4222; CHECK: # %bb.0: 4223; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4224; CHECK-NEXT: #APP 4225; CHECK-NEXT: nop 4226; CHECK-NEXT: #NO_APP 4227; CHECK-NEXT: kmovd %esi, %k1 4228; CHECK-NEXT: vmovdqa (%rdi), %xmm2 4229; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 4230; CHECK-NEXT: # xmm2 {%k1} = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] 4231; CHECK-NEXT: vmovdqa %xmm2, %xmm0 4232; CHECK-NEXT: retq 4233 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4234 %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> 4235 %3 = bitcast i16 %mask to <16 x i1> 4236 ; load needed to keep the operation from being scheduled above the asm block 4237 %4 = load <16 x i8>, ptr %passthru 4238 %5 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %4 4239 ret <16 x i8> %5 4240} 4241 4242define <16 x i8> @stack_fold_punpckhbw_maskz(<16 x i8> %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) { 4243; CHECK-LABEL: stack_fold_punpckhbw_maskz: 4244; CHECK: # %bb.0: 4245; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4246; CHECK-NEXT: #APP 4247; CHECK-NEXT: nop 4248; CHECK-NEXT: #NO_APP 4249; CHECK-NEXT: kmovd %edi, %k1 4250; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
4251; CHECK-NEXT: # xmm0 {%k1} {z} = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] 4252; CHECK-NEXT: retq 4253 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4254 %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> 4255 %3 = bitcast i16 %mask to <16 x i1> 4256 %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> zeroinitializer 4257 ret <16 x i8> %4 4258} 4259 4260define <32 x i8> @stack_fold_punpckhbw_ymm(<32 x i8> %a0, <32 x i8> %a1) { 4261; CHECK-LABEL: stack_fold_punpckhbw_ymm: 4262; CHECK: # %bb.0: 4263; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4264; CHECK-NEXT: #APP 4265; CHECK-NEXT: nop 4266; CHECK-NEXT: #NO_APP 4267; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4268; CHECK-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] 4269; CHECK-NEXT: retq 4270 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4271 %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> 4272 ret <32 x i8> %2 4273} 4274 4275define <32 x i8> @stack_fold_punpckhbw_mask_ymm(ptr %passthru, <32 x i8> %a0, <32 x i8> %a1, i32 %mask) { 4276; CHECK-LABEL: stack_fold_punpckhbw_mask_ymm: 4277; CHECK: # %bb.0: 4278; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4279; CHECK-NEXT: #APP 4280; CHECK-NEXT: nop 4281; CHECK-NEXT: #NO_APP 4282; CHECK-NEXT: kmovd %esi, %k1 4283; CHECK-NEXT: vmovdqa (%rdi), %ymm2 4284; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 4285; CHECK-NEXT: # ymm2 {%k1} = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] 4286; CHECK-NEXT: vmovdqa %ymm2, %ymm0 4287; CHECK-NEXT: retq 4288 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4289 %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, 
i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> 4290 %3 = bitcast i32 %mask to <32 x i1> 4291 ; load needed to keep the operation from being scheduled above the asm block 4292 %4 = load <32 x i8>, ptr %passthru 4293 %5 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %4 4294 ret <32 x i8> %5 4295} 4296 4297define <32 x i8> @stack_fold_punpckhbw_maskz_ymm(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { 4298; CHECK-LABEL: stack_fold_punpckhbw_maskz_ymm: 4299; CHECK: # %bb.0: 4300; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4301; CHECK-NEXT: #APP 4302; CHECK-NEXT: nop 4303; CHECK-NEXT: #NO_APP 4304; CHECK-NEXT: kmovd %edi, %k1 4305; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 4306; CHECK-NEXT: # ymm0 {%k1} {z} = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] 4307; CHECK-NEXT: retq 4308 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4309 %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> 4310 %3 = bitcast i32 %mask to <32 x i1> 4311 %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer 4312 ret <32 x i8> %4 4313} 4314 4315define <4 x i64> @stack_fold_shufi64x2_maskz(<4 x i64> %a, <4 x i64> %b, i8 %mask) { 4316; CHECK-LABEL: stack_fold_shufi64x2_maskz: 4317; CHECK: # %bb.0: 4318; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4319; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4320; CHECK-NEXT: #APP 4321; CHECK-NEXT: nop 4322; CHECK-NEXT: #NO_APP 4323; CHECK-NEXT: kmovd %edi, %k1 4324; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4325; CHECK-NEXT: vshufi64x2 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 4326; CHECK-NEXT: # ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] 4327; CHECK-NEXT: retq 4328 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4329 %2 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 4330 %3 = bitcast i8 %mask to <8 x i1> 4331 %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4332 %5 = select <4 x i1> %4, <4 x i64> %2, <4 x i64> zeroinitializer 4333 ret <4 x i64> %5 4334} 4335 4336define <8 x i32> @stack_fold_shufi32x4_maskz(<8 x i32> %a, <8 x i32> %b, i8 %mask) { 4337; CHECK-LABEL: stack_fold_shufi32x4_maskz: 4338; CHECK: # %bb.0: 4339; CHECK-NEXT: vmovups %ymm1,
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4340; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4341; CHECK-NEXT: #APP 4342; CHECK-NEXT: nop 4343; CHECK-NEXT: #NO_APP 4344; CHECK-NEXT: kmovd %edi, %k1 4345; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4346; CHECK-NEXT: vshufi32x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 4347; CHECK-NEXT: # ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] 4348; CHECK-NEXT: retq 4349 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4350 %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> 4351 %3 = bitcast i8 %mask to <8 x i1> 4352 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer 4353 ret <8 x i32> %4 4354} 4355 4356declare <4 x i32> @llvm.x86.avx512.conflict.d.128(<4 x i32>) 4357declare <8 x i32> @llvm.x86.avx512.conflict.d.256(<8 x i32>) 4358declare <2 x i64> @llvm.x86.avx512.conflict.q.128(<2 x i64>) 4359declare <4 x i64> @llvm.x86.avx512.conflict.q.256(<4 x i64>) 4360declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) 4361declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1) 4362declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) 4363declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) 4364
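
; Illustrative sketch, not part of the autogenerated assertions above: the
; vpsubs*/vpsubus* tests exercise the @llvm.ssub.sat/@llvm.usub.sat intrinsics,
; which clamp on overflow rather than wrapping (unsigned 2 - 5 gives 0, not 253;
; signed -100 - 100 gives -128). The hypothetical helper below only demonstrates
; that semantics; it deliberately has no CHECK lines and does not use the
; spill/reload inline-asm pattern of the tests above.
define <16 x i8> @usub_sat_example(<16 x i8> %x) {
  ; Saturating-subtract 5 from every byte; lanes smaller than 5 clamp to 0
  ; instead of wrapping around. With the AVX-512 features enabled by this
  ; test's RUN line, this should lower to a vpsubusb with a constant operand.
  %r = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %x, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>)
  ret <16 x i8> %r
}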