; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vbmi,+avx512cd,+avx512vpopcntdq,+avx512vnni < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.

define <16 x i32> @stack_fold_valignd(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: stack_fold_valignd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
  ret <16 x i32> %2
}

define <16 x i32> @stack_fold_valignd_mask(<16 x i32> %a, <16 x i32> %b, ptr %passthru, i16 %mask) {
; CHECK-LABEL: stack_fold_valignd_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x i32>, ptr %passthru
  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
  ret <16 x i32> %5
}

define <16 x i32> @stack_fold_valignd_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: stack_fold_valignd_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  ret <16 x i32> %4
}

define <8 x i64> @stack_fold_valignq(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: stack_fold_valignq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 = mem[1,2,3,4,5,6,7],zmm0[0]
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  ret <8 x i64> %2
}

define <8 x i64> @stack_fold_valignq_mask(<8 x i64> %a, <8 x i64> %b, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_valignq_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm1 {%k1} = mem[1,2,3,4,5,6,7],zmm0[0]
; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x i64>, ptr %passthru
  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
  ret <8 x i64> %5
}

define <8 x i64> @stack_fold_valignq_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: stack_fold_valignq_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7],zmm0[0]
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
  ret <8 x i64> %4
}

define <64 x i8> @stack_fold_pavgb(<64 x i8> %a0, <64 x i8> %a1) {
; CHECK-LABEL: stack_fold_pavgb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1)
  ret <64 x i8> %2
}
declare <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8>, <64 x i8>)

define <64 x i8> @stack_fold_pavgb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
; CHECK-LABEL: stack_fold_pavgb_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0)
  ret <64 x i8> %2
}

define <64 x i8> @stack_fold_pavgb_mask(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
; CHECK-LABEL: stack_fold_pavgb_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovq %rsi, %k1
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1)
  %3 = bitcast i64 %mask to <64 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %4 = load <64 x i8>, ptr %a2
  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
  ret <64 x i8> %5
}

define <64 x i8> @stack_fold_pavgb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
; CHECK-LABEL: stack_fold_pavgb_mask_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovq %rsi, %k1
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0)
  %3 = bitcast i64 %mask to <64 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %4 = load <64 x i8>, ptr %a2
  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
  ret <64 x i8> %5
}

define <64 x i8> @stack_fold_pavgb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; CHECK-LABEL: stack_fold_pavgb_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovq %rdi, %k1
; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1)
  %3 = bitcast i64 %mask to <64 x i1>
  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
  ret <64 x i8> %4
}

define <64 x i8> @stack_fold_pavgb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; CHECK-LABEL: stack_fold_pavgb_maskz_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovq %rdi, %k1
; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0)
  %3 = bitcast i64 %mask to <64 x i1>
  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
  ret <64 x i8> %4
}

define <32 x i16> @stack_fold_pavgw(<32 x i16> %a0, <32 x i16> %a1) {
; CHECK-LABEL: stack_fold_pavgw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1)
  ret <32 x i16> %2
}
declare <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16>, <32 x i16>)

define <32 x i16> @stack_fold_pavgw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
; CHECK-LABEL: stack_fold_pavgw_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0)
  ret <32 x i16> %2
}

define <32 x i16> @stack_fold_pavgw_mask(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_pavgw_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1)
  %3 = bitcast i32 %mask to <32 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %4 = load <32 x i16>, ptr %a2
  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
  ret <32 x i16> %5
}

define <32 x i16> @stack_fold_pavgw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_pavgw_mask_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0)
  %3 = bitcast i32 %mask to <32 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %4 = load <32 x i16>, ptr %a2
  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
  ret <32 x i16> %5
}

define <32 x i16> @stack_fold_pavgw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
; CHECK-LABEL: stack_fold_pavgw_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
  ret <32 x i16> %4
}

define <32 x i16> @stack_fold_pavgw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
; CHECK-LABEL: stack_fold_pavgw_maskz_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
  ret <32 x i16> %4
}

define <4 x i32> @stack_fold_extracti32x4(<16 x i16> %a0, <16 x i32> %a1) {
; CHECK-LABEL: stack_fold_extracti32x4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; CHECK-NEXT:    vextracti32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  ; zext forces execution domain
  %1 = zext <16 x i16> %a0 to <16 x i32>
  %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <4 x i32> %2
}

define <2 x i64> @stack_fold_extracti64x2(<8 x i32> %a0, <8 x i64> %a1) {
; CHECK-LABEL: stack_fold_extracti64x2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; CHECK-NEXT:    vextracti32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  ; zext forces execution domain
  %1 = zext <8 x i32> %a0 to <8 x i64>
  %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <2 x i32> <i32 6, i32 7>
  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <2 x i64> %2
}

define <8 x i32> @stack_fold_extracti32x8(<16 x i16> %a0, <16 x i32> %a1) {
; CHECK-LABEL: stack_fold_extracti32x8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT:    retq
  ; zext forces execution domain
  %1 = zext <16 x i16> %a0 to <16 x i32>
  %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <8 x i32> %2
}

define <4 x i64> @stack_fold_extracti64x4(<8 x i32> %a0, <8 x i64> %a1) {
; CHECK-LABEL: stack_fold_extracti64x4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT:    retq
  ; zext forces execution domain
  %1 = zext <8 x i32> %a0 to <8 x i64>
  %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <4 x i64> %2
}

define <16 x i32> @stack_fold_inserti32x8(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_inserti32x8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
; CHECK-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; add forces execution domain
  %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <16 x i32> %3
}

define <8 x i64> @stack_fold_inserti64x4(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_inserti64x4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
; CHECK-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; add forces execution domain
  %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
  ret <8 x i64> %3
}

define <64 x i8> @stack_fold_pabsb(<64 x i8> %a0) {
; CHECK-LABEL: stack_fold_pabsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <64 x i8> %a0, zeroinitializer
  %3 = sub <64 x i8> zeroinitializer, %a0
  %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3
  ret <64 x i8> %4
}

define <64 x i8> @stack_fold_pabsb_mask(<64 x i8> %passthru, <64 x i8> %a0, i64 %mask) {
; CHECK-LABEL: stack_fold_pabsb_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovq %rdi, %k1
; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <64 x i8> %a0, zeroinitializer
  %3 = sub <64 x i8> zeroinitializer, %a0
  %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3
  %5 = bitcast i64 %mask to <64 x i1>
  %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> %passthru
  ret <64 x i8> %6
}

define <64 x i8> @stack_fold_pabsb_maskz(<64 x i8> %a0, i64 %mask) {
; CHECK-LABEL: stack_fold_pabsb_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovq %rdi, %k1
; CHECK-NEXT:    vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <64 x i8> %a0, zeroinitializer
  %3 = sub <64 x i8> zeroinitializer, %a0
  %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3
  %5 = bitcast i64 %mask to <64 x i1>
  %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer
  ret <64 x i8> %6
}

define <16 x i32> @stack_fold_pabsd(<16 x i32> %a0) {
; CHECK-LABEL: stack_fold_pabsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <16 x i32> %a0, zeroinitializer
  %3 = sub <16 x i32> zeroinitializer, %a0
  %4 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %3
  ret <16 x i32> %4
}

define <16 x i32> @stack_fold_pabsd_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: stack_fold_pabsd_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <16 x i32> %a0, zeroinitializer
  %3 = sub <16 x i32> zeroinitializer, %a0
  %4 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %3
  %5 = bitcast i16 %mask to <16 x i1>
  %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> %passthru
  ret <16 x i32> %6
}

define <16 x i32> @stack_fold_pabsd_maskz(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: stack_fold_pabsd_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <16 x i32> %a0, zeroinitializer
  %3 = sub <16 x i32> zeroinitializer, %a0
  %4 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %3
  %5 = bitcast i16 %mask to <16 x i1>
  %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
  ret <16 x i32> %6
}

define <8 x i64> @stack_fold_pabsq(<8 x i64> %a0) {
; CHECK-LABEL: stack_fold_pabsq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <8 x i64> %a0, zeroinitializer
  %3 = sub <8 x i64> zeroinitializer, %a0
  %4 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %3
  ret <8 x i64> %4
}

define <8 x i64> @stack_fold_pabsq_mask(<8 x i64> %passthru, <8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: stack_fold_pabsq_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <8 x i64> %a0, zeroinitializer
  %3 = sub <8 x i64> zeroinitializer, %a0
  %4 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %3
  %5 = bitcast i8 %mask to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %passthru
  ret <8 x i64> %6
}

define <8 x i64> @stack_fold_pabsq_maskz(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: stack_fold_pabsq_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <8 x i64> %a0, zeroinitializer
  %3 = sub <8 x i64> zeroinitializer, %a0
  %4 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %3
  %5 = bitcast i8 %mask to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
  ret <8 x i64> %6
}

define <32 x i16> @stack_fold_pabsw(<32 x i16> %a0) {
; CHECK-LABEL: stack_fold_pabsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <32 x i16> %a0, zeroinitializer
  %3 = sub <32 x i16> zeroinitializer, %a0
  %4 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %3
  ret <32 x i16> %4
}

define <32 x i16> @stack_fold_pabsw_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) {
; CHECK-LABEL: stack_fold_pabsw_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <32 x i16> %a0, zeroinitializer
  %3 = sub <32 x i16> zeroinitializer, %a0
  %4 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %3
  %5 = bitcast i32 %mask to <32 x i1>
  %6 = select <32 x i1> %5, <32 x i16> %4, <32 x i16> %passthru
  ret <32 x i16> %6
}

define <32 x i16> @stack_fold_pabsw_maskz(<32 x i16> %a0, i32 %mask) {
; CHECK-LABEL: stack_fold_pabsw_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <32 x i16> %a0, zeroinitializer
  %3 = sub <32 x i16> zeroinitializer, %a0
  %4 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %3
  %5 = bitcast i32 %mask to <32 x i1>
  %6 = select <32 x i1> %5, <32 x i16> %4, <32 x i16> zeroinitializer
  ret <32 x i16> %6
}

define <32 x i16> @stack_fold_packssdw(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: stack_fold_packssdw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a0, <16 x i32> %a1)
  ret <32 x i16> %2
}
declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>) nounwind readnone

define <64 x i8> @stack_fold_packsswb(<32 x i16> %a0, <32 x i16> %a1) {
; CHECK-LABEL: stack_fold_packsswb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a0, <32 x i16> %a1)
  ret <64 x i8> %2
}
declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>) nounwind readnone

define <32 x i16> @stack_fold_packusdw(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: stack_fold_packusdw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1)
  ret <32 x i16> %2
}
declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>) nounwind readnone

define <32 x i16> @stack_fold_packusdw_mask(ptr %passthru, <16 x i32> %a0, <16 x i32> %a1, i32 %mask) {
; CHECK-LABEL: stack_fold_packusdw_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <32 x i16>, ptr %passthru
  %3 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1)
  %4 = bitcast i32 %mask to <32 x i1>
  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %2
  ret <32 x i16> %5
}

define <32 x i16> @stack_fold_packusdw_maskz(<16 x i32> %a0, <16 x i32> %a1, i32 %mask) {
; CHECK-LABEL: stack_fold_packusdw_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
  ret <32 x i16> %4
}

define <64 x i8> @stack_fold_packuswb(<32 x i16> %a0, <32 x i16> %a1) {
; CHECK-LABEL: stack_fold_packuswb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a0, <32 x i16> %a1)
  ret <64 x i8> %2
}
declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) nounwind readnone

define <64 x i8> @stack_fold_paddb(<64 x i8> %a0, <64 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <64 x i8> %a0, %a1
  ret <64 x i8> %2
}

define <64 x i8> @stack_fold_paddb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddb_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <64 x i8> %a1, %a0
  ret <64 x i8> %2
}

define <64 x i8> @stack_fold_paddb_mask(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
; CHECK-LABEL: stack_fold_paddb_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovq %rsi, %k1
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <64 x i8> %a0, %a1
  %3 = bitcast i64 %mask to <64 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %4 = load <64 x i8>, ptr %a2
  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
  ret <64 x i8> %5
}

define <64 x i8> @stack_fold_paddb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
; CHECK-LABEL: stack_fold_paddb_mask_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovq %rsi, %k1
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <64 x i8> %a1, %a0
  %3 = bitcast i64 %mask to <64 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %4 = load <64 x i8>, ptr %a2
  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
  ret <64 x i8> %5
}

define <64 x i8> @stack_fold_paddb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; CHECK-LABEL: stack_fold_paddb_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovq %rdi, %k1
; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <64 x i8> %a0, %a1
  %3 = bitcast i64 %mask to <64 x i1>
  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
  ret <64 x i8> %4
}

define <64 x i8> @stack_fold_paddb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; CHECK-LABEL: stack_fold_paddb_maskz_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovq %rdi, %k1
; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <64 x i8> %a1, %a0
  %3 = bitcast i64 %mask to <64 x i1>
  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
  ret <64 x i8> %4
}

define <16 x i32> @stack_fold_paddd(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: stack_fold_paddd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <16 x i32> %a0, %a1
  ret <16 x i32> %2
}

define <16 x i32> @stack_fold_paddd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: stack_fold_paddd_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <16 x i32> %a1, %a0
  ret <16 x i32> %2
}

define <16 x i32> @stack_fold_paddd_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_paddd_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <16 x i32> %a0, %a1
  %3 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %4 = load <16 x i32>, ptr %a2
  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
  ret <16 x i32> %5
}

define <16 x i32> @stack_fold_paddd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_paddd_mask_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <16 x i32> %a1, %a0
  %3 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %4 = load <16 x i32>, ptr %a2
  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
  ret <16 x i32> %5
}

define <16 x i32> @stack_fold_paddd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_paddd_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <16 x i32> %a0, %a1
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  ret <16 x i32> %4
}

define <16 x i32> @stack_fold_paddd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_paddd_maskz_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <16 x i32> %a1, %a0
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  ret <16 x i32> %4
}

define <8 x i64> @stack_fold_paddq(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: stack_fold_paddq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <8 x i64> %a0, %a1
  ret <8 x i64> %2
}

define <8 x i64> @stack_fold_paddq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: stack_fold_paddq_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <8 x i64> %a1, %a0
  ret <8 x i64> %2
}

define <8 x i64> @stack_fold_paddq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_paddq_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <8 x i64> %a0, %a1
  %3 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %4 = load <8 x i64>, ptr %a2
  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
  ret <8 x i64> %5
}

define <8 x i64> @stack_fold_paddq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_paddq_mask_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <8 x i64> %a1, %a0
  %3 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %4 = load <8 x i64>, ptr %a2
  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
  ret <8 x i64> %5
}

define <8 x i64> @stack_fold_paddq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_paddq_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <8 x i64> %a0, %a1
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
  ret <8 x i64> %4
}

define <8 x i64> @stack_fold_paddq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_paddq_maskz_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = add <8 x i64> %a1, %a0
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
  ret <8 x i64> %4
}

define <64 x i8> @stack_fold_paddsb(<64 x i8> %a0, <64 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
  ret <64 x i8> %2
}

define <64 x i8> @stack_fold_paddsb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddsb_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
  ret <64 x i8> %2
}

define <64 x i8> @stack_fold_paddsb_mask(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
; CHECK-LABEL: stack_fold_paddsb_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovq %rsi, %k1
; CHECK-NEXT:    vmovdqa64
(%rdi), %zmm0 1161; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1162; CHECK-NEXT: retq 1163 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1164 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) 1165 %3 = bitcast i64 %mask to <64 x i1> 1166 ; load needed to keep the operation from being scheduled about the asm block 1167 %4 = load <64 x i8>, ptr %a2 1168 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 1169 ret <64 x i8> %5 1170} 1171 1172define <64 x i8> @stack_fold_paddsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) { 1173; CHECK-LABEL: stack_fold_paddsb_mask_commuted: 1174; CHECK: # %bb.0: 1175; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1176; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1177; CHECK-NEXT: #APP 1178; CHECK-NEXT: nop 1179; CHECK-NEXT: #NO_APP 1180; CHECK-NEXT: kmovq %rsi, %k1 1181; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1182; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1183; CHECK-NEXT: retq 1184 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1185 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0) 1186 %3 = bitcast i64 %mask to <64 x i1> 1187 ; load needed to keep the operation from being scheduled about the asm block 1188 %4 = load <64 x i8>, ptr %a2 1189 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 1190 ret <64 x i8> %5 1191} 1192 1193define <64 x i8> @stack_fold_paddsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 1194; CHECK-LABEL: stack_fold_paddsb_maskz: 1195; CHECK: # %bb.0: 1196; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1197; CHECK-NEXT: #APP 1198; CHECK-NEXT: nop 1199; CHECK-NEXT: #NO_APP 1200; CHECK-NEXT: kmovq %rdi, %k1 1201; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1202; CHECK-NEXT: retq 1203 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1204 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) 1205 %3 = bitcast i64 %mask to <64 x i1> 1206 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 1207 ret <64 x i8> %4 1208} 1209 1210define <64 x i8> @stack_fold_paddsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 1211; CHECK-LABEL: stack_fold_paddsb_maskz_commuted: 1212; CHECK: # %bb.0: 1213; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1214; CHECK-NEXT: #APP 1215; CHECK-NEXT: nop 1216; CHECK-NEXT: #NO_APP 1217; CHECK-NEXT: kmovq %rdi, %k1 1218; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1219; CHECK-NEXT: retq 1220 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1221 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0) 1222 %3 = bitcast i64 %mask to <64 x i1> 1223 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 1224 ret <64 x i8> %4 1225} 1226 1227define <32 x i16> @stack_fold_paddsw(<32 x i16> %a0, <32 x i16> %a1) { 1228; CHECK-LABEL: stack_fold_paddsw: 1229; CHECK: # %bb.0: 1230; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1231; CHECK-NEXT: #APP 1232; CHECK-NEXT: nop 1233; CHECK-NEXT: #NO_APP 1234; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1235; CHECK-NEXT: retq 1236 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1237 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) 1238 ret <32 x i16> %2 1239} 1240 1241define <32 x i16> @stack_fold_paddsw_commuted(<32 x i16> %a0, <32 x i16> %a1) { 1242; CHECK-LABEL: stack_fold_paddsw_commuted: 1243; CHECK: # %bb.0: 1244; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1245; CHECK-NEXT: #APP 1246; CHECK-NEXT: nop 1247; CHECK-NEXT: #NO_APP 1248; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1249; CHECK-NEXT: retq 1250 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1251 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0) 1252 ret <32 x i16> %2 1253} 1254 1255define <32 x i16> @stack_fold_paddsw_mask(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) { 1256; CHECK-LABEL: stack_fold_paddsw_mask: 1257; CHECK: # %bb.0: 1258; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1259; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1260; CHECK-NEXT: #APP 1261; CHECK-NEXT: nop 1262; CHECK-NEXT: #NO_APP 1263; CHECK-NEXT: kmovd %esi, %k1 1264; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1265; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1266; CHECK-NEXT: retq 1267 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1268 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) 1269 %3 = bitcast i32 %mask to <32 x i1> 1270 ; load needed to keep the operation from being scheduled about the asm block 1271 %4 = load <32 x i16>, ptr %a2 1272 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 1273 ret <32 x i16> %5 1274} 1275 1276define <32 x i16> @stack_fold_paddsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) { 1277; CHECK-LABEL: 
stack_fold_paddsw_mask_commuted: 1278; CHECK: # %bb.0: 1279; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1280; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1281; CHECK-NEXT: #APP 1282; CHECK-NEXT: nop 1283; CHECK-NEXT: #NO_APP 1284; CHECK-NEXT: kmovd %esi, %k1 1285; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1286; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1287; CHECK-NEXT: retq 1288 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1289 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0) 1290 %3 = bitcast i32 %mask to <32 x i1> 1291 ; load needed to keep the operation from being scheduled about the asm block 1292 %4 = load <32 x i16>, ptr %a2 1293 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 1294 ret <32 x i16> %5 1295} 1296 1297define <32 x i16> @stack_fold_paddsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 1298; CHECK-LABEL: stack_fold_paddsw_maskz: 1299; CHECK: # %bb.0: 1300; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1301; CHECK-NEXT: #APP 1302; CHECK-NEXT: nop 1303; CHECK-NEXT: #NO_APP 1304; CHECK-NEXT: kmovd %edi, %k1 1305; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1306; CHECK-NEXT: retq 1307 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1308 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) 1309 %3 = bitcast i32 %mask to <32 x i1> 1310 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 1311 ret <32 x i16> %4 1312} 1313 1314define <32 x i16> @stack_fold_paddsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 1315; CHECK-LABEL: stack_fold_paddsw_maskz_commuted: 1316; CHECK: # %bb.0: 1317; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1318; CHECK-NEXT: #APP 1319; CHECK-NEXT: nop 1320; CHECK-NEXT: #NO_APP 1321; CHECK-NEXT: kmovd %edi, %k1 1322; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1323; CHECK-NEXT: retq 1324 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1325 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0) 1326 %3 = bitcast i32 %mask to <32 x i1> 1327 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 1328 ret <32 x i16> %4 1329} 1330 1331define <64 x i8> @stack_fold_paddusb(<64 x i8> %a0, <64 x i8> %a1) { 1332; CHECK-LABEL: stack_fold_paddusb: 1333; CHECK: # %bb.0: 1334; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1335; CHECK-NEXT: #APP 1336; CHECK-NEXT: nop 1337; CHECK-NEXT: #NO_APP 1338; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1339; CHECK-NEXT: retq 1340 %1 = 
tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1341 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) 1342 ret <64 x i8> %2 1343} 1344 1345define <64 x i8> @stack_fold_paddusb_commuted(<64 x i8> %a0, <64 x i8> %a1) { 1346; CHECK-LABEL: stack_fold_paddusb_commuted: 1347; CHECK: # %bb.0: 1348; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1349; CHECK-NEXT: #APP 1350; CHECK-NEXT: nop 1351; CHECK-NEXT: #NO_APP 1352; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1353; CHECK-NEXT: retq 1354 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1355 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0) 1356 ret <64 x i8> %2 1357} 1358 1359define <64 x i8> @stack_fold_paddusb_mask(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) { 1360; CHECK-LABEL: stack_fold_paddusb_mask: 1361; CHECK: # %bb.0: 1362; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1363; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1364; CHECK-NEXT: #APP 1365; CHECK-NEXT: nop 1366; CHECK-NEXT: #NO_APP 1367; CHECK-NEXT: kmovq %rsi, %k1 1368; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1369; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1370; CHECK-NEXT: retq 1371 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1372 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) 1373 %3 = bitcast i64 %mask to <64 x i1> 1374 ; load needed to keep the operation from being scheduled about the asm block 1375 %4 = load <64 x i8>, ptr %a2 1376 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 1377 ret <64 x i8> %5 1378} 1379 1380define <64 x i8> @stack_fold_paddusb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) { 1381; CHECK-LABEL: stack_fold_paddusb_mask_commuted: 1382; CHECK: # %bb.0: 1383; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1384; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1385; CHECK-NEXT: #APP 1386; CHECK-NEXT: nop 1387; CHECK-NEXT: #NO_APP 1388; CHECK-NEXT: kmovq %rsi, %k1 1389; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1390; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1391; CHECK-NEXT: retq 1392 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1393 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0) 1394 %3 = bitcast i64 %mask to <64 x i1> 1395 ; load needed to keep the operation from being scheduled about 
the asm block 1396 %4 = load <64 x i8>, ptr %a2 1397 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 1398 ret <64 x i8> %5 1399} 1400 1401define <64 x i8> @stack_fold_paddusb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 1402; CHECK-LABEL: stack_fold_paddusb_maskz: 1403; CHECK: # %bb.0: 1404; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1405; CHECK-NEXT: #APP 1406; CHECK-NEXT: nop 1407; CHECK-NEXT: #NO_APP 1408; CHECK-NEXT: kmovq %rdi, %k1 1409; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1410; CHECK-NEXT: retq 1411 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1412 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) 1413 %3 = bitcast i64 %mask to <64 x i1> 1414 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 1415 ret <64 x i8> %4 1416} 1417 1418define <64 x i8> @stack_fold_paddusb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 1419; CHECK-LABEL: stack_fold_paddusb_maskz_commuted: 1420; CHECK: # %bb.0: 1421; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1422; CHECK-NEXT: #APP 1423; CHECK-NEXT: nop 1424; CHECK-NEXT: #NO_APP 1425; CHECK-NEXT: kmovq %rdi, %k1 1426; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1427; CHECK-NEXT: retq 1428 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1429 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0) 1430 %3 = bitcast i64 %mask to <64 x i1> 1431 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 1432 ret <64 x i8> %4 1433} 1434 1435define <32 x i16> @stack_fold_paddusw(<32 x i16> %a0, <32 x i16> %a1) { 1436; CHECK-LABEL: stack_fold_paddusw: 1437; CHECK: # %bb.0: 1438; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1439; CHECK-NEXT: #APP 1440; CHECK-NEXT: nop 1441; CHECK-NEXT: #NO_APP 1442; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1443; CHECK-NEXT: retq 1444 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1445 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) 1446 ret <32 x i16> %2 1447} 1448 1449define <32 x i16> @stack_fold_paddusw_commuted(<32 x i16> %a0, <32 x i16> %a1) { 1450; CHECK-LABEL: stack_fold_paddusw_commuted: 1451; CHECK: # %bb.0: 1452; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1453; CHECK-NEXT: #APP 1454; CHECK-NEXT: nop 1455; CHECK-NEXT: #NO_APP 1456; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1457; CHECK-NEXT: retq 1458 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1459 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0) 1460 ret <32 x i16> %2 1461} 1462 1463define <32 x i16> @stack_fold_paddusw_mask(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) { 1464; CHECK-LABEL: stack_fold_paddusw_mask: 1465; CHECK: # %bb.0: 1466; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1467; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1468; CHECK-NEXT: #APP 1469; CHECK-NEXT: nop 1470; CHECK-NEXT: #NO_APP 1471; CHECK-NEXT: kmovd %esi, %k1 1472; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1473; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1474; CHECK-NEXT: retq 1475 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1476 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) 1477 %3 = bitcast i32 %mask to <32 x i1> 1478 ; load needed to keep the operation from being scheduled about the asm block 1479 %4 = load <32 x i16>, ptr %a2 1480 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 1481 ret <32 x i16> %5 1482} 1483 1484define <32 x i16> @stack_fold_paddusw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) { 1485; CHECK-LABEL: stack_fold_paddusw_mask_commuted: 1486; CHECK: # %bb.0: 1487; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1488; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1489; CHECK-NEXT: #APP 1490; CHECK-NEXT: nop 1491; CHECK-NEXT: #NO_APP 1492; CHECK-NEXT: kmovd %esi, %k1 1493; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1494; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1495; CHECK-NEXT: retq 1496 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1497 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0) 1498 %3 = bitcast i32 %mask to <32 x i1> 1499 ; load needed to keep the operation from being scheduled about the asm block 1500 %4 = load <32 x i16>, ptr %a2 1501 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 1502 ret <32 x i16> %5 1503} 1504 1505define <32 x i16> @stack_fold_paddusw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 1506; CHECK-LABEL: stack_fold_paddusw_maskz: 1507; CHECK: # %bb.0: 1508; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1509; CHECK-NEXT: #APP 1510; CHECK-NEXT: nop 1511; CHECK-NEXT: #NO_APP 1512; CHECK-NEXT: kmovd %edi, %k1 1513; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1514; CHECK-NEXT: retq 1515 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1516 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) 1517 %3 = bitcast i32 %mask to <32 x i1> 1518 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 1519 ret <32 x i16> %4 1520} 1521 1522define <32 x i16> @stack_fold_paddusw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 1523; CHECK-LABEL: stack_fold_paddusw_maskz_commuted: 1524; CHECK: # %bb.0: 1525; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1526; CHECK-NEXT: #APP 1527; CHECK-NEXT: nop 1528; CHECK-NEXT: #NO_APP 1529; CHECK-NEXT: kmovd %edi, %k1 1530; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1531; CHECK-NEXT: retq 1532 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1533 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0) 1534 %3 = bitcast i32 %mask to <32 x i1> 1535 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 1536 ret <32 x i16> %4 1537} 1538 1539define <32 x i16> @stack_fold_paddw(<32 x i16> %a0, <32 x i16> %a1) { 1540; CHECK-LABEL: stack_fold_paddw: 1541; CHECK: # %bb.0: 1542; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1543; CHECK-NEXT: #APP 1544; CHECK-NEXT: nop 1545; CHECK-NEXT: #NO_APP 1546; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1547; CHECK-NEXT: retq 1548 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1549 %2 = add <32 x i16> %a0, %a1 1550 ret <32 x i16> %2 1551} 1552 1553define <32 x i16> @stack_fold_paddw_commuted(<32 x i16> %a0, <32 x i16> %a1) { 1554; CHECK-LABEL: stack_fold_paddw_commuted: 1555; CHECK: # %bb.0: 1556; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1557; CHECK-NEXT: #APP 1558; CHECK-NEXT: nop 1559; CHECK-NEXT: #NO_APP 1560; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1561; CHECK-NEXT: retq 1562 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1563 %2 = add <32 x i16> %a1, %a0 1564 ret <32 x i16> %2 1565} 1566 1567define <32 x i16> @stack_fold_paddw_mask(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) { 1568; CHECK-LABEL: stack_fold_paddw_mask: 1569; CHECK: # %bb.0: 1570; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1571; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1572; CHECK-NEXT: #APP 1573; CHECK-NEXT: nop 1574; CHECK-NEXT: #NO_APP 1575; CHECK-NEXT: kmovd %esi, %k1 1576; CHECK-NEXT: vmovdqa64 (%rdi), 
%zmm0 1577; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1578; CHECK-NEXT: retq 1579 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1580 %2 = add <32 x i16> %a0, %a1 1581 %3 = bitcast i32 %mask to <32 x i1> 1582 ; load needed to keep the operation from being scheduled about the asm block 1583 %4 = load <32 x i16>, ptr %a2 1584 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 1585 ret <32 x i16> %5 1586} 1587 1588define <32 x i16> @stack_fold_paddw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) { 1589; CHECK-LABEL: stack_fold_paddw_mask_commuted: 1590; CHECK: # %bb.0: 1591; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1592; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 1593; CHECK-NEXT: #APP 1594; CHECK-NEXT: nop 1595; CHECK-NEXT: #NO_APP 1596; CHECK-NEXT: kmovd %esi, %k1 1597; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 1598; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1599; CHECK-NEXT: retq 1600 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1601 %2 = add <32 x i16> %a1, %a0 1602 %3 = bitcast i32 %mask to <32 x i1> 1603 ; load needed to keep the operation from being scheduled about the asm block 1604 %4 = load <32 x i16>, ptr %a2 1605 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 1606 ret <32 x i16> %5 1607} 1608 1609define <32 x i16> @stack_fold_paddw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 1610; CHECK-LABEL: stack_fold_paddw_maskz: 1611; CHECK: # %bb.0: 1612; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1613; CHECK-NEXT: #APP 1614; CHECK-NEXT: nop 1615; CHECK-NEXT: #NO_APP 1616; CHECK-NEXT: kmovd %edi, %k1 1617; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1618; CHECK-NEXT: retq 1619 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1620 %2 = add <32 x i16> %a0, %a1 1621 %3 = bitcast i32 %mask to <32 x i1> 1622 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 1623 ret <32 x i16> %4 1624} 1625 1626define <32 x i16> @stack_fold_paddw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 1627; CHECK-LABEL: stack_fold_paddw_maskz_commuted: 1628; CHECK: # %bb.0: 1629; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1630; CHECK-NEXT: #APP 1631; CHECK-NEXT: nop 1632; CHECK-NEXT: #NO_APP 1633; CHECK-NEXT: kmovd %edi, %k1 1634; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1635; CHECK-NEXT: retq 1636 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1637 %2 = add <32 x i16> %a1, %a0 1638 %3 = bitcast i32 %mask to <32 x i1> 1639 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 1640 ret <32 x i16> %4 1641} 1642 1643define <64 x i8> @stack_fold_palignr(<64 x i8> %a0, <64 x i8> %a1) { 1644; CHECK-LABEL: stack_fold_palignr: 1645; CHECK: # %bb.0: 1646; CHECK-NEXT: pushq %rax 1647; CHECK-NEXT: .cfi_def_cfa_offset 16 1648; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1649; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1650; CHECK-NEXT: #APP 1651; CHECK-NEXT: nop 1652; CHECK-NEXT: #NO_APP 1653; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 1654; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1655; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48] 1656; CHECK-NEXT: popq %rax 1657; CHECK-NEXT: .cfi_def_cfa_offset 8 1658; CHECK-NEXT: retq 1659 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1660 %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112> 1661 ret <64 x i8> %2 1662} 1663 1664define <64 x i8> @stack_fold_palignr_mask(<64 x i8> %a0, <64 x i8> %a1, ptr %passthru, i64 %mask) { 1665; CHECK-LABEL: stack_fold_palignr_mask: 1666; CHECK: # %bb.0: 1667; CHECK-NEXT: pushq %rax 1668; CHECK-NEXT: .cfi_def_cfa_offset 16 1669; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1670; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1671; CHECK-NEXT: #APP 1672; CHECK-NEXT: nop 1673; CHECK-NEXT: #NO_APP 1674; CHECK-NEXT: kmovq %rsi, %k1 1675; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 1676; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 1677; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload 1678; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48] 1679; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 1680; CHECK-NEXT: popq %rax 1681; CHECK-NEXT: .cfi_def_cfa_offset 8 1682; CHECK-NEXT: retq 1683 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1684 %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112> 1685 %3 = bitcast i64 %mask to <64 x i1> 1686 %4 = load <64 x i8>, ptr %passthru 1687 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 1688 ret <64 x i8> %5 1689} 1690 1691define <64 x i8> @stack_fold_palignr_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 1692; CHECK-LABEL: stack_fold_palignr_maskz: 1693; CHECK: # %bb.0: 1694; CHECK-NEXT: pushq %rax 1695; CHECK-NEXT: .cfi_def_cfa_offset 16 1696; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1697; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1698; CHECK-NEXT: #APP 1699; CHECK-NEXT: nop 1700; CHECK-NEXT: #NO_APP 1701; CHECK-NEXT: kmovq %rdi, %k1 1702; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 1703; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1704; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48] 1705; CHECK-NEXT: popq %rax 1706; CHECK-NEXT: .cfi_def_cfa_offset 8 1707; CHECK-NEXT: retq 1708 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1709 %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112> 1710 %3 = bitcast i64 %mask to <64 x i1> 1711 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 1712 ret <64 x i8> %4 1713} 1714 1715define <16 x i32> @stack_fold_pandd(<16 x i32> %a0, <16 x i32> %a1) { 1716; CHECK-LABEL: stack_fold_pandd: 1717; CHECK: # %bb.0: 1718; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1719; CHECK-NEXT: #APP 1720; CHECK-NEXT: nop 1721; CHECK-NEXT: #NO_APP 1722; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1723; CHECK-NEXT: retq 1724 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1725 %2 = and <16 x i32> %a0, %a1 1726 ret <16 x i32> %2 1727} 1728 1729define <16 x i32> @stack_fold_pandd_commuted(<16 x i32> %a0, <16 x i32> %a1) { 1730; CHECK-LABEL: stack_fold_pandd_commuted: 1731; CHECK: # %bb.0: 1732; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1733; CHECK-NEXT: #APP 1734; CHECK-NEXT: nop 1735; CHECK-NEXT: #NO_APP 1736; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1737; CHECK-NEXT: retq 1738 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1739 %2 = and <16 x i32> %a1, %a0 1740 ret <16 x i32> %2 1741} 1742 1743define <16 x i32> @stack_fold_pandd_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) { 1744; CHECK-LABEL: stack_fold_pandd_mask: 1745; CHECK: # %bb.0: 1746; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1747; CHECK-NEXT: vmovaps %zmm0, %zmm1 1748; CHECK-NEXT: #APP 1749; CHECK-NEXT: nop 1750; CHECK-NEXT: #NO_APP 1751; CHECK-NEXT: kmovd %esi, %k1 1752; CHECK-NEXT: vmovaps (%rdi), %zmm0 1753; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1754; CHECK-NEXT: retq 1755 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1756 %2 = and <16 x i32> %a0, %a1 1757 %3 = bitcast i16 %mask to <16 x i1> 1758 ; load needed to keep the operation from being scheduled about the asm block 1759 %4 = load <16 x i32>, ptr %a2 1760 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 1761 ret <16 x i32> %5 1762} 1763 1764define <16 x i32> @stack_fold_pandd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) { 1765; CHECK-LABEL: stack_fold_pandd_mask_commuted: 1766; CHECK: # %bb.0: 1767; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1768; CHECK-NEXT: vmovaps %zmm0, %zmm1 1769; CHECK-NEXT: #APP 1770; CHECK-NEXT: nop 1771; CHECK-NEXT: #NO_APP 1772; CHECK-NEXT: kmovd %esi, %k1 1773; CHECK-NEXT: vmovaps (%rdi), %zmm0 1774; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1775; CHECK-NEXT: retq 1776 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1777 %2 = and <16 x i32> %a1, %a0 1778 %3 = bitcast i16 %mask to <16 x i1> 1779 ; load needed to keep the operation from being scheduled about the asm block 1780 %4 = load <16 x i32>, ptr %a2 1781 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 1782 ret <16 x i32> %5 1783} 1784 1785define <16 x i32> @stack_fold_pandd_maskz(<16 x i32> %a0, <16 x i32> %a1, 
i16 %mask) { 1786; CHECK-LABEL: stack_fold_pandd_maskz: 1787; CHECK: # %bb.0: 1788; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1789; CHECK-NEXT: #APP 1790; CHECK-NEXT: nop 1791; CHECK-NEXT: #NO_APP 1792; CHECK-NEXT: kmovd %edi, %k1 1793; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1794; CHECK-NEXT: retq 1795 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1796 %2 = and <16 x i32> %a0, %a1 1797 %3 = bitcast i16 %mask to <16 x i1> 1798 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 1799 ret <16 x i32> %4 1800} 1801 1802define <16 x i32> @stack_fold_pandd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 1803; CHECK-LABEL: stack_fold_pandd_maskz_commuted: 1804; CHECK: # %bb.0: 1805; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1806; CHECK-NEXT: #APP 1807; CHECK-NEXT: nop 1808; CHECK-NEXT: #NO_APP 1809; CHECK-NEXT: kmovd %edi, %k1 1810; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1811; CHECK-NEXT: retq 1812 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1813 %2 = and <16 x i32> %a1, %a0 1814 %3 = bitcast i16 %mask to <16 x i1> 1815 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 1816 ret <16 x i32> %4 1817} 1818 1819define <8 x i64> @stack_fold_pandq(<8 x i64> %a0, <8 x i64> %a1) { 1820; CHECK-LABEL: stack_fold_pandq: 1821; CHECK: # %bb.0: 1822; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1823; CHECK-NEXT: #APP 1824; CHECK-NEXT: nop 1825; CHECK-NEXT: #NO_APP 1826; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1827; CHECK-NEXT: retq 1828 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1829 %2 = and <8 x i64> %a0, %a1 1830 ret <8 x i64> %2 1831} 1832 1833define <8 x i64> @stack_fold_pandq_commuted(<8 x i64> %a0, <8 x i64> %a1) { 1834; CHECK-LABEL: stack_fold_pandq_commuted: 1835; CHECK: # %bb.0: 1836; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1837; CHECK-NEXT: #APP 1838; CHECK-NEXT: nop 1839; CHECK-NEXT: #NO_APP 1840; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 1841; CHECK-NEXT: retq 1842 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1843 %2 = and <8 x i64> %a1, %a0 1844 ret <8 x i64> %2 1845} 1846 1847define <8 x i64> @stack_fold_pandq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) { 
1848; CHECK-LABEL: stack_fold_pandq_mask: 1849; CHECK: # %bb.0: 1850; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1851; CHECK-NEXT: vmovapd %zmm0, %zmm1 1852; CHECK-NEXT: #APP 1853; CHECK-NEXT: nop 1854; CHECK-NEXT: #NO_APP 1855; CHECK-NEXT: kmovd %esi, %k1 1856; CHECK-NEXT: vmovapd (%rdi), %zmm0 1857; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1858; CHECK-NEXT: retq 1859 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1860 %2 = and <8 x i64> %a0, %a1 1861 %3 = bitcast i8 %mask to <8 x i1> 1862 ; load needed to keep the operation from being scheduled about the asm block 1863 %4 = load <8 x i64>, ptr %a2 1864 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 1865 ret <8 x i64> %5 1866} 1867 1868define <8 x i64> @stack_fold_pandq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) { 1869; CHECK-LABEL: stack_fold_pandq_mask_commuted: 1870; CHECK: # %bb.0: 1871; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1872; CHECK-NEXT: vmovapd %zmm0, %zmm1 1873; CHECK-NEXT: #APP 1874; CHECK-NEXT: nop 1875; CHECK-NEXT: #NO_APP 1876; CHECK-NEXT: kmovd %esi, %k1 1877; CHECK-NEXT: vmovapd (%rdi), %zmm0 1878; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 1879; CHECK-NEXT: retq 1880 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1881 %2 = and <8 x i64> %a1, %a0 1882 %3 = bitcast i8 %mask to <8 x i1> 1883 ; load needed to keep the operation from being scheduled about the asm block 1884 %4 = load <8 x i64>, ptr %a2 1885 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 1886 ret <8 x i64> %5 1887} 1888 1889define <8 x i64> @stack_fold_pandq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 1890; CHECK-LABEL: stack_fold_pandq_maskz: 1891; CHECK: # %bb.0: 1892; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1893; CHECK-NEXT: #APP 1894; CHECK-NEXT: nop 1895; CHECK-NEXT: #NO_APP 1896; CHECK-NEXT: kmovd %edi, %k1 1897; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1898; CHECK-NEXT: retq 1899 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1900 %2 = and <8 x i64> %a0, %a1 1901 %3 = bitcast i8 %mask to <8 x i1> 1902 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 1903 ret <8 x i64> %4 1904} 1905 1906define <8 x i64> @stack_fold_pandq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 1907; CHECK-LABEL: stack_fold_pandq_maskz_commuted: 1908; CHECK: # %bb.0: 1909; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1910; CHECK-NEXT: #APP 1911; CHECK-NEXT: nop 1912; CHECK-NEXT: #NO_APP 1913; CHECK-NEXT: kmovd %edi, %k1 1914; CHECK-NEXT: vandpd 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 1915; CHECK-NEXT: retq 1916 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1917 %2 = and <8 x i64> %a1, %a0 1918 %3 = bitcast i8 %mask to <8 x i1> 1919 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 1920 ret <8 x i64> %4 1921} 1922 1923define <16 x i32> @stack_fold_vpconflictd(<16 x i32> %a0) { 1924; CHECK-LABEL: stack_fold_vpconflictd: 1925; CHECK: # %bb.0: 1926; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1927; CHECK-NEXT: #APP 1928; CHECK-NEXT: nop 1929; CHECK-NEXT: #NO_APP 1930; CHECK-NEXT: vpconflictd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 1931; CHECK-NEXT: retq 1932 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1933 %2 = call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %a0) 1934 ret <16 x i32> %2 1935} 1936declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly 1937 1938define <8 x i64> @stack_fold_vpconflictq(<8 x i64> %a0) { 1939; CHECK-LABEL: stack_fold_vpconflictq: 1940; CHECK: # %bb.0: 1941; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1942; CHECK-NEXT: #APP 1943; CHECK-NEXT: nop 1944; CHECK-NEXT: #NO_APP 1945; CHECK-NEXT: vpconflictq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 1946; CHECK-NEXT: retq 1947 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1948 %2 = call <8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64> %a0) 1949 ret <8 x i64> %2 1950} 1951declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readnone 1952 1953define i64 @stack_fold_pcmpeqb(<64 x i8> %a0, <64 x i8> %a1) { 1954; CHECK-LABEL: stack_fold_pcmpeqb: 1955; CHECK: # %bb.0: 1956; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1957; CHECK-NEXT: #APP 1958; CHECK-NEXT: nop 1959; CHECK-NEXT: #NO_APP 1960; CHECK-NEXT: vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload 1961; CHECK-NEXT: kmovq %k0, %rax 1962; CHECK-NEXT: vzeroupper 1963; CHECK-NEXT: retq 1964 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1965 %2 = icmp eq <64 x i8> %a0, %a1 1966 %3 = bitcast <64 x i1> %2 to i64 1967 ret i64 %3 1968} 1969 1970define i16 @stack_fold_pcmpeqd(<16 x i32> %a0, <16 x i32> %a1) { 1971; CHECK-LABEL: stack_fold_pcmpeqd: 1972; CHECK: # %bb.0: 1973; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1974; 
CHECK-NEXT: #APP 1975; CHECK-NEXT: nop 1976; CHECK-NEXT: #NO_APP 1977; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload 1978; CHECK-NEXT: kmovd %k0, %eax 1979; CHECK-NEXT: # kill: def $ax killed $ax killed $eax 1980; CHECK-NEXT: vzeroupper 1981; CHECK-NEXT: retq 1982 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1983 %2 = icmp eq <16 x i32> %a0, %a1 1984 %3 = bitcast <16 x i1> %2 to i16 1985 ret i16 %3 1986} 1987 1988define i8 @stack_fold_pcmpeqq(<8 x i64> %a0, <8 x i64> %a1) { 1989; CHECK-LABEL: stack_fold_pcmpeqq: 1990; CHECK: # %bb.0: 1991; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 1992; CHECK-NEXT: #APP 1993; CHECK-NEXT: nop 1994; CHECK-NEXT: #NO_APP 1995; CHECK-NEXT: vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload 1996; CHECK-NEXT: kmovd %k0, %eax 1997; CHECK-NEXT: # kill: def $al killed $al killed $eax 1998; CHECK-NEXT: vzeroupper 1999; CHECK-NEXT: retq 2000 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2001 %2 = icmp eq <8 x i64> %a0, %a1 2002 %3 = bitcast <8 x i1> %2 to i8 2003 ret i8 %3 2004} 2005 2006define i32 @stack_fold_pcmpeqw(<32 x i16> %a0, <32 x i16> %a1) { 2007; CHECK-LABEL: stack_fold_pcmpeqw: 2008; CHECK: # %bb.0: 2009; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2010; CHECK-NEXT: #APP 2011; CHECK-NEXT: nop 2012; CHECK-NEXT: #NO_APP 2013; CHECK-NEXT: vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload 2014; CHECK-NEXT: kmovd %k0, %eax 2015; CHECK-NEXT: vzeroupper 2016; CHECK-NEXT: retq 2017 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2018 %2 = icmp eq <32 x i16> %a0, %a1 2019 %3 = bitcast <32 x i1> %2 to i32 2020 ret i32 %3 2021} 2022 2023define <16 x i32> @stack_fold_pcmpeqd_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) { 2024; CHECK-LABEL: stack_fold_pcmpeqd_mask: 2025; CHECK: # %bb.0: 2026; CHECK-NEXT: subq $136, %rsp 2027; CHECK-NEXT: .cfi_def_cfa_offset 144 2028; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2029; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill 2030; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2031; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2032; CHECK-NEXT: #APP 2033; CHECK-NEXT: nop 2034; CHECK-NEXT: #NO_APP 2035; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 2036; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 2037; CHECK-NEXT: kmovd %esi, %k1 2038; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload 2039; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 2040; CHECK-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload 
2041; CHECK-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} 2042; CHECK-NEXT: addq $136, %rsp 2043; CHECK-NEXT: .cfi_def_cfa_offset 8 2044; CHECK-NEXT: retq 2045 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2046 ; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load 2047 %2 = load <16 x i32>, ptr %a2 2048 %3 = add <16 x i32> %a1, %2 2049 %4 = bitcast i16 %mask to <16 x i1> 2050 %5 = icmp eq <16 x i32> %3, %a0 2051 %6 = and <16 x i1> %4, %5 2052 %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1 2053 ret <16 x i32> %7 2054} 2055 2056define <16 x i32> @stack_fold_pcmpeqd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) { 2057; CHECK-LABEL: stack_fold_pcmpeqd_mask_commuted: 2058; CHECK: # %bb.0: 2059; CHECK-NEXT: subq $136, %rsp 2060; CHECK-NEXT: .cfi_def_cfa_offset 144 2061; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2062; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill 2063; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2064; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2065; CHECK-NEXT: #APP 2066; CHECK-NEXT: nop 2067; CHECK-NEXT: #NO_APP 2068; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 2069; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 2070; CHECK-NEXT: kmovd %esi, %k1 2071; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload 2072; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 2073; CHECK-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload 2074; CHECK-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} 2075; CHECK-NEXT: addq $136, %rsp 2076; CHECK-NEXT: .cfi_def_cfa_offset 8 2077; CHECK-NEXT: retq 2078 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2079 ; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load 2080 %2 = load <16 x i32>, ptr %a2 2081 %3 = add <16 x i32> %a1, %2 2082 %4 = bitcast i16 %mask to <16 x i1> 2083 %5 = icmp eq <16 x i32> %a0, %3 2084 %6 = and <16 x i1> %4, %5 2085 %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1 2086 ret <16 x i32> %7 2087} 2088 2089define <16 x i32> @stack_fold_pcmpled_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) { 2090; CHECK-LABEL: stack_fold_pcmpled_mask: 2091; CHECK: # %bb.0: 2092; CHECK-NEXT: subq $136, %rsp 2093; CHECK-NEXT: .cfi_def_cfa_offset 144 2094; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2095; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill 2096; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2097; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2098; CHECK-NEXT: #APP 2099; CHECK-NEXT: nop 2100; CHECK-NEXT: #NO_APP 2101; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 2102; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 2103; CHECK-NEXT: kmovd %esi, 
%k1 2104; CHECK-NEXT: vpcmpled {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload 2105; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 2106; CHECK-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload 2107; CHECK-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} 2108; CHECK-NEXT: addq $136, %rsp 2109; CHECK-NEXT: .cfi_def_cfa_offset 8 2110; CHECK-NEXT: retq 2111 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2112 ; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load 2113 %2 = load <16 x i32>, ptr %a2 2114 %3 = add <16 x i32> %a1, %2 2115 %4 = bitcast i16 %mask to <16 x i1> 2116 %5 = icmp sge <16 x i32> %a0, %3 2117 %6 = and <16 x i1> %4, %5 2118 %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1 2119 ret <16 x i32> %7 2120} 2121 2122define i16 @stack_fold_pcmpleud(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) { 2123; CHECK-LABEL: stack_fold_pcmpleud: 2124; CHECK: # %bb.0: 2125; CHECK-NEXT: pushq %rax 2126; CHECK-NEXT: .cfi_def_cfa_offset 16 2127; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2128; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2129; CHECK-NEXT: #APP 2130; CHECK-NEXT: nop 2131; CHECK-NEXT: #NO_APP 2132; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 2133; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 2134; CHECK-NEXT: vpcmpleud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload 2135; CHECK-NEXT: kmovd %k0, %eax 2136; CHECK-NEXT: andl %esi, %eax 2137; CHECK-NEXT: # kill: def $ax killed $ax killed $eax 2138; CHECK-NEXT: popq %rcx 2139; CHECK-NEXT: .cfi_def_cfa_offset 8 2140; CHECK-NEXT: vzeroupper 2141; CHECK-NEXT: retq 2142 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2143 %2 = load <16 x i32>, ptr %a2 2144 %3 = add <16 x i32> %a1, %2 2145 %4 = bitcast i16 %mask to <16 x i1> 2146 %5 = icmp uge <16 x i32> %a0, %3 2147 %6 = and <16 x i1> %5, %4 2148 %7 = bitcast <16 x i1> %6 to i16 2149 ret i16 %7 2150} 2151 2152define <64 x i8> @stack_fold_permbvar(<64 x i8> %a0, <64 x i8> %a1) { 2153; CHECK-LABEL: stack_fold_permbvar: 2154; CHECK: # %bb.0: 2155; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2156; CHECK-NEXT: #APP 2157; CHECK-NEXT: nop 2158; CHECK-NEXT: #NO_APP 2159; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 2160; CHECK-NEXT: retq 2161 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2162 %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0) 2163 ret <64 x i8> %2 2164} 2165declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) nounwind readonly 2166 2167define <64 x i8> 
@stack_fold_permbvar_mask(ptr %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 2168; CHECK-LABEL: stack_fold_permbvar_mask: 2169; CHECK: # %bb.0: 2170; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2171; CHECK-NEXT: #APP 2172; CHECK-NEXT: nop 2173; CHECK-NEXT: #NO_APP 2174; CHECK-NEXT: kmovq %rsi, %k1 2175; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 2176; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 2177; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 2178; CHECK-NEXT: retq 2179 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2180 %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0) 2181 %3 = bitcast i64 %mask to <64 x i1> 2182 ; load needed to keep the operation from being scheduled above the asm block 2183 %4 = load <64 x i8>, ptr %passthru 2184 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 2185 ret <64 x i8> %5 2186} 2187 2188define <64 x i8> @stack_fold_permbvar_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 2189; CHECK-LABEL: stack_fold_permbvar_maskz: 2190; CHECK: # %bb.0: 2191; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2192; CHECK-NEXT: #APP 2193; CHECK-NEXT: nop 2194; CHECK-NEXT: #NO_APP 2195; CHECK-NEXT: kmovq %rdi, %k1 2196; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 2197; CHECK-NEXT: retq 2198 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2199 %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0) 2200 %3 = bitcast i64 %mask to <64 x i1> 2201 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 2202 ret <64 x i8> %4 2203} 2204 2205define <16 x i32> @stack_fold_permd(<16 x i32> %a0, <16 x i32> %a1) { 2206; CHECK-LABEL: stack_fold_permd: 2207; CHECK: # %bb.0: 2208; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2209; CHECK-NEXT: #APP 2210; CHECK-NEXT: nop 2211; CHECK-NEXT: #NO_APP 2212; CHECK-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 2213; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 2214; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 2215; CHECK-NEXT: retq 2216 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2217 %2 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a1, <16 x i32> %a0) 2218 ; add forces execution domain 2219 %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2220 ret <16 x i32> %3 2221} 2222declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) nounwind readonly 2223 2224define <64 x i8> @stack_fold_vpermi2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { 2225; CHECK-LABEL: 
stack_fold_vpermi2b: 2226; CHECK: # %bb.0: 2227; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2228; CHECK-NEXT: #APP 2229; CHECK-NEXT: nop 2230; CHECK-NEXT: #NO_APP 2231; CHECK-NEXT: vpermi2b {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 2232; CHECK-NEXT: retq 2233 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2234 %2 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x1, <64 x i8> %x0, <64 x i8> %x2) 2235 ret <64 x i8> %2 2236} 2237 2238define <16 x i32> @stack_fold_vpermi2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { 2239; CHECK-LABEL: stack_fold_vpermi2d: 2240; CHECK: # %bb.0: 2241; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2242; CHECK-NEXT: #APP 2243; CHECK-NEXT: nop 2244; CHECK-NEXT: #NO_APP 2245; CHECK-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 2246; CHECK-NEXT: retq 2247 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2248 %2 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) 2249 ret <16 x i32> %2 2250} 2251 2252define <8 x i64> @stack_fold_vpermi2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { 2253; CHECK-LABEL: stack_fold_vpermi2q: 2254; CHECK: # %bb.0: 2255; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2256; CHECK-NEXT: #APP 2257; CHECK-NEXT: nop 2258; CHECK-NEXT: #NO_APP 2259; CHECK-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 2260; CHECK-NEXT: retq 2261 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2262 %2 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2) 2263 ret <8 x i64> %2 2264} 2265 2266define <32 x i16> @stack_fold_vpermi2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { 2267; CHECK-LABEL: stack_fold_vpermi2w: 2268; CHECK: # %bb.0: 2269; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2270; CHECK-NEXT: #APP 2271; CHECK-NEXT: nop 2272; CHECK-NEXT: #NO_APP 2273; CHECK-NEXT: vpermi2w {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 2274; CHECK-NEXT: retq 2275 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2276 %2 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2) 2277 ret <32 x i16> %2 2278} 2279 2280define <8 x i64> @stack_fold_permq(<8 x i64> %a0) { 2281; CHECK-LABEL: stack_fold_permq: 2282; CHECK: # %bb.0: 2283; CHECK-NEXT: vmovups %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2284; CHECK-NEXT: #APP 2285; CHECK-NEXT: nop 2286; CHECK-NEXT: #NO_APP 2287; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 2288; CHECK-NEXT: # zmm0 = mem[3,2,2,3,7,6,6,7] 2289; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 2290; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 2291; CHECK-NEXT: retq 2292 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2293 %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7> 2294 ; add forces execution domain 2295 %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 2296 ret <8 x i64> %3 2297} 2298 2299define <8 x i64> @stack_fold_permq_mask(ptr %passthru, <8 x i64> %a0, i8 %mask) { 2300; CHECK-LABEL: stack_fold_permq_mask: 2301; CHECK: # %bb.0: 2302; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2303; CHECK-NEXT: #APP 2304; CHECK-NEXT: nop 2305; CHECK-NEXT: #NO_APP 2306; CHECK-NEXT: kmovd %esi, %k1 2307; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 2308; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload 2309; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,2,3,7,6,6,7] 2310; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 2311; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 2312; CHECK-NEXT: retq 2313 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2314 %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7> 2315 %3 = bitcast i8 %mask to <8 x i1> 2316 ; load needed to keep the operation from being scheduled above the asm block 2317 %4 = load <8 x i64>, ptr %passthru 2318 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 2319 ; add forces execution domain 2320 %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 2321 ret <8 x i64> %6 2322} 2323 2324define <8 x i64> @stack_fold_permq_maskz(ptr %passthru, <8 x i64> %a0, i8 %mask) { 2325; CHECK-LABEL: stack_fold_permq_maskz: 2326; CHECK: # %bb.0: 2327; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2328; CHECK-NEXT: #APP 2329; CHECK-NEXT: nop 2330; CHECK-NEXT: #NO_APP 2331; CHECK-NEXT: kmovd %esi, %k1 2332; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload 2333; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,2,3,7,6,6,7] 2334; CHECK-NEXT: retq 2335 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2336 %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7> 2337 %3 = bitcast i8 %mask to <8 x i1> 2338 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 2339 ret <8 x i64> %4 2340} 2341 
2342define <8 x i64> @stack_fold_permqvar(<8 x i64> %a0, <8 x i64> %a1) { 2343; CHECK-LABEL: stack_fold_permqvar: 2344; CHECK: # %bb.0: 2345; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2346; CHECK-NEXT: #APP 2347; CHECK-NEXT: nop 2348; CHECK-NEXT: #NO_APP 2349; CHECK-NEXT: vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 2350; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 2351; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 2352; CHECK-NEXT: retq 2353 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2354 %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0) 2355 ; add forces execution domain 2356 %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 2357 ret <8 x i64> %3 2358} 2359declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) nounwind readonly 2360 2361define <8 x i64> @stack_fold_permqvar_mask(ptr %passthru, <8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 2362; CHECK-LABEL: stack_fold_permqvar_mask: 2363; CHECK: # %bb.0: 2364; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2365; CHECK-NEXT: #APP 2366; CHECK-NEXT: nop 2367; CHECK-NEXT: #NO_APP 2368; CHECK-NEXT: kmovd %esi, %k1 2369; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 2370; CHECK-NEXT: vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload 2371; CHECK-NEXT: vpternlogd {{.*#+}} zmm0 = -1 2372; CHECK-NEXT: vpsubq %zmm0, %zmm1, %zmm0 2373; CHECK-NEXT: retq 2374 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2375 %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0) 2376 %3 = bitcast i8 %mask to <8 x i1> 2377 ; load needed to keep the operation from being scheduled above the asm block 2378 %4 = load <8 x i64>, ptr %passthru 2379 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 2380 ; add forces execution domain 2381 %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 2382 ret <8 x i64> %6 2383} 2384 2385define <64 x i8> @stack_fold_vpermt2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { 2386; CHECK-LABEL: stack_fold_vpermt2b: 2387; CHECK: # %bb.0: 2388; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2389; CHECK-NEXT: #APP 2390; CHECK-NEXT: nop 2391; CHECK-NEXT: #NO_APP 2392; CHECK-NEXT: vpermt2b {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 2393; CHECK-NEXT: retq 2394 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2395 %2 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) 2396 ret <64 x i8> %2 2397} 2398declare <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>) 2399 2400define <16 x i32> @stack_fold_vpermt2d(<16 x i32> 
%x0, <16 x i32> %x1, <16 x i32> %x2) { 2401; CHECK-LABEL: stack_fold_vpermt2d: 2402; CHECK: # %bb.0: 2403; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2404; CHECK-NEXT: #APP 2405; CHECK-NEXT: nop 2406; CHECK-NEXT: #NO_APP 2407; CHECK-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 2408; CHECK-NEXT: retq 2409 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2410 %2 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) 2411 ret <16 x i32> %2 2412} 2413declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>) 2414 2415define <8 x i64> @stack_fold_vpermt2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { 2416; CHECK-LABEL: stack_fold_vpermt2q: 2417; CHECK: # %bb.0: 2418; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2419; CHECK-NEXT: #APP 2420; CHECK-NEXT: nop 2421; CHECK-NEXT: #NO_APP 2422; CHECK-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 2423; CHECK-NEXT: retq 2424 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2425 %2 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) 2426 ret <8 x i64> %2 2427} 2428declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>) 2429 2430define <32 x i16> @stack_fold_vpermt2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { 2431; CHECK-LABEL: stack_fold_vpermt2w: 2432; CHECK: # %bb.0: 2433; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2434; CHECK-NEXT: #APP 2435; CHECK-NEXT: nop 2436; CHECK-NEXT: #NO_APP 2437; CHECK-NEXT: vpermt2w {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 2438; CHECK-NEXT: retq 2439 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2440 %2 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) 2441 ret <32 x i16> %2 2442} 2443declare <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>) 2444 2445define <32 x i16> @stack_fold_permwvar(<32 x i16> %a0, <32 x i16> %a1) { 2446; CHECK-LABEL: stack_fold_permwvar: 2447; CHECK: # %bb.0: 2448; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2449; CHECK-NEXT: #APP 2450; CHECK-NEXT: nop 2451; CHECK-NEXT: #NO_APP 2452; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 2453; CHECK-NEXT: retq 2454 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2455 %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0) 2456 ret <32 x i16> %2 2457} 2458declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) nounwind readonly 2459 2460define <32 x i16> @stack_fold_permwvar_mask(ptr %passthru, <32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 2461; CHECK-LABEL: stack_fold_permwvar_mask: 2462; CHECK: # %bb.0: 2463; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2464; CHECK-NEXT: #APP 2465; CHECK-NEXT: nop 2466; CHECK-NEXT: #NO_APP 2467; CHECK-NEXT: kmovd %esi, %k1 2468; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 2469; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 2470; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 2471; CHECK-NEXT: retq 2472 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2473 %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0) 2474 %3 = bitcast i32 %mask to <32 x i1> 2475 ; load needed to keep the operation from being scheduled above the asm block 2476 %4 = load <32 x i16>, ptr %passthru 2477 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 2478 ret <32 x i16> %5 2479} 2480 2481define <32 x i16> @stack_fold_permwvar_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 2482; CHECK-LABEL: stack_fold_permwvar_maskz: 2483; CHECK: # %bb.0: 2484; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2485; CHECK-NEXT: #APP 2486; CHECK-NEXT: nop 2487; CHECK-NEXT: #NO_APP 2488; CHECK-NEXT: kmovd %edi, %k1 2489; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 2490; CHECK-NEXT: retq 2491 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2492 %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0) 2493 %3 = bitcast i32 %mask to <32 x i1> 2494 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 2495 ret <32 x i16> %4 2496} 2497 2498define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> %a1) { 2499; CHECK-LABEL: stack_fold_pextrd: 2500; CHECK: # %bb.0: 2501; CHECK-NEXT: pushq %rbp 2502; CHECK-NEXT: .cfi_def_cfa_offset 16 2503; CHECK-NEXT: pushq %r15 2504; CHECK-NEXT: .cfi_def_cfa_offset 24 2505; CHECK-NEXT: pushq %r14 2506; CHECK-NEXT: .cfi_def_cfa_offset 32 2507; CHECK-NEXT: pushq %r13 2508; CHECK-NEXT: .cfi_def_cfa_offset 40 2509; CHECK-NEXT: pushq %r12 2510; CHECK-NEXT: .cfi_def_cfa_offset 48 2511; CHECK-NEXT: pushq %rbx 2512; CHECK-NEXT: .cfi_def_cfa_offset 56 2513; CHECK-NEXT: .cfi_offset %rbx, -56 2514; CHECK-NEXT: .cfi_offset %r12, -48 2515; CHECK-NEXT: .cfi_offset %r13, -40 2516; CHECK-NEXT: .cfi_offset %r14, -32 2517; CHECK-NEXT: .cfi_offset %r15, -24 2518; CHECK-NEXT: .cfi_offset %rbp, -16 2519; 
CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2520; CHECK-NEXT: vpextrd $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill 2521; CHECK-NEXT: #APP 2522; CHECK-NEXT: nop 2523; CHECK-NEXT: #NO_APP 2524; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload 2525; CHECK-NEXT: popq %rbx 2526; CHECK-NEXT: .cfi_def_cfa_offset 48 2527; CHECK-NEXT: popq %r12 2528; CHECK-NEXT: .cfi_def_cfa_offset 40 2529; CHECK-NEXT: popq %r13 2530; CHECK-NEXT: .cfi_def_cfa_offset 32 2531; CHECK-NEXT: popq %r14 2532; CHECK-NEXT: .cfi_def_cfa_offset 24 2533; CHECK-NEXT: popq %r15 2534; CHECK-NEXT: .cfi_def_cfa_offset 16 2535; CHECK-NEXT: popq %rbp 2536; CHECK-NEXT: .cfi_def_cfa_offset 8 2537; CHECK-NEXT: retq 2538 ; add forces execution domain 2539 %1 = add <4 x i32> %a0, %a1 2540 %2 = extractelement <4 x i32> %1, i32 1 2541 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 2542 ret i32 %2 2543} 2544 2545define i64 @stack_fold_pextrq(<2 x i64> %a0) { 2546; CHECK-LABEL: stack_fold_pextrq: 2547; CHECK: # %bb.0: 2548; CHECK-NEXT: pushq %rbp 2549; CHECK-NEXT: .cfi_def_cfa_offset 16 2550; CHECK-NEXT: pushq %r15 2551; CHECK-NEXT: .cfi_def_cfa_offset 24 2552; CHECK-NEXT: pushq %r14 2553; CHECK-NEXT: .cfi_def_cfa_offset 32 2554; CHECK-NEXT: pushq %r13 2555; CHECK-NEXT: .cfi_def_cfa_offset 40 2556; CHECK-NEXT: pushq %r12 2557; CHECK-NEXT: .cfi_def_cfa_offset 48 2558; CHECK-NEXT: pushq %rbx 2559; CHECK-NEXT: .cfi_def_cfa_offset 56 2560; CHECK-NEXT: .cfi_offset %rbx, -56 2561; CHECK-NEXT: .cfi_offset %r12, -48 2562; CHECK-NEXT: .cfi_offset %r13, -40 2563; CHECK-NEXT: .cfi_offset %r14, -32 2564; CHECK-NEXT: .cfi_offset %r15, -24 2565; CHECK-NEXT: .cfi_offset %rbp, -16 2566; CHECK-NEXT: vpextrq $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill 2567; CHECK-NEXT: #APP 2568; CHECK-NEXT: nop 2569; CHECK-NEXT: #NO_APP 2570; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload 2571; CHECK-NEXT: popq %rbx 2572; CHECK-NEXT: .cfi_def_cfa_offset 48 2573; CHECK-NEXT: popq %r12 2574; CHECK-NEXT: .cfi_def_cfa_offset 40 2575; CHECK-NEXT: popq %r13 2576; CHECK-NEXT: .cfi_def_cfa_offset 32 2577; CHECK-NEXT: popq %r14 2578; CHECK-NEXT: .cfi_def_cfa_offset 24 2579; CHECK-NEXT: popq %r15 2580; CHECK-NEXT: .cfi_def_cfa_offset 16 2581; CHECK-NEXT: popq %rbp 2582; CHECK-NEXT: .cfi_def_cfa_offset 8 2583; CHECK-NEXT: retq 2584 %1 = extractelement <2 x i64> %a0, i32 1 2585 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 2586 ret i64 %1 2587} 2588 2589define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) { 2590; CHECK-LABEL: stack_fold_pinsrb: 2591; CHECK: # %bb.0: 2592; CHECK-NEXT: pushq %rbp 2593; CHECK-NEXT: .cfi_def_cfa_offset 16 2594; CHECK-NEXT: pushq %r15 2595; CHECK-NEXT: .cfi_def_cfa_offset 24 2596; CHECK-NEXT: pushq %r14 2597; CHECK-NEXT: .cfi_def_cfa_offset 32 2598; CHECK-NEXT: pushq %r13 2599; CHECK-NEXT: .cfi_def_cfa_offset 40 2600; CHECK-NEXT: pushq %r12 2601; CHECK-NEXT: .cfi_def_cfa_offset 48 2602; CHECK-NEXT: pushq %rbx 2603; CHECK-NEXT: .cfi_def_cfa_offset 56 2604; CHECK-NEXT: .cfi_offset %rbx, -56 2605; CHECK-NEXT: .cfi_offset %r12, -48 2606; CHECK-NEXT: .cfi_offset %r13, -40 2607; CHECK-NEXT: .cfi_offset %r14, -32 2608; CHECK-NEXT: .cfi_offset %r15, -24 2609; CHECK-NEXT: .cfi_offset %rbp, -16 2610; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 2611; CHECK-NEXT: #APP 2612; 
CHECK-NEXT: nop 2613; CHECK-NEXT: #NO_APP 2614; CHECK-NEXT: vpinsrb $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 2615; CHECK-NEXT: popq %rbx 2616; CHECK-NEXT: .cfi_def_cfa_offset 48 2617; CHECK-NEXT: popq %r12 2618; CHECK-NEXT: .cfi_def_cfa_offset 40 2619; CHECK-NEXT: popq %r13 2620; CHECK-NEXT: .cfi_def_cfa_offset 32 2621; CHECK-NEXT: popq %r14 2622; CHECK-NEXT: .cfi_def_cfa_offset 24 2623; CHECK-NEXT: popq %r15 2624; CHECK-NEXT: .cfi_def_cfa_offset 16 2625; CHECK-NEXT: popq %rbp 2626; CHECK-NEXT: .cfi_def_cfa_offset 8 2627; CHECK-NEXT: retq 2628 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 2629 %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1 2630 ret <16 x i8> %2 2631} 2632 2633define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) { 2634; CHECK-LABEL: stack_fold_pinsrd: 2635; CHECK: # %bb.0: 2636; CHECK-NEXT: pushq %rbp 2637; CHECK-NEXT: .cfi_def_cfa_offset 16 2638; CHECK-NEXT: pushq %r15 2639; CHECK-NEXT: .cfi_def_cfa_offset 24 2640; CHECK-NEXT: pushq %r14 2641; CHECK-NEXT: .cfi_def_cfa_offset 32 2642; CHECK-NEXT: pushq %r13 2643; CHECK-NEXT: .cfi_def_cfa_offset 40 2644; CHECK-NEXT: pushq %r12 2645; CHECK-NEXT: .cfi_def_cfa_offset 48 2646; CHECK-NEXT: pushq %rbx 2647; CHECK-NEXT: .cfi_def_cfa_offset 56 2648; CHECK-NEXT: .cfi_offset %rbx, -56 2649; CHECK-NEXT: .cfi_offset %r12, -48 2650; CHECK-NEXT: .cfi_offset %r13, -40 2651; CHECK-NEXT: .cfi_offset %r14, -32 2652; CHECK-NEXT: .cfi_offset %r15, -24 2653; CHECK-NEXT: .cfi_offset %rbp, -16 2654; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 2655; CHECK-NEXT: #APP 2656; CHECK-NEXT: nop 2657; CHECK-NEXT: #NO_APP 2658; CHECK-NEXT: vpinsrd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 2659; CHECK-NEXT: popq %rbx 2660; CHECK-NEXT: .cfi_def_cfa_offset 48 2661; CHECK-NEXT: popq %r12 2662; CHECK-NEXT: .cfi_def_cfa_offset 40 2663; CHECK-NEXT: popq %r13 2664; CHECK-NEXT: .cfi_def_cfa_offset 32 2665; CHECK-NEXT: popq %r14 2666; CHECK-NEXT: .cfi_def_cfa_offset 24 2667; CHECK-NEXT: popq %r15 2668; CHECK-NEXT: .cfi_def_cfa_offset 16 2669; CHECK-NEXT: popq %rbp 2670; CHECK-NEXT: .cfi_def_cfa_offset 8 2671; CHECK-NEXT: retq 2672 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 2673 %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1 2674 ret <4 x i32> %2 2675} 2676 2677define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) { 2678; CHECK-LABEL: stack_fold_pinsrq: 2679; CHECK: # %bb.0: 2680; CHECK-NEXT: pushq %rbp 2681; CHECK-NEXT: .cfi_def_cfa_offset 16 2682; CHECK-NEXT: pushq %r15 2683; CHECK-NEXT: .cfi_def_cfa_offset 24 2684; CHECK-NEXT: pushq %r14 2685; CHECK-NEXT: .cfi_def_cfa_offset 32 2686; CHECK-NEXT: pushq %r13 2687; CHECK-NEXT: .cfi_def_cfa_offset 40 2688; CHECK-NEXT: pushq %r12 2689; CHECK-NEXT: .cfi_def_cfa_offset 48 2690; CHECK-NEXT: pushq %rbx 2691; CHECK-NEXT: .cfi_def_cfa_offset 56 2692; CHECK-NEXT: .cfi_offset %rbx, -56 2693; CHECK-NEXT: .cfi_offset %r12, -48 2694; CHECK-NEXT: .cfi_offset %r13, -40 2695; CHECK-NEXT: .cfi_offset %r14, -32 2696; CHECK-NEXT: .cfi_offset %r15, -24 2697; CHECK-NEXT: .cfi_offset %rbp, -16 2698; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 2699; CHECK-NEXT: #APP 2700; CHECK-NEXT: nop 2701; CHECK-NEXT: #NO_APP 2702; CHECK-NEXT: vpinsrq $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload 2703; 
CHECK-NEXT: popq %rbx 2704; CHECK-NEXT: .cfi_def_cfa_offset 48 2705; CHECK-NEXT: popq %r12 2706; CHECK-NEXT: .cfi_def_cfa_offset 40 2707; CHECK-NEXT: popq %r13 2708; CHECK-NEXT: .cfi_def_cfa_offset 32 2709; CHECK-NEXT: popq %r14 2710; CHECK-NEXT: .cfi_def_cfa_offset 24 2711; CHECK-NEXT: popq %r15 2712; CHECK-NEXT: .cfi_def_cfa_offset 16 2713; CHECK-NEXT: popq %rbp 2714; CHECK-NEXT: .cfi_def_cfa_offset 8 2715; CHECK-NEXT: retq 2716 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 2717 %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1 2718 ret <2 x i64> %2 2719} 2720 2721define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) { 2722; CHECK-LABEL: stack_fold_pinsrw: 2723; CHECK: # %bb.0: 2724; CHECK-NEXT: pushq %rbp 2725; CHECK-NEXT: .cfi_def_cfa_offset 16 2726; CHECK-NEXT: pushq %r15 2727; CHECK-NEXT: .cfi_def_cfa_offset 24 2728; CHECK-NEXT: pushq %r14 2729; CHECK-NEXT: .cfi_def_cfa_offset 32 2730; CHECK-NEXT: pushq %r13 2731; CHECK-NEXT: .cfi_def_cfa_offset 40 2732; CHECK-NEXT: pushq %r12 2733; CHECK-NEXT: .cfi_def_cfa_offset 48 2734; CHECK-NEXT: pushq %rbx 2735; CHECK-NEXT: .cfi_def_cfa_offset 56 2736; CHECK-NEXT: .cfi_offset %rbx, -56 2737; CHECK-NEXT: .cfi_offset %r12, -48 2738; CHECK-NEXT: .cfi_offset %r13, -40 2739; CHECK-NEXT: .cfi_offset %r14, -32 2740; CHECK-NEXT: .cfi_offset %r15, -24 2741; CHECK-NEXT: .cfi_offset %rbp, -16 2742; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 2743; CHECK-NEXT: #APP 2744; CHECK-NEXT: nop 2745; CHECK-NEXT: #NO_APP 2746; CHECK-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 2747; CHECK-NEXT: popq %rbx 2748; CHECK-NEXT: .cfi_def_cfa_offset 48 2749; CHECK-NEXT: popq %r12 2750; CHECK-NEXT: .cfi_def_cfa_offset 40 2751; CHECK-NEXT: popq %r13 2752; CHECK-NEXT: .cfi_def_cfa_offset 32 2753; CHECK-NEXT: popq %r14 2754; CHECK-NEXT: .cfi_def_cfa_offset 24 2755; CHECK-NEXT: popq %r15 2756; CHECK-NEXT: .cfi_def_cfa_offset 16 2757; CHECK-NEXT: popq %rbp 2758; CHECK-NEXT: .cfi_def_cfa_offset 8 2759; CHECK-NEXT: retq 2760 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 2761 %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1 2762 ret <8 x i16> %2 2763} 2764 2765define <16 x i32> @stack_fold_vplzcntd(<16 x i32> %a0) { 2766; CHECK-LABEL: stack_fold_vplzcntd: 2767; CHECK: # %bb.0: 2768; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2769; CHECK-NEXT: #APP 2770; CHECK-NEXT: nop 2771; CHECK-NEXT: #NO_APP 2772; CHECK-NEXT: vplzcntd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 2773; CHECK-NEXT: retq 2774 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2775 %2 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a0, i1 false) 2776 ret <16 x i32> %2 2777} 2778 2779define <8 x i64> @stack_fold_vplzcntq(<8 x i64> %a0) { 2780; CHECK-LABEL: stack_fold_vplzcntq: 2781; CHECK: # %bb.0: 2782; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2783; CHECK-NEXT: #APP 2784; CHECK-NEXT: nop 2785; CHECK-NEXT: #NO_APP 2786; CHECK-NEXT: vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 2787; 
CHECK-NEXT: retq
2788 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2789 %2 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a0, i1 false)
2790 ret <8 x i64> %2
2791}
2792
2793define <32 x i16> @stack_fold_pmaddubsw_zmm(<64 x i8> %a0, <64 x i8> %a1) {
2794; CHECK-LABEL: stack_fold_pmaddubsw_zmm:
2795; CHECK: # %bb.0:
2796; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2797; CHECK-NEXT: #APP
2798; CHECK-NEXT: nop
2799; CHECK-NEXT: #NO_APP
2800; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2801; CHECK-NEXT: retq
2802 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2803 %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1)
2804 ret <32 x i16> %2
2805}
2806declare <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>) nounwind readnone
2807
2808define <32 x i16> @stack_fold_pmaddubsw_zmm_mask(ptr %passthru, <64 x i8> %a0, <64 x i8> %a1, i32 %mask) {
2809; CHECK-LABEL: stack_fold_pmaddubsw_zmm_mask:
2810; CHECK: # %bb.0:
2811; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2812; CHECK-NEXT: #APP
2813; CHECK-NEXT: nop
2814; CHECK-NEXT: #NO_APP
2815; CHECK-NEXT: kmovd %esi, %k1
2816; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
2817; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2818; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2819; CHECK-NEXT: retq
2820 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2821 %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1)
2822 %3 = bitcast i32 %mask to <32 x i1>
2823 ; load needed to keep the operation from being scheduled above the asm block
2824 %4 = load <32 x i16>, ptr %passthru
2825 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
2826 ret <32 x i16> %5
2827}
2828
2829define <32 x i16> @stack_fold_pmaddubsw_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i32 %mask) {
2830; CHECK-LABEL: stack_fold_pmaddubsw_zmm_maskz:
2831; CHECK: # %bb.0:
2832; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2833; CHECK-NEXT: #APP
2834; CHECK-NEXT: nop
2835; CHECK-NEXT: #NO_APP
2836; CHECK-NEXT: kmovd %edi, %k1
2837; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2838; CHECK-NEXT: retq
2839 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2840 %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1)
2841 %3 = bitcast i32 %mask to <32 x i1>
2842 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
2843 ret <32 x i16> %4
2844}
2845
2846define <16 x i32> @stack_fold_pmaddwd_zmm(<32 x i16> %a0, <32 x i16> %a1) {
2847; CHECK-LABEL: stack_fold_pmaddwd_zmm:
2848; CHECK: # %bb.0:
2849; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2850; CHECK-NEXT: #APP
2851; CHECK-NEXT: nop
2852; CHECK-NEXT: #NO_APP
2853; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2854; CHECK-NEXT: retq
2855 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2856 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1)
2857 ret <16 x i32> %2
2858}
2859declare <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>) nounwind readnone
2860
2861define <16 x i32> @stack_fold_pmaddwd_zmm_commuted(<32 x i16> %a0, <32 x i16> %a1) {
2862; CHECK-LABEL: stack_fold_pmaddwd_zmm_commuted:
2863; CHECK: # %bb.0:
2864; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2865; CHECK-NEXT: #APP
2866; CHECK-NEXT: nop
2867; CHECK-NEXT: #NO_APP
2868; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2869; CHECK-NEXT: retq
2870 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2871 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0)
2872 ret <16 x i32> %2
2873}
2874
2875define <16 x i32> @stack_fold_pmaddwd_zmm_mask(ptr %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2876; CHECK-LABEL: stack_fold_pmaddwd_zmm_mask:
2877; CHECK: # %bb.0:
2878; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2879; CHECK-NEXT: #APP
2880; CHECK-NEXT: nop
2881; CHECK-NEXT: #NO_APP
2882; CHECK-NEXT: kmovd %esi, %k1
2883; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
2884; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2885; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2886; CHECK-NEXT: retq
2887 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2888 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1)
2889 %3 = bitcast i16 %mask to <16 x i1>
2890 ; load needed to keep the operation from being scheduled above the asm block
2891 %4 = load <16 x i32>, ptr %passthru
2892 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
2893 ret <16 x i32> %5
2894}
2895
2896define <16 x i32> @stack_fold_pmaddwd_zmm_mask_commuted(ptr %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2897; CHECK-LABEL: stack_fold_pmaddwd_zmm_mask_commuted:
2898; CHECK: # %bb.0:
2899; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2900; CHECK-NEXT: #APP
2901; CHECK-NEXT: nop
2902; CHECK-NEXT: #NO_APP
2903; CHECK-NEXT: kmovd %esi, %k1
2904; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
2905; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2906; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2907; CHECK-NEXT: retq
2908 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2909 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0)
2910 %3 = bitcast i16 %mask to <16 x i1>
2911 ; load needed to keep the operation from being scheduled above the asm block
2912 %4 = load <16 x i32>, ptr %passthru
2913 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
2914 ret <16 x i32> %5
2915}
2916
2917define <16 x i32> @stack_fold_pmaddwd_zmm_maskz(ptr %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2918; CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz:
2919; CHECK: # %bb.0:
2920; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2921; CHECK-NEXT: #APP
2922; CHECK-NEXT: nop
2923; CHECK-NEXT: #NO_APP
2924; CHECK-NEXT: kmovd %esi, %k1
2925; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2926; CHECK-NEXT: retq
2927 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2928 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1)
2929 %3 = bitcast i16 %mask to <16 x i1>
2930 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
2931 ret <16 x i32> %4
2932}
2933
2934define <16 x i32> @stack_fold_pmaddwd_zmm_maskz_commuted(ptr %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2935; CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz_commuted:
2936; CHECK: # %bb.0:
2937; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2938; CHECK-NEXT: #APP
2939; CHECK-NEXT: nop
2940; CHECK-NEXT: #NO_APP
2941; CHECK-NEXT: kmovd %esi, %k1
2942; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2943; CHECK-NEXT: retq
2944 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2945 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0)
2946 %3 = bitcast i16 %mask to <16 x i1>
2947 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
2948 ret <16 x i32> %4
2949}
2950
2951define <64 x i8> @stack_fold_pmaxsb(<64 x i8> %a0, <64 x i8> %a1) {
2952; CHECK-LABEL: stack_fold_pmaxsb:
2953; CHECK: # %bb.0:
2954; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2955; CHECK-NEXT: #APP
2956; CHECK-NEXT: nop
2957; CHECK-NEXT: #NO_APP
2958; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2959; CHECK-NEXT: retq
2960 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2961 %2 = icmp sgt <64 x i8> %a0, %a1
2962 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
2963 ret <64 x i8> %3
2964}
2965
2966define <64 x i8> @stack_fold_pmaxsb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
2967; CHECK-LABEL: stack_fold_pmaxsb_commuted:
2968; CHECK: # %bb.0:
2969; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2970; CHECK-NEXT: #APP
2971; CHECK-NEXT: nop
2972; CHECK-NEXT: #NO_APP
2973; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2974; CHECK-NEXT: retq
2975 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2976 %2 = icmp sgt <64 x i8> %a1, %a0
2977 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
2978 ret <64 x i8> %3
2979}
2980
2981define <64 x i8> @stack_fold_pmaxsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) {
2982; CHECK-LABEL: stack_fold_pmaxsb_mask:
2983; CHECK: # %bb.0:
2984; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2985; CHECK-NEXT: #APP
2986; CHECK-NEXT: nop
2987; CHECK-NEXT: #NO_APP
2988; CHECK-NEXT: kmovq %rdi, %k1
2989; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
2990; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2991; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2992; CHECK-NEXT: retq
2993 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2994 %2 = icmp sgt <64 x i8> %a0, %a1
2995 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
2996 %4 = bitcast i64 %mask to <64 x i1>
2997 ; load needed to keep the operation from being scheduled above the asm block
2998 %5 = load <64 x i8>, ptr %passthru
2999 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3000 ret <64 x i8> %6
3001}
3002
3003define <64 x i8> @stack_fold_pmaxsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) {
3004; CHECK-LABEL: stack_fold_pmaxsb_mask_commuted:
3005; CHECK: # %bb.0:
3006; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3007; CHECK-NEXT: #APP
3008; CHECK-NEXT: nop
3009; CHECK-NEXT: #NO_APP
3010; CHECK-NEXT: kmovq %rdi, %k1
3011; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3012; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3013; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3014; CHECK-NEXT: retq
3015 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3016 %2 = icmp sgt <64 x i8> %a1, %a0
3017 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3018 %4 = bitcast i64 %mask to <64 x i1>
3019 ; load needed to keep the operation from being scheduled above the asm block
3020 %5 = load <64 x i8>, ptr %passthru
3021 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3022 ret <64 x i8> %6
3023}
3024
3025define <64 x i8> @stack_fold_pmaxsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3026; CHECK-LABEL: stack_fold_pmaxsb_maskz:
3027; CHECK: # %bb.0:
3028; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3029; CHECK-NEXT: #APP
3030; CHECK-NEXT: nop
3031; CHECK-NEXT: #NO_APP
3032; CHECK-NEXT: kmovq %rdi, %k1
3033; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3034; CHECK-NEXT: retq
3035 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3036 %2 = icmp sgt <64 x i8> %a0, %a1
3037 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3038 %4 = bitcast i64 %mask to <64 x i1>
3039 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3040 ret <64 x i8> %5
3041}
3042
3043define <64 x i8> @stack_fold_pmaxsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3044; CHECK-LABEL: stack_fold_pmaxsb_maskz_commuted:
3045; CHECK: # %bb.0:
3046; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3047; CHECK-NEXT: #APP
3048; CHECK-NEXT: nop
3049; CHECK-NEXT: #NO_APP
3050; CHECK-NEXT: kmovq %rdi, %k1
3051; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3052; CHECK-NEXT: retq
3053 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3054 %2 = icmp sgt <64 x i8> %a1, %a0
3055 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3056 %4 = bitcast i64 %mask to <64 x i1>
3057 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3058 ret <64 x i8> %5
3059}
3060
3061define <16 x i32> @stack_fold_pmaxsd(<16 x i32> %a0, <16 x i32> %a1) {
3062; CHECK-LABEL: stack_fold_pmaxsd:
3063; CHECK: # %bb.0:
3064; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3065; CHECK-NEXT: #APP
3066; CHECK-NEXT: nop
3067; CHECK-NEXT: #NO_APP
3068; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3069; CHECK-NEXT: retq
3070 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3071 %2 = icmp sgt <16 x i32> %a0, %a1
3072 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3073 ret <16 x i32> %3
3074}
3075
3076define <16 x i32> @stack_fold_pmaxsd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
3077; CHECK-LABEL: stack_fold_pmaxsd_commuted:
3078; CHECK: # %bb.0:
3079; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3080; CHECK-NEXT: #APP
3081; CHECK-NEXT: nop
3082; CHECK-NEXT: #NO_APP
3083; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3084; CHECK-NEXT: retq
3085 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3086 %2 = icmp sgt <16 x i32> %a1, %a0
3087 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3088 ret <16 x i32> %3
3089}
3090
3091define <16 x i32> @stack_fold_pmaxsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) {
3092; CHECK-LABEL: stack_fold_pmaxsd_mask:
3093; CHECK: # %bb.0:
3094; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3095; CHECK-NEXT: #APP
3096; CHECK-NEXT: nop
3097; CHECK-NEXT: #NO_APP
3098; CHECK-NEXT: kmovd %edi, %k1
3099; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3100; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3101; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3102; CHECK-NEXT: retq
3103 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3104 %2 = icmp sgt <16 x i32> %a0, %a1
3105 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3106 %4 = bitcast i16 %mask to <16 x i1>
3107 ; load needed to keep the operation from being scheduled above the asm block
3108 %5 = load <16 x i32>, ptr %passthru
3109 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3110 ret <16 x i32> %6
3111}
3112
3113define <16 x i32> @stack_fold_pmaxsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) {
3114; CHECK-LABEL: stack_fold_pmaxsd_mask_commuted:
3115; CHECK: # %bb.0:
3116; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3117; CHECK-NEXT: #APP
3118; CHECK-NEXT: nop
3119; CHECK-NEXT: #NO_APP
3120; CHECK-NEXT: kmovd %edi, %k1
3121; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3122; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3123; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3124; CHECK-NEXT: retq
3125 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3126 %2 = icmp sgt <16 x i32> %a1, %a0
3127 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3128 %4 = bitcast i16 %mask to <16 x i1>
3129 ; load needed to keep the operation from being scheduled above the asm block
3130 %5 = load <16 x i32>, ptr %passthru
3131 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3132 ret <16 x i32> %6
3133}
3134
3135define <16 x i32> @stack_fold_pmaxsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
3136; CHECK-LABEL: stack_fold_pmaxsd_maskz:
3137; CHECK: # %bb.0:
3138; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3139; CHECK-NEXT: #APP
3140; CHECK-NEXT: nop
3141; CHECK-NEXT: #NO_APP
3142; CHECK-NEXT: kmovd %edi, %k1
3143; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3144; CHECK-NEXT: retq
3145 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3146 %2 = icmp sgt <16 x i32> %a0, %a1
3147 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3148 %4 = bitcast i16 %mask to <16 x i1>
3149 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
3150 ret <16 x i32> %5
3151}
3152
3153define <16 x i32> @stack_fold_pmaxsd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
3154; CHECK-LABEL: stack_fold_pmaxsd_maskz_commuted:
3155; CHECK: # %bb.0:
3156; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3157; CHECK-NEXT: #APP
3158; CHECK-NEXT: nop
3159; CHECK-NEXT: #NO_APP
3160; CHECK-NEXT: kmovd %edi, %k1
3161; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3162; CHECK-NEXT: retq
3163 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3164 %2 = icmp sgt <16 x i32> %a1, %a0
3165 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3166 %4 = bitcast i16 %mask to <16 x i1>
3167 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
3168 ret <16 x i32> %5
3169}
3170
3171define <8 x i64> @stack_fold_pmaxsq(<8 x i64> %a0, <8 x i64> %a1) {
3172; CHECK-LABEL: stack_fold_pmaxsq:
3173; CHECK: # %bb.0:
3174; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3175; CHECK-NEXT: #APP
3176; CHECK-NEXT: nop
3177; CHECK-NEXT: #NO_APP
3178; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3179; CHECK-NEXT: retq
3180 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3181 %2 = icmp sgt <8 x i64> %a0, %a1
3182 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3183 ret <8 x i64> %3
3184}
3185
3186define <8 x i64> @stack_fold_pmaxsq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
3187; CHECK-LABEL: stack_fold_pmaxsq_commuted:
3188; CHECK: # %bb.0:
3189; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3190; CHECK-NEXT: #APP
3191; CHECK-NEXT: nop
3192; CHECK-NEXT: #NO_APP
3193; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3194; CHECK-NEXT: retq
3195 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3196 %2 = icmp sgt <8 x i64> %a1, %a0
3197 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3198 ret <8 x i64> %3
3199}
3200
3201define <8 x i64> @stack_fold_pmaxsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) {
3202; CHECK-LABEL: stack_fold_pmaxsq_mask:
3203; CHECK: # %bb.0:
3204; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3205; CHECK-NEXT: #APP
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <8 x i64> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
  %4 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %5 = load <8 x i64>, ptr %passthru
  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
  ret <8 x i64> %6
}

define <8 x i64> @stack_fold_pmaxsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_pmaxsq_mask_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <8 x i64> %a1, %a0
  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
  %4 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %5 = load <8 x i64>, ptr %passthru
  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
  ret <8 x i64> %6
}

define <8 x i64> @stack_fold_pmaxsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_pmaxsq_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <8 x i64> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
  ret <8 x i64> %5
}

define <8 x i64> @stack_fold_pmaxsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_pmaxsq_maskz_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <8 x i64> %a1, %a0
  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
  ret <8 x i64> %5
}

define <32 x i16> @stack_fold_pmaxsw(<32 x i16> %a0, <32 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaxsw:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <32 x i16> %a0, %a1
  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
  ret <32 x i16> %3
}

define <32 x i16> @stack_fold_pmaxsw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaxsw_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <32 x i16> %a1, %a0
  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
  ret <32 x i16> %3
}

define <32 x i16> @stack_fold_pmaxsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_pmaxsw_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <32 x i16> %a0, %a1
  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
  %4 = bitcast i32 %mask to <32 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %5 = load <32 x i16>, ptr %passthru
  %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
  ret <32 x i16> %6
}

define <32 x i16> @stack_fold_pmaxsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_pmaxsw_mask_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <32 x i16> %a1, %a0
  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
  %4 = bitcast i32 %mask to <32 x i1>
  ; load needed to keep the operation from being scheduled about the asm block
  %5 = load <32 x i16>, ptr %passthru
  %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
  ret <32 x i16> %6
}

define <32 x i16> @stack_fold_pmaxsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
; CHECK-LABEL: stack_fold_pmaxsw_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <32 x i16> %a0, %a1
  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
  %4 = bitcast i32 %mask to <32 x i1>
  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
  ret <32 x i16> %5
}

define <32 x i16> @stack_fold_pmaxsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
; CHECK-LABEL: stack_fold_pmaxsw_maskz_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = icmp sgt <32 x i16> %a1, %a0
  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
  %4 = bitcast i32 %mask to <32 x i1>
  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
  ret <32 x i16> %5
}

define <64 x i8> @stack_fold_pmaxub(<64 x
i8> %a0, <64 x i8> %a1) { 3392; CHECK-LABEL: stack_fold_pmaxub: 3393; CHECK: # %bb.0: 3394; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3395; CHECK-NEXT: #APP 3396; CHECK-NEXT: nop 3397; CHECK-NEXT: #NO_APP 3398; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3399; CHECK-NEXT: retq 3400 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3401 %2 = icmp ugt <64 x i8> %a0, %a1 3402 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 3403 ret <64 x i8> %3 3404} 3405 3406define <64 x i8> @stack_fold_pmaxub_commuted(<64 x i8> %a0, <64 x i8> %a1) { 3407; CHECK-LABEL: stack_fold_pmaxub_commuted: 3408; CHECK: # %bb.0: 3409; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3410; CHECK-NEXT: #APP 3411; CHECK-NEXT: nop 3412; CHECK-NEXT: #NO_APP 3413; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3414; CHECK-NEXT: retq 3415 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3416 %2 = icmp ugt <64 x i8> %a1, %a0 3417 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 3418 ret <64 x i8> %3 3419} 3420 3421define <64 x i8> @stack_fold_pmaxub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) { 3422; CHECK-LABEL: stack_fold_pmaxub_mask: 3423; CHECK: # %bb.0: 3424; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3425; CHECK-NEXT: #APP 3426; CHECK-NEXT: nop 3427; CHECK-NEXT: #NO_APP 3428; CHECK-NEXT: kmovq %rdi, %k1 3429; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3430; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3431; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3432; CHECK-NEXT: retq 3433 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3434 %2 = icmp ugt <64 x i8> %a0, %a1 3435 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 3436 %4 = bitcast i64 %mask to <64 x i1> 3437 ; load needed to keep the operation from being scheduled about the asm block 3438 %5 = load <64 x i8>, ptr %passthru 3439 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5 3440 ret <64 x i8> %6 3441} 3442 3443define <64 x i8> @stack_fold_pmaxub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) { 3444; CHECK-LABEL: stack_fold_pmaxub_mask_commuted: 3445; CHECK: # %bb.0: 3446; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3447; CHECK-NEXT: #APP 3448; CHECK-NEXT: nop 3449; CHECK-NEXT: #NO_APP 3450; CHECK-NEXT: kmovq %rdi, %k1 3451; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3452; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3453; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3454; CHECK-NEXT: retq 3455 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3456 %2 = icmp ugt <64 x i8> %a1, %a0 3457 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 3458 %4 = bitcast i64 %mask to <64 x i1> 3459 ; load needed to keep the operation from being scheduled about the asm block 3460 %5 = load <64 x i8>, ptr %passthru 3461 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5 3462 ret <64 x i8> %6 3463} 3464 3465define <64 x i8> @stack_fold_pmaxub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 3466; CHECK-LABEL: stack_fold_pmaxub_maskz: 3467; CHECK: # %bb.0: 3468; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3469; CHECK-NEXT: #APP 3470; CHECK-NEXT: nop 3471; CHECK-NEXT: #NO_APP 3472; CHECK-NEXT: kmovq %rdi, %k1 3473; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3474; CHECK-NEXT: retq 3475 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3476 %2 = icmp ugt <64 x i8> %a0, %a1 3477 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 3478 %4 = bitcast i64 %mask to <64 x i1> 3479 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer 3480 ret <64 x i8> %5 3481} 3482 3483define <64 x i8> @stack_fold_pmaxub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 3484; CHECK-LABEL: stack_fold_pmaxub_maskz_commuted: 3485; CHECK: # %bb.0: 3486; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3487; CHECK-NEXT: #APP 3488; CHECK-NEXT: nop 3489; CHECK-NEXT: #NO_APP 3490; CHECK-NEXT: kmovq %rdi, %k1 3491; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3492; CHECK-NEXT: retq 3493 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3494 %2 = icmp ugt <64 x i8> %a1, %a0 3495 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 3496 %4 = bitcast i64 %mask to <64 x i1> 3497 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer 3498 ret <64 x i8> %5 3499} 3500 3501define <16 x i32> @stack_fold_pmaxud(<16 x i32> %a0, <16 x i32> %a1) { 3502; CHECK-LABEL: stack_fold_pmaxud: 3503; CHECK: # %bb.0: 3504; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3505; CHECK-NEXT: #APP 3506; CHECK-NEXT: nop 3507; CHECK-NEXT: #NO_APP 3508; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3509; CHECK-NEXT: retq 3510 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3511 %2 = icmp ugt <16 x i32> %a0, %a1 3512 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 3513 ret <16 x i32> %3 
3514} 3515 3516define <16 x i32> @stack_fold_pmaxud_commuted(<16 x i32> %a0, <16 x i32> %a1) { 3517; CHECK-LABEL: stack_fold_pmaxud_commuted: 3518; CHECK: # %bb.0: 3519; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3520; CHECK-NEXT: #APP 3521; CHECK-NEXT: nop 3522; CHECK-NEXT: #NO_APP 3523; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3524; CHECK-NEXT: retq 3525 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3526 %2 = icmp ugt <16 x i32> %a1, %a0 3527 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 3528 ret <16 x i32> %3 3529} 3530 3531define <16 x i32> @stack_fold_pmaxud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) { 3532; CHECK-LABEL: stack_fold_pmaxud_mask: 3533; CHECK: # %bb.0: 3534; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3535; CHECK-NEXT: #APP 3536; CHECK-NEXT: nop 3537; CHECK-NEXT: #NO_APP 3538; CHECK-NEXT: kmovd %edi, %k1 3539; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3540; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3541; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3542; CHECK-NEXT: retq 3543 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3544 %2 = icmp ugt <16 x i32> %a0, %a1 3545 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 3546 %4 = bitcast i16 %mask to <16 x i1> 3547 ; load needed to keep the operation from being scheduled about the asm block 3548 %5 = load <16 x i32>, ptr %passthru 3549 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5 3550 ret <16 x i32> %6 3551} 3552 3553define <16 x i32> @stack_fold_pmaxud_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) { 3554; CHECK-LABEL: stack_fold_pmaxud_mask_commuted: 3555; CHECK: # %bb.0: 3556; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3557; CHECK-NEXT: #APP 3558; CHECK-NEXT: nop 3559; CHECK-NEXT: #NO_APP 3560; CHECK-NEXT: kmovd %edi, %k1 3561; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3562; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3563; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3564; CHECK-NEXT: retq 3565 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3566 %2 = icmp ugt <16 x i32> %a1, %a0 3567 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 3568 %4 = bitcast i16 %mask to <16 x i1> 3569 ; load needed to keep the operation from being scheduled about the asm block 3570 %5 = load <16 x i32>, ptr %passthru 3571 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5 3572 ret <16 x i32> %6 3573} 3574 3575define <16 x i32> @stack_fold_pmaxud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 3576; CHECK-LABEL: stack_fold_pmaxud_maskz: 3577; CHECK: # %bb.0: 3578; 
CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3579; CHECK-NEXT: #APP 3580; CHECK-NEXT: nop 3581; CHECK-NEXT: #NO_APP 3582; CHECK-NEXT: kmovd %edi, %k1 3583; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3584; CHECK-NEXT: retq 3585 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3586 %2 = icmp ugt <16 x i32> %a0, %a1 3587 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 3588 %4 = bitcast i16 %mask to <16 x i1> 3589 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer 3590 ret <16 x i32> %5 3591} 3592 3593define <16 x i32> @stack_fold_pmaxud_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 3594; CHECK-LABEL: stack_fold_pmaxud_maskz_commuted: 3595; CHECK: # %bb.0: 3596; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3597; CHECK-NEXT: #APP 3598; CHECK-NEXT: nop 3599; CHECK-NEXT: #NO_APP 3600; CHECK-NEXT: kmovd %edi, %k1 3601; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3602; CHECK-NEXT: retq 3603 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3604 %2 = icmp ugt <16 x i32> %a1, %a0 3605 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 3606 %4 = bitcast i16 %mask to <16 x i1> 3607 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer 3608 ret <16 x i32> %5 3609} 3610 3611define <8 x i64> @stack_fold_pmaxuq(<8 x i64> %a0, <8 x i64> %a1) { 3612; CHECK-LABEL: stack_fold_pmaxuq: 3613; CHECK: # %bb.0: 3614; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3615; CHECK-NEXT: #APP 3616; CHECK-NEXT: nop 3617; CHECK-NEXT: #NO_APP 3618; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3619; CHECK-NEXT: retq 3620 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3621 %2 = icmp ugt <8 x i64> %a0, %a1 3622 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 3623 ret <8 x i64> %3 3624} 3625 3626define <8 x i64> @stack_fold_pmaxuq_commuted(<8 x i64> %a0, <8 x i64> %a1) { 3627; CHECK-LABEL: stack_fold_pmaxuq_commuted: 3628; CHECK: # %bb.0: 3629; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3630; CHECK-NEXT: #APP 3631; CHECK-NEXT: nop 3632; CHECK-NEXT: #NO_APP 3633; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3634; CHECK-NEXT: retq 3635 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3636 %2 = icmp ugt <8 x i64> %a1, %a0 
3637 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 3638 ret <8 x i64> %3 3639} 3640 3641define <8 x i64> @stack_fold_pmaxuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) { 3642; CHECK-LABEL: stack_fold_pmaxuq_mask: 3643; CHECK: # %bb.0: 3644; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3645; CHECK-NEXT: #APP 3646; CHECK-NEXT: nop 3647; CHECK-NEXT: #NO_APP 3648; CHECK-NEXT: kmovd %edi, %k1 3649; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3650; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3651; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3652; CHECK-NEXT: retq 3653 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3654 %2 = icmp ugt <8 x i64> %a0, %a1 3655 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 3656 %4 = bitcast i8 %mask to <8 x i1> 3657 ; load needed to keep the operation from being scheduled about the asm block 3658 %5 = load <8 x i64>, ptr %passthru 3659 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5 3660 ret <8 x i64> %6 3661} 3662 3663define <8 x i64> @stack_fold_pmaxuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) { 3664; CHECK-LABEL: stack_fold_pmaxuq_mask_commuted: 3665; CHECK: # %bb.0: 3666; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3667; CHECK-NEXT: #APP 3668; CHECK-NEXT: nop 3669; CHECK-NEXT: #NO_APP 3670; CHECK-NEXT: kmovd %edi, %k1 3671; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3672; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3673; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3674; CHECK-NEXT: retq 3675 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3676 %2 = icmp ugt <8 x i64> %a1, %a0 3677 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 3678 %4 = bitcast i8 %mask to <8 x i1> 3679 ; load needed to keep the operation from being scheduled about the asm block 3680 %5 = load <8 x i64>, ptr %passthru 3681 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5 3682 ret <8 x i64> %6 3683} 3684 3685define <8 x i64> @stack_fold_pmaxuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 3686; CHECK-LABEL: stack_fold_pmaxuq_maskz: 3687; CHECK: # %bb.0: 3688; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3689; CHECK-NEXT: #APP 3690; CHECK-NEXT: nop 3691; CHECK-NEXT: #NO_APP 3692; CHECK-NEXT: kmovd %edi, %k1 3693; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3694; CHECK-NEXT: retq 3695 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3696 %2 = icmp ugt <8 x i64> %a0, %a1 3697 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 3698 %4 = bitcast i8 %mask to <8 x i1> 3699 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer 
3700 ret <8 x i64> %5 3701} 3702 3703define <8 x i64> @stack_fold_pmaxuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 3704; CHECK-LABEL: stack_fold_pmaxuq_maskz_commuted: 3705; CHECK: # %bb.0: 3706; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3707; CHECK-NEXT: #APP 3708; CHECK-NEXT: nop 3709; CHECK-NEXT: #NO_APP 3710; CHECK-NEXT: kmovd %edi, %k1 3711; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3712; CHECK-NEXT: retq 3713 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3714 %2 = icmp ugt <8 x i64> %a1, %a0 3715 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 3716 %4 = bitcast i8 %mask to <8 x i1> 3717 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer 3718 ret <8 x i64> %5 3719} 3720 3721define <32 x i16> @stack_fold_pmaxuw(<32 x i16> %a0, <32 x i16> %a1) { 3722; CHECK-LABEL: stack_fold_pmaxuw: 3723; CHECK: # %bb.0: 3724; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3725; CHECK-NEXT: #APP 3726; CHECK-NEXT: nop 3727; CHECK-NEXT: #NO_APP 3728; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3729; CHECK-NEXT: retq 3730 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3731 %2 = icmp ugt <32 x i16> %a0, %a1 3732 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 3733 ret <32 x i16> %3 3734} 3735 3736define <32 x i16> @stack_fold_pmaxuw_commuted(<32 x i16> %a0, <32 x i16> %a1) { 3737; CHECK-LABEL: stack_fold_pmaxuw_commuted: 3738; CHECK: # %bb.0: 3739; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3740; CHECK-NEXT: #APP 3741; CHECK-NEXT: nop 3742; CHECK-NEXT: #NO_APP 3743; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3744; CHECK-NEXT: retq 3745 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3746 %2 = icmp ugt <32 x i16> %a1, %a0 3747 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 3748 ret <32 x i16> %3 3749} 3750 3751define <32 x i16> @stack_fold_pmaxuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) { 3752; CHECK-LABEL: stack_fold_pmaxuw_mask: 3753; CHECK: # %bb.0: 3754; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3755; CHECK-NEXT: #APP 3756; CHECK-NEXT: nop 3757; CHECK-NEXT: #NO_APP 3758; CHECK-NEXT: kmovd %edi, %k1 3759; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3760; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3761; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3762; CHECK-NEXT: retq 3763 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3764 %2 = icmp ugt <32 x i16> %a0, %a1 3765 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 3766 %4 = bitcast i32 %mask to <32 x i1> 3767 ; load needed to keep the operation from being scheduled about the asm block 3768 %5 = load <32 x i16>, ptr %passthru 3769 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5 3770 ret <32 x i16> %6 3771} 3772 3773define <32 x i16> @stack_fold_pmaxuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) { 3774; CHECK-LABEL: stack_fold_pmaxuw_mask_commuted: 3775; CHECK: # %bb.0: 3776; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3777; CHECK-NEXT: #APP 3778; CHECK-NEXT: nop 3779; CHECK-NEXT: #NO_APP 3780; CHECK-NEXT: kmovd %edi, %k1 3781; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3782; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3783; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3784; CHECK-NEXT: retq 3785 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3786 %2 = icmp ugt <32 x i16> %a1, %a0 3787 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 3788 %4 = bitcast i32 %mask to <32 x i1> 3789 ; load needed to keep the operation from being scheduled about the asm block 3790 %5 = load <32 x i16>, ptr %passthru 3791 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5 3792 ret <32 x i16> %6 3793} 3794 3795define <32 x i16> @stack_fold_pmaxuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 3796; CHECK-LABEL: stack_fold_pmaxuw_maskz: 3797; CHECK: # %bb.0: 3798; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3799; CHECK-NEXT: #APP 3800; CHECK-NEXT: nop 3801; CHECK-NEXT: #NO_APP 3802; CHECK-NEXT: kmovd %edi, %k1 3803; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3804; CHECK-NEXT: retq 3805 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3806 %2 = icmp ugt <32 x i16> %a0, %a1 3807 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 3808 %4 = bitcast i32 %mask to <32 x i1> 3809 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer 3810 ret <32 x i16> %5 3811} 3812 3813define <32 x i16> @stack_fold_pmaxuw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 3814; CHECK-LABEL: stack_fold_pmaxuw_maskz_commuted: 3815; CHECK: # %bb.0: 3816; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3817; CHECK-NEXT: #APP 3818; CHECK-NEXT: nop 3819; CHECK-NEXT: #NO_APP 3820; CHECK-NEXT: kmovd %edi, %k1 3821; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3822; CHECK-NEXT: retq 3823 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3824 %2 = icmp ugt <32 x i16> %a1, %a0 3825 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 3826 %4 = bitcast i32 %mask to <32 x i1> 3827 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer 3828 ret <32 x i16> %5 3829} 3830 3831define <64 x i8> @stack_fold_pminsb(<64 x i8> %a0, <64 x i8> %a1) { 3832; CHECK-LABEL: stack_fold_pminsb: 3833; CHECK: # %bb.0: 3834; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3835; CHECK-NEXT: #APP 3836; CHECK-NEXT: nop 3837; CHECK-NEXT: #NO_APP 3838; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3839; CHECK-NEXT: retq 3840 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3841 %2 = icmp slt <64 x i8> %a0, %a1 3842 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 3843 ret <64 x i8> %3 3844} 3845 3846define <64 x i8> @stack_fold_pminsb_commuted(<64 x i8> %a0, <64 x i8> %a1) { 3847; CHECK-LABEL: stack_fold_pminsb_commuted: 3848; CHECK: # %bb.0: 3849; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3850; CHECK-NEXT: #APP 3851; CHECK-NEXT: nop 3852; CHECK-NEXT: #NO_APP 3853; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3854; CHECK-NEXT: retq 3855 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3856 %2 = icmp slt <64 x i8> %a1, %a0 3857 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 3858 ret <64 x i8> %3 3859} 3860 3861define <64 x i8> @stack_fold_pminsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) { 3862; CHECK-LABEL: stack_fold_pminsb_mask: 3863; CHECK: # %bb.0: 3864; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3865; CHECK-NEXT: #APP 3866; CHECK-NEXT: nop 3867; CHECK-NEXT: #NO_APP 3868; CHECK-NEXT: kmovq %rdi, %k1 3869; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3870; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3871; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3872; CHECK-NEXT: retq 3873 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3874 %2 = icmp slt <64 x i8> %a0, %a1 3875 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 3876 %4 = bitcast i64 %mask to <64 x i1> 3877 ; load needed to keep the operation from being scheduled about the asm block 3878 %5 = load <64 x i8>, ptr %passthru 3879 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5 3880 ret <64 x i8> %6 3881} 3882 3883define <64 x i8> @stack_fold_pminsb_mask_commuted(<64 x i8> %a0, <64 x i8> 
%a1, i64 %mask, ptr %passthru) { 3884; CHECK-LABEL: stack_fold_pminsb_mask_commuted: 3885; CHECK: # %bb.0: 3886; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3887; CHECK-NEXT: #APP 3888; CHECK-NEXT: nop 3889; CHECK-NEXT: #NO_APP 3890; CHECK-NEXT: kmovq %rdi, %k1 3891; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3892; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3893; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3894; CHECK-NEXT: retq 3895 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3896 %2 = icmp slt <64 x i8> %a1, %a0 3897 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 3898 %4 = bitcast i64 %mask to <64 x i1> 3899 ; load needed to keep the operation from being scheduled about the asm block 3900 %5 = load <64 x i8>, ptr %passthru 3901 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5 3902 ret <64 x i8> %6 3903} 3904 3905define <64 x i8> @stack_fold_pminsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 3906; CHECK-LABEL: stack_fold_pminsb_maskz: 3907; CHECK: # %bb.0: 3908; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3909; CHECK-NEXT: #APP 3910; CHECK-NEXT: nop 3911; CHECK-NEXT: #NO_APP 3912; CHECK-NEXT: kmovq %rdi, %k1 3913; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3914; CHECK-NEXT: retq 3915 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3916 %2 = icmp slt <64 x i8> %a0, %a1 3917 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 3918 %4 = bitcast i64 %mask to <64 x i1> 3919 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer 3920 ret <64 x i8> %5 3921} 3922 3923define <64 x i8> @stack_fold_pminsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 3924; CHECK-LABEL: stack_fold_pminsb_maskz_commuted: 3925; CHECK: # %bb.0: 3926; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3927; CHECK-NEXT: #APP 3928; CHECK-NEXT: nop 3929; CHECK-NEXT: #NO_APP 3930; CHECK-NEXT: kmovq %rdi, %k1 3931; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 3932; CHECK-NEXT: retq 3933 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3934 %2 = icmp slt <64 x i8> %a1, %a0 3935 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 3936 %4 = bitcast i64 %mask to <64 x i1> 3937 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer 3938 ret <64 x i8> %5 3939} 3940 3941define <16 x i32> @stack_fold_pminsd(<16 x i32> %a0, <16 x i32> %a1) { 3942; CHECK-LABEL: stack_fold_pminsd: 3943; CHECK: # %bb.0: 3944; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3945; CHECK-NEXT: #APP 3946; CHECK-NEXT: nop 3947; CHECK-NEXT: #NO_APP 3948; CHECK-NEXT: vpminsd 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3949; CHECK-NEXT: retq 3950 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3951 %2 = icmp slt <16 x i32> %a0, %a1 3952 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 3953 ret <16 x i32> %3 3954} 3955 3956define <16 x i32> @stack_fold_pminsd_commuted(<16 x i32> %a0, <16 x i32> %a1) { 3957; CHECK-LABEL: stack_fold_pminsd_commuted: 3958; CHECK: # %bb.0: 3959; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3960; CHECK-NEXT: #APP 3961; CHECK-NEXT: nop 3962; CHECK-NEXT: #NO_APP 3963; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 3964; CHECK-NEXT: retq 3965 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3966 %2 = icmp slt <16 x i32> %a1, %a0 3967 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 3968 ret <16 x i32> %3 3969} 3970 3971define <16 x i32> @stack_fold_pminsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) { 3972; CHECK-LABEL: stack_fold_pminsd_mask: 3973; CHECK: # %bb.0: 3974; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3975; CHECK-NEXT: #APP 3976; CHECK-NEXT: nop 3977; CHECK-NEXT: #NO_APP 3978; CHECK-NEXT: kmovd %edi, %k1 3979; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 3980; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 3981; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 3982; CHECK-NEXT: retq 3983 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3984 %2 = icmp slt <16 x i32> %a0, %a1 3985 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 3986 %4 = bitcast i16 %mask to <16 x i1> 3987 ; load needed to keep the operation from being scheduled about the asm block 3988 %5 = load <16 x i32>, ptr %passthru 3989 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5 3990 ret <16 x i32> %6 3991} 3992 3993define <16 x i32> @stack_fold_pminsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) { 3994; CHECK-LABEL: stack_fold_pminsd_mask_commuted: 3995; CHECK: # %bb.0: 3996; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 3997; CHECK-NEXT: #APP 3998; CHECK-NEXT: nop 3999; CHECK-NEXT: #NO_APP 4000; CHECK-NEXT: kmovd %edi, %k1 4001; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4002; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4003; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4004; CHECK-NEXT: retq 4005 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4006 %2 = icmp slt <16 x i32> %a1, %a0 4007 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 4008 %4 = bitcast i16 %mask to <16 x i1> 4009 ; load needed to keep the operation from being scheduled about the asm block 4010 %5 = load <16 x i32>, ptr %passthru 4011 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5 4012 ret <16 x i32> %6 4013} 4014 4015define <16 x i32> @stack_fold_pminsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 4016; CHECK-LABEL: stack_fold_pminsd_maskz: 4017; CHECK: # %bb.0: 4018; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4019; CHECK-NEXT: #APP 4020; CHECK-NEXT: nop 4021; CHECK-NEXT: #NO_APP 4022; CHECK-NEXT: kmovd %edi, %k1 4023; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4024; CHECK-NEXT: retq 4025 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4026 %2 = icmp slt <16 x i32> %a0, %a1 4027 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 4028 %4 = bitcast i16 %mask to <16 x i1> 4029 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer 4030 ret <16 x i32> %5 4031} 4032 4033define <16 x i32> @stack_fold_pminsd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 4034; CHECK-LABEL: stack_fold_pminsd_maskz_commuted: 4035; CHECK: # %bb.0: 4036; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4037; CHECK-NEXT: #APP 4038; CHECK-NEXT: nop 4039; CHECK-NEXT: #NO_APP 4040; CHECK-NEXT: kmovd %edi, %k1 4041; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4042; CHECK-NEXT: retq 4043 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4044 %2 = icmp slt <16 x i32> %a1, %a0 4045 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 4046 %4 = bitcast i16 %mask to <16 x i1> 4047 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer 4048 ret <16 x i32> %5 4049} 4050 4051define <8 x i64> @stack_fold_pminsq(<8 x i64> %a0, <8 x i64> %a1) { 4052; CHECK-LABEL: stack_fold_pminsq: 4053; CHECK: # %bb.0: 4054; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4055; CHECK-NEXT: #APP 4056; CHECK-NEXT: nop 4057; CHECK-NEXT: #NO_APP 4058; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4059; CHECK-NEXT: retq 4060 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4061 %2 = icmp slt <8 x i64> %a0, %a1 4062 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 4063 
ret <8 x i64> %3 4064} 4065 4066define <8 x i64> @stack_fold_pminsq_commuted(<8 x i64> %a0, <8 x i64> %a1) { 4067; CHECK-LABEL: stack_fold_pminsq_commuted: 4068; CHECK: # %bb.0: 4069; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4070; CHECK-NEXT: #APP 4071; CHECK-NEXT: nop 4072; CHECK-NEXT: #NO_APP 4073; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4074; CHECK-NEXT: retq 4075 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4076 %2 = icmp slt <8 x i64> %a1, %a0 4077 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 4078 ret <8 x i64> %3 4079} 4080 4081define <8 x i64> @stack_fold_pminsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) { 4082; CHECK-LABEL: stack_fold_pminsq_mask: 4083; CHECK: # %bb.0: 4084; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4085; CHECK-NEXT: #APP 4086; CHECK-NEXT: nop 4087; CHECK-NEXT: #NO_APP 4088; CHECK-NEXT: kmovd %edi, %k1 4089; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4090; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4091; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4092; CHECK-NEXT: retq 4093 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4094 %2 = icmp slt <8 x i64> %a0, %a1 4095 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 4096 %4 = bitcast i8 %mask to <8 x i1> 4097 ; load needed to keep the operation from being scheduled about the asm block 4098 %5 = load <8 x i64>, ptr %passthru 4099 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5 4100 ret <8 x i64> %6 4101} 4102 4103define <8 x i64> @stack_fold_pminsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) { 4104; CHECK-LABEL: stack_fold_pminsq_mask_commuted: 4105; CHECK: # %bb.0: 4106; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4107; CHECK-NEXT: #APP 4108; CHECK-NEXT: nop 4109; CHECK-NEXT: #NO_APP 4110; CHECK-NEXT: kmovd %edi, %k1 4111; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4112; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4113; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4114; CHECK-NEXT: retq 4115 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4116 %2 = icmp slt <8 x i64> %a1, %a0 4117 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 4118 %4 = bitcast i8 %mask to <8 x i1> 4119 ; load needed to keep the operation from being scheduled about the asm block 4120 %5 = load <8 x i64>, ptr %passthru 4121 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5 4122 ret <8 x i64> %6 4123} 4124 4125define <8 x i64> @stack_fold_pminsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 4126; CHECK-LABEL: stack_fold_pminsq_maskz: 4127; CHECK: # %bb.0: 4128; CHECK-NEXT: vmovups %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4129; CHECK-NEXT: #APP 4130; CHECK-NEXT: nop 4131; CHECK-NEXT: #NO_APP 4132; CHECK-NEXT: kmovd %edi, %k1 4133; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4134; CHECK-NEXT: retq 4135 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4136 %2 = icmp slt <8 x i64> %a0, %a1 4137 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 4138 %4 = bitcast i8 %mask to <8 x i1> 4139 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer 4140 ret <8 x i64> %5 4141} 4142 4143define <8 x i64> @stack_fold_pminsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 4144; CHECK-LABEL: stack_fold_pminsq_maskz_commuted: 4145; CHECK: # %bb.0: 4146; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4147; CHECK-NEXT: #APP 4148; CHECK-NEXT: nop 4149; CHECK-NEXT: #NO_APP 4150; CHECK-NEXT: kmovd %edi, %k1 4151; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4152; CHECK-NEXT: retq 4153 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4154 %2 = icmp slt <8 x i64> %a1, %a0 4155 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 4156 %4 = bitcast i8 %mask to <8 x i1> 4157 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer 4158 ret <8 x i64> %5 4159} 4160 4161define <32 x i16> @stack_fold_pminsw(<32 x i16> %a0, <32 x i16> %a1) { 4162; CHECK-LABEL: stack_fold_pminsw: 4163; CHECK: # %bb.0: 4164; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4165; CHECK-NEXT: #APP 4166; CHECK-NEXT: nop 4167; CHECK-NEXT: #NO_APP 4168; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4169; CHECK-NEXT: retq 4170 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4171 %2 = icmp slt <32 x i16> %a0, %a1 4172 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 4173 ret <32 x i16> %3 4174} 4175 4176define <32 x i16> @stack_fold_pminsw_commuted(<32 x i16> %a0, <32 x i16> %a1) { 4177; CHECK-LABEL: stack_fold_pminsw_commuted: 4178; CHECK: # %bb.0: 4179; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4180; CHECK-NEXT: #APP 4181; CHECK-NEXT: nop 4182; CHECK-NEXT: #NO_APP 4183; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4184; CHECK-NEXT: retq 4185 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4186 %2 = icmp slt <32 x i16> %a1, %a0 4187 %3 = select <32 x i1> %2, <32 x 
i16> %a1, <32 x i16> %a0 4188 ret <32 x i16> %3 4189} 4190 4191define <32 x i16> @stack_fold_pminsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) { 4192; CHECK-LABEL: stack_fold_pminsw_mask: 4193; CHECK: # %bb.0: 4194; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4195; CHECK-NEXT: #APP 4196; CHECK-NEXT: nop 4197; CHECK-NEXT: #NO_APP 4198; CHECK-NEXT: kmovd %edi, %k1 4199; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4200; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4201; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4202; CHECK-NEXT: retq 4203 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4204 %2 = icmp slt <32 x i16> %a0, %a1 4205 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 4206 %4 = bitcast i32 %mask to <32 x i1> 4207 ; load needed to keep the operation from being scheduled about the asm block 4208 %5 = load <32 x i16>, ptr %passthru 4209 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5 4210 ret <32 x i16> %6 4211} 4212 4213define <32 x i16> @stack_fold_pminsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) { 4214; CHECK-LABEL: stack_fold_pminsw_mask_commuted: 4215; CHECK: # %bb.0: 4216; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4217; CHECK-NEXT: #APP 4218; CHECK-NEXT: nop 4219; CHECK-NEXT: #NO_APP 4220; CHECK-NEXT: kmovd %edi, %k1 4221; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4222; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4223; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4224; CHECK-NEXT: retq 4225 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4226 %2 = icmp slt <32 x i16> %a1, %a0 4227 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 4228 %4 = bitcast i32 %mask to <32 x i1> 4229 ; load needed to keep the operation from being scheduled about the asm block 4230 %5 = load <32 x i16>, ptr %passthru 4231 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5 4232 ret <32 x i16> %6 4233} 4234 4235define <32 x i16> @stack_fold_pminsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 4236; CHECK-LABEL: stack_fold_pminsw_maskz: 4237; CHECK: # %bb.0: 4238; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4239; CHECK-NEXT: #APP 4240; CHECK-NEXT: nop 4241; CHECK-NEXT: #NO_APP 4242; CHECK-NEXT: kmovd %edi, %k1 4243; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4244; CHECK-NEXT: retq 4245 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4246 %2 = icmp slt <32 x i16> %a0, %a1 4247 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 4248 %4 = bitcast i32 %mask to <32 x i1> 4249 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> 
zeroinitializer 4250 ret <32 x i16> %5 4251} 4252 4253define <32 x i16> @stack_fold_pminsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 4254; CHECK-LABEL: stack_fold_pminsw_maskz_commuted: 4255; CHECK: # %bb.0: 4256; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4257; CHECK-NEXT: #APP 4258; CHECK-NEXT: nop 4259; CHECK-NEXT: #NO_APP 4260; CHECK-NEXT: kmovd %edi, %k1 4261; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4262; CHECK-NEXT: retq 4263 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4264 %2 = icmp slt <32 x i16> %a1, %a0 4265 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 4266 %4 = bitcast i32 %mask to <32 x i1> 4267 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer 4268 ret <32 x i16> %5 4269} 4270 4271define <64 x i8> @stack_fold_pminub(<64 x i8> %a0, <64 x i8> %a1) { 4272; CHECK-LABEL: stack_fold_pminub: 4273; CHECK: # %bb.0: 4274; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4275; CHECK-NEXT: #APP 4276; CHECK-NEXT: nop 4277; CHECK-NEXT: #NO_APP 4278; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4279; CHECK-NEXT: retq 4280 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4281 %2 = icmp ult <64 x i8> %a0, %a1 4282 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 4283 ret <64 x i8> %3 4284} 4285 4286define <64 x i8> @stack_fold_pminub_commuted(<64 x i8> %a0, <64 x i8> %a1) { 4287; CHECK-LABEL: stack_fold_pminub_commuted: 4288; CHECK: # %bb.0: 4289; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4290; CHECK-NEXT: #APP 4291; CHECK-NEXT: nop 4292; CHECK-NEXT: #NO_APP 4293; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4294; CHECK-NEXT: retq 4295 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4296 %2 = icmp ult <64 x i8> %a1, %a0 4297 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 4298 ret <64 x i8> %3 4299} 4300 4301define <64 x i8> @stack_fold_pminub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) { 4302; CHECK-LABEL: stack_fold_pminub_mask: 4303; CHECK: # %bb.0: 4304; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4305; CHECK-NEXT: #APP 4306; CHECK-NEXT: nop 4307; CHECK-NEXT: #NO_APP 4308; CHECK-NEXT: kmovq %rdi, %k1 4309; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4310; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4311; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4312; CHECK-NEXT: retq 4313 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4314 %2 = icmp ult <64 x i8> %a0, %a1 4315 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 4316 %4 = bitcast i64 %mask to <64 x i1> 4317 ; load needed to keep the operation from being scheduled about the asm block 4318 %5 = load <64 x i8>, ptr %passthru 4319 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5 4320 ret <64 x i8> %6 4321} 4322 4323define <64 x i8> @stack_fold_pminub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) { 4324; CHECK-LABEL: stack_fold_pminub_mask_commuted: 4325; CHECK: # %bb.0: 4326; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4327; CHECK-NEXT: #APP 4328; CHECK-NEXT: nop 4329; CHECK-NEXT: #NO_APP 4330; CHECK-NEXT: kmovq %rdi, %k1 4331; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4332; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4333; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4334; CHECK-NEXT: retq 4335 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4336 %2 = icmp ult <64 x i8> %a1, %a0 4337 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 4338 %4 = bitcast i64 %mask to <64 x i1> 4339 ; load needed to keep the operation from being scheduled about the asm block 4340 %5 = load <64 x i8>, ptr %passthru 4341 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5 4342 ret <64 x i8> %6 4343} 4344 4345define <64 x i8> @stack_fold_pminub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 4346; CHECK-LABEL: stack_fold_pminub_maskz: 4347; CHECK: # %bb.0: 4348; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4349; CHECK-NEXT: #APP 4350; CHECK-NEXT: nop 4351; CHECK-NEXT: #NO_APP 4352; CHECK-NEXT: kmovq %rdi, %k1 4353; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4354; CHECK-NEXT: retq 4355 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4356 %2 = icmp ult <64 x i8> %a0, %a1 4357 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1 4358 %4 = bitcast i64 %mask to <64 x i1> 4359 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer 4360 ret <64 x i8> %5 4361} 4362 4363define <64 x i8> @stack_fold_pminub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 4364; CHECK-LABEL: stack_fold_pminub_maskz_commuted: 4365; CHECK: # %bb.0: 4366; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4367; CHECK-NEXT: #APP 4368; CHECK-NEXT: nop 4369; CHECK-NEXT: #NO_APP 4370; CHECK-NEXT: kmovq %rdi, %k1 4371; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4372; CHECK-NEXT: retq 4373 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4374 %2 = icmp ult <64 x i8> %a1, %a0 4375 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0 4376 %4 = bitcast i64 %mask to <64 x i1> 4377 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer 4378 ret <64 x i8> %5 4379} 4380 4381define <16 x i32> @stack_fold_pminud(<16 x i32> %a0, <16 x i32> %a1) { 4382; CHECK-LABEL: stack_fold_pminud: 4383; CHECK: # %bb.0: 4384; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4385; CHECK-NEXT: #APP 4386; CHECK-NEXT: nop 4387; CHECK-NEXT: #NO_APP 4388; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4389; CHECK-NEXT: retq 4390 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4391 %2 = icmp ult <16 x i32> %a0, %a1 4392 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 4393 ret <16 x i32> %3 4394} 4395 4396define <16 x i32> @stack_fold_pminud_commuted(<16 x i32> %a0, <16 x i32> %a1) { 4397; CHECK-LABEL: stack_fold_pminud_commuted: 4398; CHECK: # %bb.0: 4399; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4400; CHECK-NEXT: #APP 4401; CHECK-NEXT: nop 4402; CHECK-NEXT: #NO_APP 4403; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4404; CHECK-NEXT: retq 4405 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4406 %2 = icmp ult <16 x i32> %a1, %a0 4407 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 4408 ret <16 x i32> %3 4409} 4410 4411define <16 x i32> @stack_fold_pminud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) { 4412; CHECK-LABEL: stack_fold_pminud_mask: 4413; CHECK: # %bb.0: 4414; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4415; CHECK-NEXT: #APP 4416; CHECK-NEXT: nop 4417; CHECK-NEXT: #NO_APP 4418; CHECK-NEXT: kmovd %edi, %k1 4419; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4420; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4421; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4422; CHECK-NEXT: retq 4423 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4424 %2 = icmp ult <16 x i32> %a0, %a1 4425 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 4426 %4 = bitcast i16 %mask to <16 x i1> 4427 ; load needed to keep the operation from being scheduled about the asm block 4428 %5 = load <16 x i32>, ptr %passthru 4429 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5 4430 ret <16 x i32> %6 4431} 4432 4433define <16 x i32> @stack_fold_pminud_mask_commuted(<16 x 
i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) { 4434; CHECK-LABEL: stack_fold_pminud_mask_commuted: 4435; CHECK: # %bb.0: 4436; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4437; CHECK-NEXT: #APP 4438; CHECK-NEXT: nop 4439; CHECK-NEXT: #NO_APP 4440; CHECK-NEXT: kmovd %edi, %k1 4441; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4442; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4443; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4444; CHECK-NEXT: retq 4445 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4446 %2 = icmp ult <16 x i32> %a1, %a0 4447 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 4448 %4 = bitcast i16 %mask to <16 x i1> 4449 ; load needed to keep the operation from being scheduled about the asm block 4450 %5 = load <16 x i32>, ptr %passthru 4451 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5 4452 ret <16 x i32> %6 4453} 4454 4455define <16 x i32> @stack_fold_pminud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 4456; CHECK-LABEL: stack_fold_pminud_maskz: 4457; CHECK: # %bb.0: 4458; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4459; CHECK-NEXT: #APP 4460; CHECK-NEXT: nop 4461; CHECK-NEXT: #NO_APP 4462; CHECK-NEXT: kmovd %edi, %k1 4463; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4464; CHECK-NEXT: retq 4465 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4466 %2 = icmp ult <16 x i32> %a0, %a1 4467 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1 4468 %4 = bitcast i16 %mask to <16 x i1> 4469 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer 4470 ret <16 x i32> %5 4471} 4472 4473define <16 x i32> @stack_fold_pminud_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 4474; CHECK-LABEL: stack_fold_pminud_maskz_commuted: 4475; CHECK: # %bb.0: 4476; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4477; CHECK-NEXT: #APP 4478; CHECK-NEXT: nop 4479; CHECK-NEXT: #NO_APP 4480; CHECK-NEXT: kmovd %edi, %k1 4481; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4482; CHECK-NEXT: retq 4483 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4484 %2 = icmp ult <16 x i32> %a1, %a0 4485 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0 4486 %4 = bitcast i16 %mask to <16 x i1> 4487 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer 4488 ret <16 x i32> %5 4489} 4490 4491define <8 x i64> @stack_fold_pminuq(<8 x i64> %a0, <8 x i64> %a1) { 4492; CHECK-LABEL: stack_fold_pminuq: 4493; CHECK: # %bb.0: 4494; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4495; CHECK-NEXT: #APP 4496; CHECK-NEXT: nop 4497; 
CHECK-NEXT: #NO_APP 4498; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4499; CHECK-NEXT: retq 4500 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4501 %2 = icmp ult <8 x i64> %a0, %a1 4502 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 4503 ret <8 x i64> %3 4504} 4505 4506define <8 x i64> @stack_fold_pminuq_commuted(<8 x i64> %a0, <8 x i64> %a1) { 4507; CHECK-LABEL: stack_fold_pminuq_commuted: 4508; CHECK: # %bb.0: 4509; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4510; CHECK-NEXT: #APP 4511; CHECK-NEXT: nop 4512; CHECK-NEXT: #NO_APP 4513; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4514; CHECK-NEXT: retq 4515 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4516 %2 = icmp ult <8 x i64> %a1, %a0 4517 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 4518 ret <8 x i64> %3 4519} 4520 4521define <8 x i64> @stack_fold_pminuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) { 4522; CHECK-LABEL: stack_fold_pminuq_mask: 4523; CHECK: # %bb.0: 4524; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4525; CHECK-NEXT: #APP 4526; CHECK-NEXT: nop 4527; CHECK-NEXT: #NO_APP 4528; CHECK-NEXT: kmovd %edi, %k1 4529; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4530; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4531; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4532; CHECK-NEXT: retq 4533 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4534 %2 = icmp ult <8 x i64> %a0, %a1 4535 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 4536 %4 = bitcast i8 %mask to <8 x i1> 4537 ; load needed to keep the operation from being scheduled about the asm block 4538 %5 = load <8 x i64>, ptr %passthru 4539 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5 4540 ret <8 x i64> %6 4541} 4542 4543define <8 x i64> @stack_fold_pminuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) { 4544; CHECK-LABEL: stack_fold_pminuq_mask_commuted: 4545; CHECK: # %bb.0: 4546; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4547; CHECK-NEXT: #APP 4548; CHECK-NEXT: nop 4549; CHECK-NEXT: #NO_APP 4550; CHECK-NEXT: kmovd %edi, %k1 4551; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4552; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4553; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4554; CHECK-NEXT: retq 4555 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4556 %2 = icmp ult <8 x i64> %a1, %a0 4557 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 4558 %4 = bitcast i8 %mask to <8 x i1> 4559 ; load needed to keep the operation from being scheduled about the asm block 4560 %5 = load <8 x i64>, ptr %passthru 4561 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5 4562 ret <8 x i64> %6 4563} 4564 4565define <8 x i64> @stack_fold_pminuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 4566; CHECK-LABEL: stack_fold_pminuq_maskz: 4567; CHECK: # %bb.0: 4568; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4569; CHECK-NEXT: #APP 4570; CHECK-NEXT: nop 4571; CHECK-NEXT: #NO_APP 4572; CHECK-NEXT: kmovd %edi, %k1 4573; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4574; CHECK-NEXT: retq 4575 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4576 %2 = icmp ult <8 x i64> %a0, %a1 4577 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1 4578 %4 = bitcast i8 %mask to <8 x i1> 4579 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer 4580 ret <8 x i64> %5 4581} 4582 4583define <8 x i64> @stack_fold_pminuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 4584; CHECK-LABEL: stack_fold_pminuq_maskz_commuted: 4585; CHECK: # %bb.0: 4586; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4587; CHECK-NEXT: #APP 4588; CHECK-NEXT: nop 4589; CHECK-NEXT: #NO_APP 4590; CHECK-NEXT: kmovd %edi, %k1 4591; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4592; CHECK-NEXT: retq 4593 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4594 %2 = icmp ult <8 x i64> %a1, %a0 4595 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0 4596 %4 = bitcast i8 %mask to <8 x i1> 4597 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer 4598 ret <8 x i64> %5 4599} 4600 4601define <32 x i16> @stack_fold_pminuw(<32 x i16> %a0, <32 x i16> %a1) { 4602; CHECK-LABEL: stack_fold_pminuw: 4603; CHECK: # %bb.0: 4604; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4605; CHECK-NEXT: #APP 4606; CHECK-NEXT: nop 4607; CHECK-NEXT: #NO_APP 4608; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4609; CHECK-NEXT: retq 4610 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4611 %2 = icmp ult <32 x i16> %a0, %a1 4612 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 4613 ret <32 x i16> %3 4614} 4615 
4616define <32 x i16> @stack_fold_pminuw_commuted(<32 x i16> %a0, <32 x i16> %a1) { 4617; CHECK-LABEL: stack_fold_pminuw_commuted: 4618; CHECK: # %bb.0: 4619; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4620; CHECK-NEXT: #APP 4621; CHECK-NEXT: nop 4622; CHECK-NEXT: #NO_APP 4623; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 4624; CHECK-NEXT: retq 4625 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4626 %2 = icmp ult <32 x i16> %a1, %a0 4627 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 4628 ret <32 x i16> %3 4629} 4630 4631define <32 x i16> @stack_fold_pminuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) { 4632; CHECK-LABEL: stack_fold_pminuw_mask: 4633; CHECK: # %bb.0: 4634; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4635; CHECK-NEXT: #APP 4636; CHECK-NEXT: nop 4637; CHECK-NEXT: #NO_APP 4638; CHECK-NEXT: kmovd %edi, %k1 4639; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4640; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4641; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4642; CHECK-NEXT: retq 4643 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4644 %2 = icmp ult <32 x i16> %a0, %a1 4645 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 4646 %4 = bitcast i32 %mask to <32 x i1> 4647 ; load needed to keep the operation from being scheduled about the asm block 4648 %5 = load <32 x i16>, ptr %passthru 4649 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5 4650 ret <32 x i16> %6 4651} 4652 4653define <32 x i16> @stack_fold_pminuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) { 4654; CHECK-LABEL: stack_fold_pminuw_mask_commuted: 4655; CHECK: # %bb.0: 4656; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4657; CHECK-NEXT: #APP 4658; CHECK-NEXT: nop 4659; CHECK-NEXT: #NO_APP 4660; CHECK-NEXT: kmovd %edi, %k1 4661; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2 4662; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 4663; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 4664; CHECK-NEXT: retq 4665 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4666 %2 = icmp ult <32 x i16> %a1, %a0 4667 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 4668 %4 = bitcast i32 %mask to <32 x i1> 4669 ; load needed to keep the operation from being scheduled about the asm block 4670 %5 = load <32 x i16>, ptr %passthru 4671 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5 4672 ret <32 x i16> %6 4673} 4674 4675define <32 x i16> @stack_fold_pminuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 4676; CHECK-LABEL: stack_fold_pminuw_maskz: 4677; CHECK: # %bb.0: 4678; CHECK-NEXT: 
vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4679; CHECK-NEXT: #APP 4680; CHECK-NEXT: nop 4681; CHECK-NEXT: #NO_APP 4682; CHECK-NEXT: kmovd %edi, %k1 4683; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4684; CHECK-NEXT: retq 4685 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4686 %2 = icmp ult <32 x i16> %a0, %a1 4687 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1 4688 %4 = bitcast i32 %mask to <32 x i1> 4689 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer 4690 ret <32 x i16> %5 4691} 4692 4693define <32 x i16> @stack_fold_pminuw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 4694; CHECK-LABEL: stack_fold_pminuw_maskz_commuted: 4695; CHECK: # %bb.0: 4696; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4697; CHECK-NEXT: #APP 4698; CHECK-NEXT: nop 4699; CHECK-NEXT: #NO_APP 4700; CHECK-NEXT: kmovd %edi, %k1 4701; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 4702; CHECK-NEXT: retq 4703 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4704 %2 = icmp ult <32 x i16> %a1, %a0 4705 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0 4706 %4 = bitcast i32 %mask to <32 x i1> 4707 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer 4708 ret <32 x i16> %5 4709} 4710 4711define <16 x i8> @stack_fold_vpmovdb(<16 x i32> %a0) { 4712; CHECK-LABEL: stack_fold_vpmovdb: 4713; CHECK: # %bb.0: 4714; CHECK-NEXT: vpmovdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 4715; CHECK-NEXT: #APP 4716; CHECK-NEXT: nop 4717; CHECK-NEXT: #NO_APP 4718; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4719; CHECK-NEXT: vzeroupper 4720; CHECK-NEXT: retq 4721 %1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1) 4722 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4723 ret <16 x i8> %1 4724} 4725declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16) 4726 4727define <16 x i16> @stack_fold_vpmovdw(<16 x i32> %a0) { 4728; CHECK-LABEL: stack_fold_vpmovdw: 4729; CHECK: # %bb.0: 4730; CHECK-NEXT: vpmovdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 4731; CHECK-NEXT: #APP 4732; CHECK-NEXT: nop 4733; CHECK-NEXT: #NO_APP 4734; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4735; CHECK-NEXT: retq 4736 %1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1) 4737 %2 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4738 ret <16 x i16> %1 4739} 4740declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16) 4741 4742define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) { 4743; CHECK-LABEL: stack_fold_movq_load: 4744; CHECK: # %bb.0: 4745; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4746; CHECK-NEXT: #APP 4747; CHECK-NEXT: nop 4748; CHECK-NEXT: #NO_APP 4749; CHECK-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4750; CHECK-NEXT: # xmm0 = mem[0],zero 4751; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 4752; CHECK-NEXT: vpsubq %xmm1, %xmm0, %xmm0 4753; CHECK-NEXT: retq 4754 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4755 %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2> 4756 ; add forces execution domain 4757 %3 = add <2 x i64> %2, <i64 1, i64 1> 4758 ret <2 x i64> %3 4759} 4760 4761define <8 x i32> @stack_fold_vpmovqd(<8 x i64> %a0) { 4762; CHECK-LABEL: stack_fold_vpmovqd: 4763; CHECK: # %bb.0: 4764; CHECK-NEXT: vpmovqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 4765; CHECK-NEXT: #APP 4766; CHECK-NEXT: nop 4767; CHECK-NEXT: #NO_APP 4768; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4769; CHECK-NEXT: retq 4770 %1 = trunc <8 x i64> %a0 to <8 x i32> 4771 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4772 ret <8 x i32> %1 4773} 4774declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8) 4775 4776define <8 x i16> @stack_fold_vpmovqw(<8 x i64> %a0) { 4777; CHECK-LABEL: stack_fold_vpmovqw: 4778; CHECK: # %bb.0: 4779; CHECK-NEXT: vpmovqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 4780; CHECK-NEXT: #APP 4781; CHECK-NEXT: nop 4782; CHECK-NEXT: #NO_APP 4783; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4784; CHECK-NEXT: vzeroupper 4785; CHECK-NEXT: retq 4786 %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1) 4787 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4788 ret <8 x i16> %1 4789} 4790declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8) 4791 4792define <32 x i8> @stack_fold_vpmovwb(<32 x i16> %a0) { 4793; CHECK-LABEL: stack_fold_vpmovwb: 4794; CHECK: # %bb.0: 4795; CHECK-NEXT: vpmovwb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 4796; CHECK-NEXT: #APP 4797; CHECK-NEXT: nop 4798; CHECK-NEXT: #NO_APP 4799; CHECK-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4800; CHECK-NEXT: retq 4801 %1 = trunc <32 x i16> %a0 to <32 x i8> 4802 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4803 ret <32 x i8> %1 4804} 4805declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32) 4806 4807define <16 x i8> @stack_fold_vpmovsdb(<16 x i32> %a0) { 4808; CHECK-LABEL: stack_fold_vpmovsdb: 4809; CHECK: # %bb.0: 4810; CHECK-NEXT: vpmovsdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 4811; CHECK-NEXT: #APP 4812; CHECK-NEXT: nop 4813; CHECK-NEXT: #NO_APP 4814; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4815; CHECK-NEXT: vzeroupper 4816; CHECK-NEXT: retq 4817 %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1) 4818 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4819 ret <16 x i8> %1 4820} 4821declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16) 4822 4823define <16 x i16> @stack_fold_vpmovsdw(<16 x i32> %a0) { 4824; CHECK-LABEL: stack_fold_vpmovsdw: 4825; CHECK: # %bb.0: 4826; CHECK-NEXT: vpmovsdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 4827; CHECK-NEXT: #APP 4828; CHECK-NEXT: nop 4829; CHECK-NEXT: #NO_APP 4830; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4831; CHECK-NEXT: retq 4832 %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1) 4833 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4834 ret <16 x i16> %1 4835} 4836declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16) 4837 4838define <8 x i32> @stack_fold_vpmovsqd(<8 x i64> %a0) { 4839; CHECK-LABEL: stack_fold_vpmovsqd: 4840; CHECK: # %bb.0: 4841; CHECK-NEXT: vpmovsqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 4842; CHECK-NEXT: #APP 4843; CHECK-NEXT: nop 4844; CHECK-NEXT: #NO_APP 4845; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4846; CHECK-NEXT: retq 4847 %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1) 4848 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4849 ret <8 x i32> %1 4850} 4851declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8) 4852 4853define <8 x i16> @stack_fold_vpmovsqw(<8 x i64> %a0) { 4854; CHECK-LABEL: stack_fold_vpmovsqw: 4855; CHECK: # %bb.0: 4856; CHECK-NEXT: vpmovsqw %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 4857; CHECK-NEXT: #APP 4858; CHECK-NEXT: nop 4859; CHECK-NEXT: #NO_APP 4860; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4861; CHECK-NEXT: vzeroupper 4862; CHECK-NEXT: retq 4863 %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1) 4864 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4865 ret <8 x i16> %1 4866} 4867declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8) 4868 4869define <32 x i8> @stack_fold_vpmovswb(<32 x i16> %a0) { 4870; CHECK-LABEL: stack_fold_vpmovswb: 4871; CHECK: # %bb.0: 4872; CHECK-NEXT: vpmovswb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 4873; CHECK-NEXT: #APP 4874; CHECK-NEXT: nop 4875; CHECK-NEXT: #NO_APP 4876; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4877; CHECK-NEXT: retq 4878 %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1) 4879 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4880 ret <32 x i8> %1 4881} 4882declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32) 4883 4884define <16 x i32> @stack_fold_pmovsxbd_zmm(<16 x i8> %a0) { 4885; CHECK-LABEL: stack_fold_pmovsxbd_zmm: 4886; CHECK: # %bb.0: 4887; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4888; CHECK-NEXT: #APP 4889; CHECK-NEXT: nop 4890; CHECK-NEXT: #NO_APP 4891; CHECK-NEXT: vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload 4892; CHECK-NEXT: retq 4893 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4894 %2 = sext <16 x i8> %a0 to <16 x i32> 4895 ret <16 x i32> %2 4896} 4897 4898define <8 x i64> @stack_fold_pmovsxbq_zmm(<16 x i8> %a0) { 4899; CHECK-LABEL: stack_fold_pmovsxbq_zmm: 4900; CHECK: # %bb.0: 4901; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4902; CHECK-NEXT: #APP 4903; CHECK-NEXT: nop 4904; CHECK-NEXT: #NO_APP 4905; CHECK-NEXT: vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload 4906; CHECK-NEXT: retq 4907 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4908 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 4909 %3 = sext <8 x i8> %2 to <8 x i64> 4910 ret <8 x i64> %3 4911} 4912 4913define <32 x i16> @stack_fold_pmovsxbw_zmm(<32 x i8> %a0) { 4914; CHECK-LABEL: stack_fold_pmovsxbw_zmm: 4915; 
CHECK: # %bb.0: 4916; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4917; CHECK-NEXT: #APP 4918; CHECK-NEXT: nop 4919; CHECK-NEXT: #NO_APP 4920; CHECK-NEXT: vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload 4921; CHECK-NEXT: retq 4922 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4923 %2 = sext <32 x i8> %a0 to <32 x i16> 4924 ret <32 x i16> %2 4925} 4926 4927define <8 x i64> @stack_fold_pmovsxdq_zmm(<8 x i32> %a0) { 4928; CHECK-LABEL: stack_fold_pmovsxdq_zmm: 4929; CHECK: # %bb.0: 4930; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4931; CHECK-NEXT: #APP 4932; CHECK-NEXT: nop 4933; CHECK-NEXT: #NO_APP 4934; CHECK-NEXT: vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload 4935; CHECK-NEXT: retq 4936 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4937 %2 = sext <8 x i32> %a0 to <8 x i64> 4938 ret <8 x i64> %2 4939} 4940 4941define <16 x i32> @stack_fold_pmovsxwd_zmm(<16 x i16> %a0) { 4942; CHECK-LABEL: stack_fold_pmovsxwd_zmm: 4943; CHECK: # %bb.0: 4944; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4945; CHECK-NEXT: #APP 4946; CHECK-NEXT: nop 4947; CHECK-NEXT: #NO_APP 4948; CHECK-NEXT: vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload 4949; CHECK-NEXT: retq 4950 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4951 %2 = sext <16 x i16> %a0 to <16 x i32> 4952 ret <16 x i32> %2 4953} 4954 4955define <8 x i64> @stack_fold_pmovsxwq_zmm(<8 x i16> %a0) { 4956; CHECK-LABEL: stack_fold_pmovsxwq_zmm: 4957; CHECK: # %bb.0: 4958; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4959; CHECK-NEXT: #APP 4960; CHECK-NEXT: nop 4961; CHECK-NEXT: #NO_APP 4962; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload 4963; CHECK-NEXT: retq 4964 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4965 %2 = sext <8 x i16> %a0 to <8 x i64> 4966 ret <8 x i64> %2 4967} 4968 4969define <8 x i64> @stack_fold_pmovsxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) { 4970; CHECK-LABEL: stack_fold_pmovsxwq_mask_zmm: 4971; CHECK: # %bb.0: 4972; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4973; CHECK-NEXT: #APP 4974; CHECK-NEXT: nop 4975; CHECK-NEXT: #NO_APP 4976; CHECK-NEXT: kmovd %edi, %k1 4977; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 16-byte Folded Reload 4978; CHECK-NEXT: retq 4979 %1 = tail call <2 
x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4980 %2 = sext <8 x i16> %a0 to <8 x i64> 4981 %3 = bitcast i8 %mask to <8 x i1> 4982 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %passthru 4983 ret <8 x i64> %4 4984} 4985 4986define <8 x i64> @stack_fold_pmovsxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) { 4987; CHECK-LABEL: stack_fold_pmovsxwq_maskz_zmm: 4988; CHECK: # %bb.0: 4989; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4990; CHECK-NEXT: #APP 4991; CHECK-NEXT: nop 4992; CHECK-NEXT: #NO_APP 4993; CHECK-NEXT: kmovd %edi, %k1 4994; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 16-byte Folded Reload 4995; CHECK-NEXT: retq 4996 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4997 %2 = sext <8 x i16> %a0 to <8 x i64> 4998 %3 = bitcast i8 %mask to <8 x i1> 4999 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 5000 ret <8 x i64> %4 5001} 5002 5003define <16 x i8> @stack_fold_vpmovusdb(<16 x i32> %a0) { 5004; CHECK-LABEL: stack_fold_vpmovusdb: 5005; CHECK: # %bb.0: 5006; CHECK-NEXT: vpmovusdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 5007; CHECK-NEXT: #APP 5008; CHECK-NEXT: nop 5009; CHECK-NEXT: #NO_APP 5010; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5011; CHECK-NEXT: vzeroupper 5012; CHECK-NEXT: retq 5013 %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1) 5014 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5015 ret <16 x i8> %1 5016} 5017declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16) 5018 5019define <16 x i16> @stack_fold_vpmovusdw(<16 x i32> %a0) { 5020; CHECK-LABEL: stack_fold_vpmovusdw: 5021; CHECK: # %bb.0: 5022; CHECK-NEXT: vpmovusdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 5023; CHECK-NEXT: #APP 5024; CHECK-NEXT: nop 5025; CHECK-NEXT: #NO_APP 5026; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5027; CHECK-NEXT: retq 5028 %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1) 5029 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5030 ret <16 x i16> %1 5031} 5032declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16) 5033 5034define <8 x i32> @stack_fold_vpmovusqd(<8 x i64> %a0) { 5035; CHECK-LABEL: stack_fold_vpmovusqd: 5036; CHECK: # %bb.0: 5037; CHECK-NEXT: vpmovusqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Folded Spill 5038; CHECK-NEXT: #APP 5039; CHECK-NEXT: nop 5040; CHECK-NEXT: #NO_APP 5041; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5042; CHECK-NEXT: retq 5043 %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1) 5044 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5045 ret <8 x i32> %1 5046} 5047declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8) 5048 5049define <8 x i16> @stack_fold_vpmovusqw(<8 x i64> %a0) { 5050; CHECK-LABEL: stack_fold_vpmovusqw: 5051; CHECK: # %bb.0: 5052; CHECK-NEXT: vpmovusqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 5053; CHECK-NEXT: #APP 5054; CHECK-NEXT: nop 5055; CHECK-NEXT: #NO_APP 5056; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5057; CHECK-NEXT: vzeroupper 5058; CHECK-NEXT: retq 5059 %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1) 5060 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5061 ret <8 x i16> %1 5062} 5063declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8) 5064 5065define <32 x i8> @stack_fold_vpmovuswb(<32 x i16> %a0) { 5066; CHECK-LABEL: stack_fold_vpmovuswb: 5067; CHECK: # %bb.0: 5068; CHECK-NEXT: vpmovuswb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill 5069; CHECK-NEXT: #APP 5070; CHECK-NEXT: nop 5071; CHECK-NEXT: #NO_APP 5072; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5073; CHECK-NEXT: retq 5074 %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1) 5075 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5076 ret <32 x i8> %1 5077} 5078declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32) 5079 5080define <16 x i32> @stack_fold_pmovzxbd_zmm(<16 x i8> %a0) { 5081; CHECK-LABEL: stack_fold_pmovzxbd_zmm: 5082; CHECK: # %bb.0: 5083; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5084; CHECK-NEXT: #APP 5085; CHECK-NEXT: nop 5086; CHECK-NEXT: #NO_APP 5087; CHECK-NEXT: vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload 5088; CHECK-NEXT: # zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero 5089; CHECK-NEXT: retq 5090 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5091 %2 = zext <16 x i8> %a0 to <16 x i32> 5092 ret <16 x i32> %2 5093} 5094 5095define <8 x i64> @stack_fold_pmovzxbq_zmm(<16 x i8> %a0) { 5096; CHECK-LABEL: stack_fold_pmovzxbq_zmm: 5097; CHECK: # %bb.0: 5098; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5099; CHECK-NEXT: #APP 5100; CHECK-NEXT: nop 5101; CHECK-NEXT: #NO_APP 5102; CHECK-NEXT: vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload 5103; CHECK-NEXT: # zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero 5104; CHECK-NEXT: retq 5105 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5106 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 5107 %3 = zext <8 x i8> %2 to <8 x i64> 5108 ret <8 x i64> %3 5109} 5110 5111define <32 x i16> @stack_fold_pmovzxbw_zmm(<32 x i8> %a0) { 5112; CHECK-LABEL: stack_fold_pmovzxbw_zmm: 5113; CHECK: # %bb.0: 5114; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5115; CHECK-NEXT: #APP 5116; CHECK-NEXT: nop 5117; CHECK-NEXT: #NO_APP 5118; CHECK-NEXT: vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload 5119; CHECK-NEXT: # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero 5120; CHECK-NEXT: retq 5121 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5122 %2 = zext <32 x i8> %a0 to <32 x i16> 5123 ret <32 x i16> %2 5124} 5125 5126define <8 x i64> @stack_fold_pmovzxdq_zmm(<8 x i32> %a0) { 5127; CHECK-LABEL: stack_fold_pmovzxdq_zmm: 5128; CHECK: # %bb.0: 5129; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5130; CHECK-NEXT: #APP 5131; CHECK-NEXT: nop 5132; CHECK-NEXT: #NO_APP 5133; CHECK-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload 5134; CHECK-NEXT: # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 5135; CHECK-NEXT: retq 5136 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5137 %2 = zext <8 x i32> %a0 to <8 x i64> 5138 ret <8 x i64> %2 5139} 5140 5141define <16 x i32> @stack_fold_pmovzxwd_zmm(<16 x i16> %a0) { 5142; CHECK-LABEL: stack_fold_pmovzxwd_zmm: 5143; CHECK: # %bb.0: 5144; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5145; CHECK-NEXT: #APP 5146; CHECK-NEXT: nop 5147; CHECK-NEXT: #NO_APP 5148; CHECK-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload 5149; CHECK-NEXT: # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 5150; CHECK-NEXT: retq 5151 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5152 %2 = zext <16 x i16> %a0 to <16 x i32> 5153 ret <16 x i32> %2 5154} 5155 5156define <8 x i64> @stack_fold_pmovzxwq_zmm(<8 x i16> %a0) { 5157; CHECK-LABEL: stack_fold_pmovzxwq_zmm: 5158; CHECK: # %bb.0: 5159; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5160; CHECK-NEXT: #APP 5161; CHECK-NEXT: nop 5162; CHECK-NEXT: #NO_APP 5163; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload 5164; CHECK-NEXT: # zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 5165; CHECK-NEXT: retq 5166 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5167 %2 = zext <8 x i16> %a0 to <8 x i64> 5168 ret <8 x i64> %2 5169} 5170 5171define <8 x i64> @stack_fold_pmovzxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) { 5172; CHECK-LABEL: stack_fold_pmovzxwq_mask_zmm: 5173; CHECK: # %bb.0: 5174; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5175; CHECK-NEXT: #APP 5176; CHECK-NEXT: nop 5177; CHECK-NEXT: #NO_APP 5178; CHECK-NEXT: kmovd %edi, %k1 5179; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 16-byte Folded Reload 5180; CHECK-NEXT: # zmm0 {%k1} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 5181; CHECK-NEXT: retq 5182 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5183 %2 = zext <8 x i16> %a0 to <8 x i64> 5184 %3 = bitcast i8 %mask to <8 x i1> 5185 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %passthru 5186 ret <8 x i64> %4 
5187} 5188 5189define <8 x i64> @stack_fold_pmovzxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) { 5190; CHECK-LABEL: stack_fold_pmovzxwq_maskz_zmm: 5191; CHECK: # %bb.0: 5192; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5193; CHECK-NEXT: #APP 5194; CHECK-NEXT: nop 5195; CHECK-NEXT: #NO_APP 5196; CHECK-NEXT: kmovd %edi, %k1 5197; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 16-byte Folded Reload 5198; CHECK-NEXT: # zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 5199; CHECK-NEXT: retq 5200 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5201 %2 = zext <8 x i16> %a0 to <8 x i64> 5202 %3 = bitcast i8 %mask to <8 x i1> 5203 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 5204 ret <8 x i64> %4 5205} 5206 5207define <16 x i32> @stack_fold_pmulld(<16 x i32> %a0, <16 x i32> %a1) { 5208; CHECK-LABEL: stack_fold_pmulld: 5209; CHECK: # %bb.0: 5210; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5211; CHECK-NEXT: #APP 5212; CHECK-NEXT: nop 5213; CHECK-NEXT: #NO_APP 5214; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5215; CHECK-NEXT: retq 5216 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5217 %2 = mul <16 x i32> %a0, %a1 5218 ret <16 x i32> %2 5219} 5220 5221define <16 x i32> @stack_fold_pmulld_commuted(<16 x i32> %a0, <16 x i32> %a1) { 5222; CHECK-LABEL: stack_fold_pmulld_commuted: 5223; CHECK: # %bb.0: 5224; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5225; CHECK-NEXT: #APP 5226; CHECK-NEXT: nop 5227; CHECK-NEXT: #NO_APP 5228; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5229; CHECK-NEXT: retq 5230 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5231 %2 = mul <16 x i32> %a1, %a0 5232 ret <16 x i32> %2 5233} 5234 5235define <16 x i32> @stack_fold_pmulld_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) { 5236; CHECK-LABEL: stack_fold_pmulld_mask: 5237; CHECK: # %bb.0: 5238; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5239; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5240; CHECK-NEXT: #APP 5241; CHECK-NEXT: nop 5242; CHECK-NEXT: #NO_APP 5243; CHECK-NEXT: kmovd %esi, %k1 5244; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5245; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5246; CHECK-NEXT: retq 5247 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5248 %2 = mul <16 x i32> %a0, %a1 5249 %3 = bitcast i16 %mask to <16 x i1> 5250 ; load needed to keep the operation from being scheduled about the asm block 5251 %4 = load <16 x i32>, ptr %a2 5252 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 5253 ret <16 x i32> %5 5254} 5255 5256define <16 x i32> @stack_fold_pmulld_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) { 5257; CHECK-LABEL: stack_fold_pmulld_mask_commuted: 5258; CHECK: # %bb.0: 5259; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5260; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5261; CHECK-NEXT: #APP 5262; CHECK-NEXT: nop 5263; CHECK-NEXT: #NO_APP 5264; CHECK-NEXT: kmovd %esi, %k1 5265; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5266; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5267; CHECK-NEXT: retq 5268 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5269 %2 = mul <16 x i32> %a1, %a0 5270 %3 = bitcast i16 %mask to <16 x i1> 5271 ; load needed to keep the operation from being scheduled about the asm block 5272 %4 = load <16 x i32>, ptr %a2 5273 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 5274 ret <16 x i32> %5 5275} 5276 5277define <16 x i32> @stack_fold_pmulld_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 5278; CHECK-LABEL: stack_fold_pmulld_maskz: 5279; CHECK: # %bb.0: 5280; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5281; CHECK-NEXT: #APP 5282; CHECK-NEXT: nop 5283; CHECK-NEXT: #NO_APP 5284; CHECK-NEXT: kmovd %edi, %k1 5285; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5286; CHECK-NEXT: retq 5287 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5288 %2 = mul <16 x i32> %a0, %a1 5289 %3 = bitcast i16 %mask to <16 x i1> 5290 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 5291 ret <16 x i32> %4 5292} 5293 5294define <16 x i32> @stack_fold_pmulld_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 5295; CHECK-LABEL: stack_fold_pmulld_maskz_commuted: 5296; CHECK: # %bb.0: 5297; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5298; CHECK-NEXT: #APP 5299; CHECK-NEXT: nop 5300; CHECK-NEXT: #NO_APP 5301; CHECK-NEXT: kmovd %edi, %k1 5302; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5303; CHECK-NEXT: retq 5304 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5305 %2 = mul <16 x i32> %a1, %a0 
5306 %3 = bitcast i16 %mask to <16 x i1> 5307 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 5308 ret <16 x i32> %4 5309} 5310 5311define <8 x i64> @stack_fold_pmullq(<8 x i64> %a0, <8 x i64> %a1) { 5312; CHECK-LABEL: stack_fold_pmullq: 5313; CHECK: # %bb.0: 5314; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5315; CHECK-NEXT: #APP 5316; CHECK-NEXT: nop 5317; CHECK-NEXT: #NO_APP 5318; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5319; CHECK-NEXT: retq 5320 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5321 %2 = mul <8 x i64> %a0, %a1 5322 ret <8 x i64> %2 5323} 5324 5325define <8 x i64> @stack_fold_pmullq_commuted(<8 x i64> %a0, <8 x i64> %a1) { 5326; CHECK-LABEL: stack_fold_pmullq_commuted: 5327; CHECK: # %bb.0: 5328; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5329; CHECK-NEXT: #APP 5330; CHECK-NEXT: nop 5331; CHECK-NEXT: #NO_APP 5332; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5333; CHECK-NEXT: retq 5334 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5335 %2 = mul <8 x i64> %a1, %a0 5336 ret <8 x i64> %2 5337} 5338 5339define <8 x i64> @stack_fold_pmullq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) { 5340; CHECK-LABEL: stack_fold_pmullq_mask: 5341; CHECK: # %bb.0: 5342; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5343; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5344; CHECK-NEXT: #APP 5345; CHECK-NEXT: nop 5346; CHECK-NEXT: #NO_APP 5347; CHECK-NEXT: kmovd %esi, %k1 5348; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5349; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5350; CHECK-NEXT: retq 5351 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5352 %2 = mul <8 x i64> %a0, %a1 5353 %3 = bitcast i8 %mask to <8 x i1> 5354 ; load needed to keep the operation from being scheduled about the asm block 5355 %4 = load <8 x i64>, ptr %a2 5356 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 5357 ret <8 x i64> %5 5358} 5359 5360define <8 x i64> @stack_fold_pmullq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) { 5361; CHECK-LABEL: stack_fold_pmullq_mask_commuted: 5362; CHECK: # %bb.0: 5363; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5364; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5365; CHECK-NEXT: #APP 5366; CHECK-NEXT: nop 5367; CHECK-NEXT: #NO_APP 5368; CHECK-NEXT: kmovd %esi, %k1 5369; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5370; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5371; CHECK-NEXT: retq 5372 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5373 %2 = mul <8 x i64> %a1, %a0 5374 %3 = bitcast i8 %mask to <8 x i1> 5375 ; load needed to keep the operation from being scheduled about the asm block 5376 %4 = load <8 x i64>, ptr %a2 5377 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 5378 ret <8 x i64> %5 5379} 5380 5381define <8 x i64> @stack_fold_pmullq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 5382; CHECK-LABEL: stack_fold_pmullq_maskz: 5383; CHECK: # %bb.0: 5384; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5385; CHECK-NEXT: #APP 5386; CHECK-NEXT: nop 5387; CHECK-NEXT: #NO_APP 5388; CHECK-NEXT: kmovd %edi, %k1 5389; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5390; CHECK-NEXT: retq 5391 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5392 %2 = mul <8 x i64> %a0, %a1 5393 %3 = bitcast i8 %mask to <8 x i1> 5394 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 5395 ret <8 x i64> %4 5396} 5397 5398define <8 x i64> @stack_fold_pmullq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 5399; CHECK-LABEL: stack_fold_pmullq_maskz_commuted: 5400; CHECK: # %bb.0: 5401; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5402; CHECK-NEXT: #APP 5403; CHECK-NEXT: nop 5404; CHECK-NEXT: #NO_APP 5405; CHECK-NEXT: kmovd %edi, %k1 5406; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5407; CHECK-NEXT: retq 5408 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5409 %2 = mul <8 x i64> %a1, %a0 5410 %3 = bitcast i8 %mask to <8 x i1> 5411 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 5412 ret <8 x i64> %4 5413} 5414 5415define <32 x i16> @stack_fold_pmullw(<32 x i16> %a0, <32 x i16> %a1) { 5416; CHECK-LABEL: stack_fold_pmullw: 5417; CHECK: # %bb.0: 5418; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5419; CHECK-NEXT: #APP 5420; CHECK-NEXT: nop 5421; CHECK-NEXT: #NO_APP 5422; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5423; CHECK-NEXT: retq 5424 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5425 %2 = mul <32 x i16> %a0, %a1 5426 ret <32 x i16> %2 5427} 5428 5429define <32 x i16> @stack_fold_pmullw_commuted(<32 x i16> %a0, <32 x i16> %a1) { 5430; CHECK-LABEL: stack_fold_pmullw_commuted: 5431; CHECK: # %bb.0: 5432; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5433; CHECK-NEXT: #APP 5434; CHECK-NEXT: 
nop 5435; CHECK-NEXT: #NO_APP 5436; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5437; CHECK-NEXT: retq 5438 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5439 %2 = mul <32 x i16> %a1, %a0 5440 ret <32 x i16> %2 5441} 5442 5443define <32 x i16> @stack_fold_pmullw_mask(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) { 5444; CHECK-LABEL: stack_fold_pmullw_mask: 5445; CHECK: # %bb.0: 5446; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5447; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5448; CHECK-NEXT: #APP 5449; CHECK-NEXT: nop 5450; CHECK-NEXT: #NO_APP 5451; CHECK-NEXT: kmovd %esi, %k1 5452; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5453; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5454; CHECK-NEXT: retq 5455 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5456 %2 = mul <32 x i16> %a0, %a1 5457 %3 = bitcast i32 %mask to <32 x i1> 5458 ; load needed to keep the operation from being scheduled about the asm block 5459 %4 = load <32 x i16>, ptr %a2 5460 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 5461 ret <32 x i16> %5 5462} 5463 5464define <32 x i16> @stack_fold_pmullw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) { 5465; CHECK-LABEL: stack_fold_pmullw_mask_commuted: 5466; CHECK: # %bb.0: 5467; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5468; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5469; CHECK-NEXT: #APP 5470; CHECK-NEXT: nop 5471; CHECK-NEXT: #NO_APP 5472; CHECK-NEXT: kmovd %esi, %k1 5473; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5474; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5475; CHECK-NEXT: retq 5476 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5477 %2 = mul <32 x i16> %a1, %a0 5478 %3 = bitcast i32 %mask to <32 x i1> 5479 ; load needed to keep the operation from being scheduled about the asm block 5480 %4 = load <32 x i16>, ptr %a2 5481 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 5482 ret <32 x i16> %5 5483} 5484 5485define <32 x i16> @stack_fold_pmullw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 5486; CHECK-LABEL: stack_fold_pmullw_maskz: 5487; CHECK: # %bb.0: 5488; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5489; CHECK-NEXT: #APP 5490; CHECK-NEXT: nop 5491; CHECK-NEXT: #NO_APP 5492; CHECK-NEXT: kmovd %edi, %k1 5493; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5494; CHECK-NEXT: retq 5495 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5496 %2 = mul <32 x i16> %a0, %a1 5497 %3 = bitcast i32 %mask to <32 x i1> 5498 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 5499 ret <32 x i16> %4 5500} 5501 5502define <32 x i16> @stack_fold_pmullw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { 5503; CHECK-LABEL: stack_fold_pmullw_maskz_commuted: 5504; CHECK: # %bb.0: 5505; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5506; CHECK-NEXT: #APP 5507; CHECK-NEXT: nop 5508; CHECK-NEXT: #NO_APP 5509; CHECK-NEXT: kmovd %edi, %k1 5510; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5511; CHECK-NEXT: retq 5512 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5513 %2 = mul <32 x i16> %a1, %a0 5514 %3 = bitcast i32 %mask to <32 x i1> 5515 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 5516 ret <32 x i16> %4 5517} 5518 5519define <8 x i64> @stack_fold_pmuldq(<8 x i64> %a0, <8 x i64> %a1) { 5520; CHECK-LABEL: stack_fold_pmuldq: 5521; CHECK: # %bb.0: 5522; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5523; CHECK-NEXT: #APP 5524; CHECK-NEXT: nop 5525; CHECK-NEXT: #NO_APP 5526; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5527; CHECK-NEXT: retq 5528 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5529 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5530 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5531 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5532 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5533 %6 = mul <8 x i64> %3, %5 5534 ret <8 x i64> %6 5535} 5536 5537define <8 x i64> @stack_fold_pmuldq_commuted(<8 x i64> %a0, <8 x i64> %a1) { 5538; CHECK-LABEL: stack_fold_pmuldq_commuted: 5539; CHECK: # %bb.0: 5540; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5541; CHECK-NEXT: #APP 5542; CHECK-NEXT: nop 5543; CHECK-NEXT: #NO_APP 5544; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5545; CHECK-NEXT: retq 5546 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5547 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5548 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5549 %4 = shl <8 x 
i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5550 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5551 %6 = mul <8 x i64> %5, %3 5552 ret <8 x i64> %6 5553} 5554 5555define <8 x i64> @stack_fold_pmuldq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) { 5556; CHECK-LABEL: stack_fold_pmuldq_mask: 5557; CHECK: # %bb.0: 5558; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5559; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5560; CHECK-NEXT: #APP 5561; CHECK-NEXT: nop 5562; CHECK-NEXT: #NO_APP 5563; CHECK-NEXT: kmovd %esi, %k1 5564; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5565; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5566; CHECK-NEXT: retq 5567 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5568 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5569 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5570 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5571 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5572 %6 = mul <8 x i64> %3, %5 5573 %7 = bitcast i8 %mask to <8 x i1> 5574 ; load needed to keep the operation from being scheduled about the asm block 5575 %8 = load <8 x i64>, ptr %a2 5576 %9 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> %8 5577 ret <8 x i64> %9 5578} 5579 5580define <8 x i64> @stack_fold_pmuldq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) { 5581; CHECK-LABEL: stack_fold_pmuldq_mask_commuted: 5582; CHECK: # %bb.0: 5583; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5584; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5585; CHECK-NEXT: #APP 5586; CHECK-NEXT: nop 5587; CHECK-NEXT: #NO_APP 5588; CHECK-NEXT: kmovd %esi, %k1 5589; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5590; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5591; CHECK-NEXT: retq 5592 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5593 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5594 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5595 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5596 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5597 %6 = mul <8 x i64> %5, %3 5598 %7 = bitcast i8 %mask to <8 x i1> 5599 ; load needed to keep the operation from being scheduled about the asm block 5600 %8 = load <8 x i64>, ptr %a2 5601 %9 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> %8 5602 ret <8 x i64> %9 5603} 5604 5605define <8 x i64> @stack_fold_pmuldq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 5606; CHECK-LABEL: stack_fold_pmuldq_maskz: 5607; CHECK: # %bb.0: 5608; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5609; CHECK-NEXT: #APP 5610; 
CHECK-NEXT: nop 5611; CHECK-NEXT: #NO_APP 5612; CHECK-NEXT: kmovd %edi, %k1 5613; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5614; CHECK-NEXT: retq 5615 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5616 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5617 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5618 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5619 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5620 %6 = mul <8 x i64> %3, %5 5621 %7 = bitcast i8 %mask to <8 x i1> 5622 %8 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> zeroinitializer 5623 ret <8 x i64> %8 5624} 5625 5626define <8 x i64> @stack_fold_pmuldq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 5627; CHECK-LABEL: stack_fold_pmuldq_maskz_commuted: 5628; CHECK: # %bb.0: 5629; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5630; CHECK-NEXT: #APP 5631; CHECK-NEXT: nop 5632; CHECK-NEXT: #NO_APP 5633; CHECK-NEXT: kmovd %edi, %k1 5634; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5635; CHECK-NEXT: retq 5636 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5637 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5638 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5639 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5640 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 5641 %6 = mul <8 x i64> %5, %3 5642 %7 = bitcast i8 %mask to <8 x i1> 5643 %8 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> zeroinitializer 5644 ret <8 x i64> %8 5645} 5646 5647 5648 5649 5650define <8 x i64> @stack_fold_pmuludq(<8 x i64> %a0, <8 x i64> %a1) { 5651; CHECK-LABEL: stack_fold_pmuludq: 5652; CHECK: # %bb.0: 5653; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5654; CHECK-NEXT: #APP 5655; CHECK-NEXT: nop 5656; CHECK-NEXT: #NO_APP 5657; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5658; CHECK-NEXT: retq 5659 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5660 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5661 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5662 %4 = mul <8 x i64> %2, %3 5663 ret <8 x i64> %4 5664} 5665 
5666define <8 x i64> @stack_fold_pmuludq_commuted(<8 x i64> %a0, <8 x i64> %a1) { 5667; CHECK-LABEL: stack_fold_pmuludq_commuted: 5668; CHECK: # %bb.0: 5669; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5670; CHECK-NEXT: #APP 5671; CHECK-NEXT: nop 5672; CHECK-NEXT: #NO_APP 5673; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5674; CHECK-NEXT: retq 5675 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5676 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5677 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5678 %4 = mul <8 x i64> %3, %2 5679 ret <8 x i64> %4 5680} 5681 5682define <8 x i64> @stack_fold_pmuludq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) { 5683; CHECK-LABEL: stack_fold_pmuludq_mask: 5684; CHECK: # %bb.0: 5685; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5686; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5687; CHECK-NEXT: #APP 5688; CHECK-NEXT: nop 5689; CHECK-NEXT: #NO_APP 5690; CHECK-NEXT: kmovd %esi, %k1 5691; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5692; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5693; CHECK-NEXT: retq 5694 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5695 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5696 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5697 %4 = mul <8 x i64> %2, %3 5698 %5 = bitcast i8 %mask to <8 x i1> 5699 ; load needed to keep the operation from being scheduled about the asm block 5700 %6 = load <8 x i64>, ptr %a2 5701 %7 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %6 5702 ret <8 x i64> %7 5703} 5704 5705define <8 x i64> @stack_fold_pmuludq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) { 5706; CHECK-LABEL: stack_fold_pmuludq_mask_commuted: 5707; CHECK: # %bb.0: 5708; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5709; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 5710; CHECK-NEXT: #APP 5711; CHECK-NEXT: nop 5712; CHECK-NEXT: #NO_APP 5713; CHECK-NEXT: kmovd %esi, %k1 5714; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 5715; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5716; CHECK-NEXT: retq 5717 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5718 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 
4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5719 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5720 %4 = mul <8 x i64> %3, %2 5721 %5 = bitcast i8 %mask to <8 x i1> 5722 ; load needed to keep the operation from being scheduled about the asm block 5723 %6 = load <8 x i64>, ptr %a2 5724 %7 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %6 5725 ret <8 x i64> %7 5726} 5727 5728define <8 x i64> @stack_fold_pmuludq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 5729; CHECK-LABEL: stack_fold_pmuludq_maskz: 5730; CHECK: # %bb.0: 5731; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5732; CHECK-NEXT: #APP 5733; CHECK-NEXT: nop 5734; CHECK-NEXT: #NO_APP 5735; CHECK-NEXT: kmovd %edi, %k1 5736; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5737; CHECK-NEXT: retq 5738 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5739 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5740 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5741 %4 = mul <8 x i64> %2, %3 5742 %5 = bitcast i8 %mask to <8 x i1> 5743 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer 5744 ret <8 x i64> %6 5745} 5746 5747define <8 x i64> @stack_fold_pmuludq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 5748; CHECK-LABEL: stack_fold_pmuludq_maskz_commuted: 5749; CHECK: # %bb.0: 5750; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5751; CHECK-NEXT: #APP 5752; CHECK-NEXT: nop 5753; CHECK-NEXT: #NO_APP 5754; CHECK-NEXT: kmovd %edi, %k1 5755; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5756; CHECK-NEXT: retq 5757 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5758 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5759 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 5760 %4 = mul <8 x i64> %3, %2 5761 %5 = bitcast i8 %mask to <8 x i1> 5762 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer 5763 ret <8 x i64> %6 5764} 5765 5766define <16 x i32> @stack_fold_vpopcntd(<16 x i32> %a0) { 5767; CHECK-LABEL: stack_fold_vpopcntd: 5768; CHECK: # %bb.0: 5769; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5770; CHECK-NEXT: #APP 5771; CHECK-NEXT: nop 5772; CHECK-NEXT: #NO_APP 5773; CHECK-NEXT: vpopcntd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 5774; CHECK-NEXT: retq 5775 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5776 %2 = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a0) 5777 ret <16 x i32> %2 5778} 5779declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readonly 5780 5781define <8 x i64> @stack_fold_vpopcntq(<8 x i64> %a0) { 5782; CHECK-LABEL: stack_fold_vpopcntq: 5783; CHECK: # %bb.0: 5784; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5785; CHECK-NEXT: #APP 5786; CHECK-NEXT: nop 5787; CHECK-NEXT: #NO_APP 5788; CHECK-NEXT: vpopcntq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 5789; CHECK-NEXT: retq 5790 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5791 %2 = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a0) 5792 ret <8 x i64> %2 5793} 5794declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone 5795 5796define <16 x i32> @stack_fold_pord(<16 x i32> %a0, <16 x i32> %a1) { 5797; CHECK-LABEL: stack_fold_pord: 5798; CHECK: # %bb.0: 5799; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5800; CHECK-NEXT: #APP 5801; CHECK-NEXT: nop 5802; CHECK-NEXT: #NO_APP 5803; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5804; CHECK-NEXT: retq 5805 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5806 %2 = or <16 x i32> %a0, %a1 5807 ret <16 x i32> %2 5808} 5809 5810define <16 x i32> @stack_fold_pord_commuted(<16 x i32> %a0, <16 x i32> %a1) { 5811; CHECK-LABEL: stack_fold_pord_commuted: 5812; CHECK: # %bb.0: 5813; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5814; CHECK-NEXT: #APP 5815; CHECK-NEXT: nop 5816; CHECK-NEXT: #NO_APP 5817; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5818; CHECK-NEXT: retq 5819 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5820 %2 = or <16 x i32> %a1, %a0 5821 ret <16 x i32> %2 5822} 5823 5824define <16 x i32> @stack_fold_pord_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) { 5825; CHECK-LABEL: stack_fold_pord_mask: 5826; CHECK: # %bb.0: 5827; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5828; CHECK-NEXT: vmovaps %zmm0, %zmm1 5829; CHECK-NEXT: #APP 5830; CHECK-NEXT: nop 5831; CHECK-NEXT: #NO_APP 5832; CHECK-NEXT: kmovd %esi, %k1 5833; CHECK-NEXT: vmovaps (%rdi), %zmm0 5834; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5835; CHECK-NEXT: retq 5836 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5837 %2 = or <16 x i32> %a0, %a1 5838 %3 = bitcast i16 %mask to <16 x i1> 5839 ; load needed to keep the operation from being scheduled about the asm block 5840 %4 = load <16 x i32>, ptr %a2 5841 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 5842 ret <16 x i32> %5 5843} 5844 5845define <16 x i32> @stack_fold_pord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) { 5846; CHECK-LABEL: stack_fold_pord_mask_commuted: 5847; CHECK: # %bb.0: 5848; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5849; CHECK-NEXT: vmovaps %zmm0, %zmm1 5850; CHECK-NEXT: #APP 5851; CHECK-NEXT: nop 5852; CHECK-NEXT: #NO_APP 5853; CHECK-NEXT: kmovd %esi, %k1 5854; CHECK-NEXT: vmovaps (%rdi), %zmm0 5855; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5856; CHECK-NEXT: retq 5857 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5858 %2 = or <16 x i32> %a1, %a0 5859 %3 = bitcast i16 %mask to <16 x i1> 5860 ; load needed to keep the operation from being scheduled about the asm block 5861 %4 = load <16 x i32>, ptr %a2 5862 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 5863 ret <16 x i32> %5 5864} 5865 5866define <16 x i32> @stack_fold_pord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 5867; CHECK-LABEL: stack_fold_pord_maskz: 5868; CHECK: # %bb.0: 5869; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5870; CHECK-NEXT: #APP 5871; CHECK-NEXT: nop 5872; CHECK-NEXT: #NO_APP 5873; CHECK-NEXT: kmovd %edi, %k1 5874; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5875; CHECK-NEXT: retq 5876 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5877 %2 = or <16 x i32> %a0, %a1 5878 %3 = bitcast i16 %mask to <16 x i1> 5879 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 5880 ret <16 x i32> %4 5881} 5882 5883define <16 x i32> @stack_fold_pord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 5884; CHECK-LABEL: stack_fold_pord_maskz_commuted: 5885; CHECK: # %bb.0: 5886; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5887; CHECK-NEXT: #APP 5888; CHECK-NEXT: nop 5889; CHECK-NEXT: #NO_APP 5890; CHECK-NEXT: kmovd %edi, %k1 5891; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5892; CHECK-NEXT: retq 5893 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5894 %2 = or <16 x i32> %a1, %a0 5895 %3 = bitcast i16 
%mask to <16 x i1> 5896 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 5897 ret <16 x i32> %4 5898} 5899 5900define <8 x i64> @stack_fold_porq(<8 x i64> %a0, <8 x i64> %a1) { 5901; CHECK-LABEL: stack_fold_porq: 5902; CHECK: # %bb.0: 5903; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5904; CHECK-NEXT: #APP 5905; CHECK-NEXT: nop 5906; CHECK-NEXT: #NO_APP 5907; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5908; CHECK-NEXT: retq 5909 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5910 %2 = or <8 x i64> %a0, %a1 5911 ret <8 x i64> %2 5912} 5913 5914define <8 x i64> @stack_fold_porq_commuted(<8 x i64> %a0, <8 x i64> %a1) { 5915; CHECK-LABEL: stack_fold_porq_commuted: 5916; CHECK: # %bb.0: 5917; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5918; CHECK-NEXT: #APP 5919; CHECK-NEXT: nop 5920; CHECK-NEXT: #NO_APP 5921; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 5922; CHECK-NEXT: retq 5923 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5924 %2 = or <8 x i64> %a1, %a0 5925 ret <8 x i64> %2 5926} 5927 5928define <8 x i64> @stack_fold_porq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) { 5929; CHECK-LABEL: stack_fold_porq_mask: 5930; CHECK: # %bb.0: 5931; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5932; CHECK-NEXT: vmovapd %zmm0, %zmm1 5933; CHECK-NEXT: #APP 5934; CHECK-NEXT: nop 5935; CHECK-NEXT: #NO_APP 5936; CHECK-NEXT: kmovd %esi, %k1 5937; CHECK-NEXT: vmovapd (%rdi), %zmm0 5938; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5939; CHECK-NEXT: retq 5940 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5941 %2 = or <8 x i64> %a0, %a1 5942 %3 = bitcast i8 %mask to <8 x i1> 5943 ; load needed to keep the operation from being scheduled about the asm block 5944 %4 = load <8 x i64>, ptr %a2 5945 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 5946 ret <8 x i64> %5 5947} 5948 5949define <8 x i64> @stack_fold_porq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) { 5950; CHECK-LABEL: stack_fold_porq_mask_commuted: 5951; CHECK: # %bb.0: 5952; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5953; CHECK-NEXT: vmovapd %zmm0, %zmm1 5954; CHECK-NEXT: #APP 5955; CHECK-NEXT: nop 5956; CHECK-NEXT: #NO_APP 5957; CHECK-NEXT: kmovd %esi, %k1 5958; CHECK-NEXT: vmovapd (%rdi), %zmm0 5959; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 5960; CHECK-NEXT: retq 5961 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5962 %2 = or <8 x i64> %a1, %a0 5963 %3 = bitcast i8 %mask to <8 x i1> 5964 ; load needed to keep the operation from being scheduled about the asm block 5965 %4 = load <8 x i64>, ptr %a2 5966 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 5967 ret <8 x i64> %5 5968} 5969 5970define <8 x i64> @stack_fold_porq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 5971; CHECK-LABEL: stack_fold_porq_maskz: 5972; CHECK: # %bb.0: 5973; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5974; CHECK-NEXT: #APP 5975; CHECK-NEXT: nop 5976; CHECK-NEXT: #NO_APP 5977; CHECK-NEXT: kmovd %edi, %k1 5978; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5979; CHECK-NEXT: retq 5980 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5981 %2 = or <8 x i64> %a0, %a1 5982 %3 = bitcast i8 %mask to <8 x i1> 5983 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 5984 ret <8 x i64> %4 5985} 5986 5987define <8 x i64> @stack_fold_porq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 5988; CHECK-LABEL: stack_fold_porq_maskz_commuted: 5989; CHECK: # %bb.0: 5990; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 5991; CHECK-NEXT: #APP 5992; CHECK-NEXT: nop 5993; CHECK-NEXT: #NO_APP 5994; CHECK-NEXT: kmovd %edi, %k1 5995; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 5996; CHECK-NEXT: retq 5997 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 5998 %2 = or <8 x i64> %a1, %a0 5999 %3 = bitcast i8 %mask to <8 x i1> 6000 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 6001 ret <8 x i64> %4 6002} 6003 6004define <8 x i64> @stack_fold_psadbw(<64 x i8> %a0, <64 x i8> %a1) { 6005; CHECK-LABEL: stack_fold_psadbw: 6006; CHECK: # %bb.0: 6007; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6008; CHECK-NEXT: #APP 6009; CHECK-NEXT: nop 6010; CHECK-NEXT: #NO_APP 6011; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6012; CHECK-NEXT: retq 6013 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6014 %2 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %a0, <64 x i8> %a1) 6015 ret <8 x i64> %2 6016} 6017declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>) nounwind readnone 6018 6019define <8 x i64> @stack_fold_psadbw_commute(<64 x i8> %a0, <64 x i8> %a1) { 6020; CHECK-LABEL: stack_fold_psadbw_commute: 6021; CHECK: # 
%bb.0: 6022; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6023; CHECK-NEXT: #APP 6024; CHECK-NEXT: nop 6025; CHECK-NEXT: #NO_APP 6026; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6027; CHECK-NEXT: retq 6028 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6029 %2 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %a1, <64 x i8> %a0) 6030 ret <8 x i64> %2 6031} 6032 6033define <64 x i8> @stack_fold_pshufb_zmm(<64 x i8> %a0, <64 x i8> %a1) { 6034; CHECK-LABEL: stack_fold_pshufb_zmm: 6035; CHECK: # %bb.0: 6036; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6037; CHECK-NEXT: #APP 6038; CHECK-NEXT: nop 6039; CHECK-NEXT: #NO_APP 6040; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6041; CHECK-NEXT: retq 6042 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6043 %2 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1) 6044 ret <64 x i8> %2 6045} 6046declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>) 6047 6048define <64 x i8> @stack_fold_pshufb_zmm_mask(ptr %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 6049; CHECK-LABEL: stack_fold_pshufb_zmm_mask: 6050; CHECK: # %bb.0: 6051; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6052; CHECK-NEXT: #APP 6053; CHECK-NEXT: nop 6054; CHECK-NEXT: #NO_APP 6055; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 6056; CHECK-NEXT: kmovq %rsi, %k1 6057; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 6058; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 6059; CHECK-NEXT: retq 6060 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6061 %2 = load <64 x i8>, ptr %passthru 6062 %3 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1) 6063 %4 = bitcast i64 %mask to <64 x i1> 6064 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %2 6065 ret <64 x i8> %5 6066} 6067 6068define <64 x i8> @stack_fold_pshufb_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 6069; CHECK-LABEL: stack_fold_pshufb_zmm_maskz: 6070; CHECK: # %bb.0: 6071; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6072; CHECK-NEXT: #APP 6073; CHECK-NEXT: nop 6074; CHECK-NEXT: #NO_APP 6075; CHECK-NEXT: kmovq %rdi, %k1 6076; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 6077; CHECK-NEXT: retq 6078 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6079 %2 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1) 6080 %3 = bitcast i64 %mask to <64 x i1> 6081 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 6082 ret <64 x i8> %4 6083} 6084 6085define <16 x i32> @stack_fold_pshufd_zmm(<16 x i32> %a0) { 6086; CHECK-LABEL: stack_fold_pshufd_zmm: 6087; CHECK: # %bb.0: 6088; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6089; CHECK-NEXT: #APP 6090; CHECK-NEXT: nop 6091; CHECK-NEXT: #NO_APP 6092; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6093; CHECK-NEXT: # zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 6094; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 6095; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 6096; CHECK-NEXT: retq 6097 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6098 %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> 6099 %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 6100 ret <16 x i32> %3 6101} 6102 6103define <16 x i32> @stack_fold_pshufd_zmm_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) { 6104; CHECK-LABEL: stack_fold_pshufd_zmm_mask: 6105; CHECK: # %bb.0: 6106; CHECK-NEXT: pushq %rax 6107; CHECK-NEXT: .cfi_def_cfa_offset 16 6108; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6109; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6110; CHECK-NEXT: #APP 6111; CHECK-NEXT: nop 6112; CHECK-NEXT: #NO_APP 6113; CHECK-NEXT: kmovd %edi, %k1 6114; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 6115; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload 6116; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 6117; CHECK-NEXT: popq %rax 6118; CHECK-NEXT: .cfi_def_cfa_offset 8 6119; CHECK-NEXT: retq 6120 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6121 %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> 6122 %3 = bitcast i16 %mask to <16 x i1> 6123 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %passthru 6124 ret <16 x i32> %4 6125} 6126 6127define <16 x i32> @stack_fold_pshufd_zmm_maskz(<16 x i32> %a0, i16 %mask) { 6128; CHECK-LABEL: stack_fold_pshufd_zmm_maskz: 6129; CHECK: # %bb.0: 6130; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6131; CHECK-NEXT: #APP 6132; CHECK-NEXT: nop 6133; CHECK-NEXT: #NO_APP 6134; 
CHECK-NEXT: kmovd %edi, %k1 6135; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload 6136; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 6137; CHECK-NEXT: retq 6138 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6139 %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> 6140 %3 = bitcast i16 %mask to <16 x i1> 6141 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 6142 ret <16 x i32> %4 6143} 6144 6145define <32 x i16> @stack_fold_pshufhw_zmm(<32 x i16> %a0) { 6146; CHECK-LABEL: stack_fold_pshufhw_zmm: 6147; CHECK: # %bb.0: 6148; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6149; CHECK-NEXT: #APP 6150; CHECK-NEXT: nop 6151; CHECK-NEXT: #NO_APP 6152; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6153; CHECK-NEXT: # zmm0 = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28] 6154; CHECK-NEXT: retq 6155 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6156 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28> 6157 ret <32 x i16> %2 6158} 6159 6160define <32 x i16> @stack_fold_pshufhw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) { 6161; CHECK-LABEL: stack_fold_pshufhw_zmm_mask: 6162; CHECK: # %bb.0: 6163; CHECK-NEXT: pushq %rax 6164; CHECK-NEXT: .cfi_def_cfa_offset 16 6165; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6166; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6167; CHECK-NEXT: #APP 6168; CHECK-NEXT: nop 6169; CHECK-NEXT: #NO_APP 6170; CHECK-NEXT: kmovd %edi, %k1 6171; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 6172; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload 6173; CHECK-NEXT: # zmm0 {%k1} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28] 6174; CHECK-NEXT: popq %rax 6175; CHECK-NEXT: .cfi_def_cfa_offset 8 6176; CHECK-NEXT: retq 6177 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6178 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 
21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28> 6179 %3 = bitcast i32 %mask to <32 x i1> 6180 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %passthru 6181 ret <32 x i16> %4 6182} 6183 6184define <32 x i16> @stack_fold_pshufhw_zmm_maskz(<32 x i16> %a0, i32 %mask) { 6185; CHECK-LABEL: stack_fold_pshufhw_zmm_maskz: 6186; CHECK: # %bb.0: 6187; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6188; CHECK-NEXT: #APP 6189; CHECK-NEXT: nop 6190; CHECK-NEXT: #NO_APP 6191; CHECK-NEXT: kmovd %edi, %k1 6192; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload 6193; CHECK-NEXT: # zmm0 {%k1} {z} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28] 6194; CHECK-NEXT: retq 6195 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6196 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28> 6197 %3 = bitcast i32 %mask to <32 x i1> 6198 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 6199 ret <32 x i16> %4 6200} 6201 6202define <32 x i16> @stack_fold_pshuflw_zmm(<32 x i16> %a0) { 6203; CHECK-LABEL: stack_fold_pshuflw_zmm: 6204; CHECK: # %bb.0: 6205; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6206; CHECK-NEXT: #APP 6207; CHECK-NEXT: nop 6208; CHECK-NEXT: #NO_APP 6209; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6210; CHECK-NEXT: # zmm0 = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31] 6211; CHECK-NEXT: retq 6212 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6213 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31> 6214 ret <32 x i16> %2 6215} 6216 6217define <32 x i16> @stack_fold_pshuflw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) { 6218; CHECK-LABEL: stack_fold_pshuflw_zmm_mask: 6219; CHECK: # %bb.0: 6220; CHECK-NEXT: pushq %rax 6221; CHECK-NEXT: .cfi_def_cfa_offset 16 6222; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6223; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6224; CHECK-NEXT: #APP 6225; CHECK-NEXT: nop 6226; CHECK-NEXT: #NO_APP 6227; CHECK-NEXT: kmovd %edi, %k1 6228; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 6229; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload 6230; CHECK-NEXT: # zmm0 {%k1} = 
mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31] 6231; CHECK-NEXT: popq %rax 6232; CHECK-NEXT: .cfi_def_cfa_offset 8 6233; CHECK-NEXT: retq 6234 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6235 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31> 6236 %3 = bitcast i32 %mask to <32 x i1> 6237 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %passthru 6238 ret <32 x i16> %4 6239} 6240 6241define <32 x i16> @stack_fold_pshuflw_zmm_maskz(<32 x i16> %a0, i32 %mask) { 6242; CHECK-LABEL: stack_fold_pshuflw_zmm_maskz: 6243; CHECK: # %bb.0: 6244; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6245; CHECK-NEXT: #APP 6246; CHECK-NEXT: nop 6247; CHECK-NEXT: #NO_APP 6248; CHECK-NEXT: kmovd %edi, %k1 6249; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload 6250; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31] 6251; CHECK-NEXT: retq 6252 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6253 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31> 6254 %3 = bitcast i32 %mask to <32 x i1> 6255 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer 6256 ret <32 x i16> %4 6257} 6258 6259define <16 x i32> @stack_fold_pslld(<16 x i32> %a0, <4 x i32> %a1) { 6260; CHECK-LABEL: stack_fold_pslld: 6261; CHECK: # %bb.0: 6262; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6263; CHECK-NEXT: #APP 6264; CHECK-NEXT: nop 6265; CHECK-NEXT: #NO_APP 6266; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload 6267; CHECK-NEXT: retq 6268 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6269 %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) 6270 ret <16 x i32> %2 6271} 6272declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone 6273 6274define <16 x i32> @stack_fold_pslld_mask(ptr %passthru, <16 x i32> %a0, <4 x i32> %a1, i16 %mask) { 6275; CHECK-LABEL: stack_fold_pslld_mask: 6276; CHECK: # %bb.0: 6277; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6278; CHECK-NEXT: #APP 
6279; CHECK-NEXT: nop 6280; CHECK-NEXT: #NO_APP 6281; CHECK-NEXT: kmovd %esi, %k1 6282; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 6283; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload 6284; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 6285; CHECK-NEXT: retq 6286 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6287 %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) 6288 %3 = bitcast i16 %mask to <16 x i1> 6289 %4 = load <16 x i32>, ptr %passthru 6290 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 6291 ret <16 x i32> %5 6292} 6293 6294define <16 x i32> @stack_fold_pslld_maskz(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) { 6295; CHECK-LABEL: stack_fold_pslld_maskz: 6296; CHECK: # %bb.0: 6297; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6298; CHECK-NEXT: #APP 6299; CHECK-NEXT: nop 6300; CHECK-NEXT: #NO_APP 6301; CHECK-NEXT: kmovd %edi, %k1 6302; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 16-byte Folded Reload 6303; CHECK-NEXT: retq 6304 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6305 %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) 6306 %3 = bitcast i16 %mask to <16 x i1> 6307 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 6308 ret <16 x i32> %4 6309} 6310 6311define <16 x i32> @stack_fold_pslldi(<16 x i32> %a0) { 6312; CHECK-LABEL: stack_fold_pslldi: 6313; CHECK: # %bb.0: 6314; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6315; CHECK-NEXT: #APP 6316; CHECK-NEXT: nop 6317; CHECK-NEXT: #NO_APP 6318; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6319; CHECK-NEXT: retq 6320 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6321 %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1) 6322 ret <16 x i32> %2 6323} 6324declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone 6325 6326define <16 x i32> @stack_fold_pslldi_mask(ptr %passthru, <16 x i32> %a0, i16 %mask) { 6327; CHECK-LABEL: stack_fold_pslldi_mask: 6328; CHECK: # %bb.0: 6329; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6330; CHECK-NEXT: #APP 6331; CHECK-NEXT: nop 6332; CHECK-NEXT: #NO_APP 6333; CHECK-NEXT: kmovd %esi, %k1 6334; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 6335; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload 6336; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 6337; CHECK-NEXT: retq 6338 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6339 %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1) 6340 %3 = bitcast i16 %mask to <16 x i1> 6341 %4 = load <16 x i32>, ptr %passthru 6342 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 6343 ret <16 x i32> %5 6344} 6345 6346define <16 x i32> @stack_fold_pslldi_maskz(<16 x i32> %a0, i16 %mask) { 6347; CHECK-LABEL: stack_fold_pslldi_maskz: 6348; CHECK: # %bb.0: 6349; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6350; CHECK-NEXT: #APP 6351; CHECK-NEXT: nop 6352; CHECK-NEXT: #NO_APP 6353; CHECK-NEXT: kmovd %edi, %k1 6354; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload 6355; CHECK-NEXT: retq 6356 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6357 %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1) 6358 %3 = bitcast i16 %mask to <16 x i1> 6359 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 6360 ret <16 x i32> %4 6361} 6362 6363define <64 x i8> @stack_fold_pslldq(<64 x i8> %a, <64 x i8> %b) { 6364; CHECK-LABEL: stack_fold_pslldq: 6365; CHECK: # %bb.0: 6366; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6367; CHECK-NEXT: #APP 6368; CHECK-NEXT: nop 6369; CHECK-NEXT: #NO_APP 6370; CHECK-NEXT: vpslldq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6371; CHECK-NEXT: # zmm0 = zero,mem[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,mem[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,mem[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,mem[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62] 6372; CHECK-NEXT: retq 6373 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6374 %2 = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62> 6375 ret <64 x i8> %2 6376} 6377 6378define <8 x i64> @stack_fold_psllq(<8 x i64> %a0, <2 x i64> %a1) { 6379; CHECK-LABEL: stack_fold_psllq: 6380; CHECK: # %bb.0: 6381; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6382; CHECK-NEXT: #APP 6383; CHECK-NEXT: nop 6384; CHECK-NEXT: #NO_APP 6385; CHECK-NEXT: vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload 6386; CHECK-NEXT: retq 6387 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6388 %2 = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) 6389 ret <8 x i64> %2 6390} 6391declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone 6392 6393define <8 x i64> @stack_fold_psllqi(<8 x i64> %a0) { 6394; CHECK-LABEL: stack_fold_psllqi: 6395; CHECK: # %bb.0: 6396; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6397; CHECK-NEXT: #APP 6398; CHECK-NEXT: nop 6399; CHECK-NEXT: #NO_APP 6400; CHECK-NEXT: vpsllq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6401; CHECK-NEXT: retq 6402 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6403 %2 = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 1) 6404 ret <8 x i64> %2 6405} 6406declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone 6407 6408define <16 x i32> @stack_fold_psllvd(<16 x i32> %a0, <16 x i32> %a1) { 6409; CHECK-LABEL: stack_fold_psllvd: 6410; CHECK: # %bb.0: 6411; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6412; CHECK-NEXT: #APP 6413; CHECK-NEXT: nop 6414; CHECK-NEXT: #NO_APP 6415; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6416; CHECK-NEXT: retq 6417 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6418 %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) 6419 ret <16 x i32> %2 6420} 6421declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone 6422 6423define <16 x i32> @stack_fold_psllvd_mask(ptr %passthru, <16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 6424; CHECK-LABEL: stack_fold_psllvd_mask: 6425; CHECK: # %bb.0: 6426; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6427; CHECK-NEXT: #APP 6428; CHECK-NEXT: nop 6429; CHECK-NEXT: #NO_APP 6430; CHECK-NEXT: kmovd %esi, %k1 6431; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 6432; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 6433; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 6434; CHECK-NEXT: retq 6435 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6436 %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) 6437 %3 = bitcast i16 %mask to <16 x i1> 6438 %4 = load <16 x i32>, ptr %passthru 6439 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 6440 ret <16 x i32> %5 6441} 6442 6443define <16 x i32> @stack_fold_psllvd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 
%mask) { 6444; CHECK-LABEL: stack_fold_psllvd_maskz: 6445; CHECK: # %bb.0: 6446; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6447; CHECK-NEXT: #APP 6448; CHECK-NEXT: nop 6449; CHECK-NEXT: #NO_APP 6450; CHECK-NEXT: kmovd %edi, %k1 6451; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 6452; CHECK-NEXT: retq 6453 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6454 %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) 6455 %3 = bitcast i16 %mask to <16 x i1> 6456 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 6457 ret <16 x i32> %4 6458} 6459 6460define <8 x i64> @stack_fold_psllvq(<8 x i64> %a0, <8 x i64> %a1) { 6461; CHECK-LABEL: stack_fold_psllvq: 6462; CHECK: # %bb.0: 6463; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6464; CHECK-NEXT: #APP 6465; CHECK-NEXT: nop 6466; CHECK-NEXT: #NO_APP 6467; CHECK-NEXT: vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6468; CHECK-NEXT: retq 6469 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6470 %2 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) 6471 ret <8 x i64> %2 6472} 6473declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone 6474 6475define <32 x i16> @stack_fold_psllvw(<32 x i16> %a0, <32 x i16> %a1) { 6476; CHECK-LABEL: stack_fold_psllvw: 6477; CHECK: # %bb.0: 6478; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6479; CHECK-NEXT: #APP 6480; CHECK-NEXT: nop 6481; CHECK-NEXT: #NO_APP 6482; CHECK-NEXT: vpsllvw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6483; CHECK-NEXT: retq 6484 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6485 %2 = call <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16> %a0, <32 x i16> %a1) 6486 ret <32 x i16> %2 6487} 6488declare <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16>, <32 x i16>) nounwind readnone 6489 6490define <32 x i16> @stack_fold_psllw(<32 x i16> %a0, <8 x i16> %a1) { 6491; CHECK-LABEL: stack_fold_psllw: 6492; CHECK: # %bb.0: 6493; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6494; CHECK-NEXT: #APP 6495; CHECK-NEXT: nop 6496; CHECK-NEXT: #NO_APP 6497; CHECK-NEXT: vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload 6498; CHECK-NEXT: retq 6499 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6500 %2 = call <32 x i16> 
@llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1) 6501 ret <32 x i16> %2 6502} 6503declare <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16>, <8 x i16>) nounwind readnone 6504 6505define <32 x i16> @stack_fold_psllwi(<32 x i16> %a0) { 6506; CHECK-LABEL: stack_fold_psllwi: 6507; CHECK: # %bb.0: 6508; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6509; CHECK-NEXT: #APP 6510; CHECK-NEXT: nop 6511; CHECK-NEXT: #NO_APP 6512; CHECK-NEXT: vpsllw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6513; CHECK-NEXT: retq 6514 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6515 %2 = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 1) 6516 ret <32 x i16> %2 6517} 6518declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32) nounwind readnone 6519 6520define <16 x i32> @stack_fold_psrad(<16 x i32> %a0, <4 x i32> %a1) { 6521; CHECK-LABEL: stack_fold_psrad: 6522; CHECK: # %bb.0: 6523; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6524; CHECK-NEXT: #APP 6525; CHECK-NEXT: nop 6526; CHECK-NEXT: #NO_APP 6527; CHECK-NEXT: vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload 6528; CHECK-NEXT: retq 6529 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6530 %2 = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) 6531 ret <16 x i32> %2 6532} 6533declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone 6534 6535define <16 x i32> @stack_fold_psradi(<16 x i32> %a0) { 6536; CHECK-LABEL: stack_fold_psradi: 6537; CHECK: # %bb.0: 6538; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6539; CHECK-NEXT: #APP 6540; CHECK-NEXT: nop 6541; CHECK-NEXT: #NO_APP 6542; CHECK-NEXT: vpsrad $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6543; CHECK-NEXT: retq 6544 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6545 %2 = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 1) 6546 ret <16 x i32> %2 6547} 6548declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone 6549 6550define <8 x i64> @stack_fold_psraq(<8 x i64> %a0, <2 x i64> %a1) { 6551; CHECK-LABEL: stack_fold_psraq: 6552; CHECK: # %bb.0: 6553; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6554; CHECK-NEXT: #APP 6555; CHECK-NEXT: nop 6556; CHECK-NEXT: #NO_APP 6557; CHECK-NEXT: vpsraq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload 6558; CHECK-NEXT: retq 6559 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6560 %2 = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) 6561 ret <8 x i64> %2 6562} 6563declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone 6564 6565define <8 x i64> @stack_fold_psraqi(<8 x i64> %a0) { 6566; CHECK-LABEL: stack_fold_psraqi: 6567; CHECK: # %bb.0: 6568; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6569; CHECK-NEXT: #APP 6570; CHECK-NEXT: nop 6571; CHECK-NEXT: #NO_APP 6572; CHECK-NEXT: vpsraq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6573; CHECK-NEXT: retq 6574 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6575 %2 = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 1) 6576 ret <8 x i64> %2 6577} 6578declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone 6579 6580define <16 x i32> @stack_fold_psravd(<16 x i32> %a0, <16 x i32> %a1) { 6581; CHECK-LABEL: stack_fold_psravd: 6582; CHECK: # %bb.0: 6583; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6584; CHECK-NEXT: #APP 6585; CHECK-NEXT: nop 6586; CHECK-NEXT: #NO_APP 6587; CHECK-NEXT: vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6588; CHECK-NEXT: retq 6589 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6590 %2 = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1) 6591 ret <16 x i32> %2 6592} 6593declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone 6594 6595define <8 x i64> @stack_fold_psravq(<8 x i64> %a0, <8 x i64> %a1) { 6596; CHECK-LABEL: stack_fold_psravq: 6597; CHECK: # %bb.0: 6598; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6599; CHECK-NEXT: #APP 6600; CHECK-NEXT: nop 6601; CHECK-NEXT: #NO_APP 6602; CHECK-NEXT: vpsravq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6603; CHECK-NEXT: retq 6604 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6605 %2 = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) 6606 ret <8 x i64> %2 6607} 6608declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone 6609 6610define <32 x i16> @stack_fold_psravw(<32 x i16> %a0, <32 x i16> %a1) { 6611; CHECK-LABEL: stack_fold_psravw: 6612; CHECK: # %bb.0: 6613; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6614; CHECK-NEXT: #APP 6615; CHECK-NEXT: nop 6616; CHECK-NEXT: #NO_APP 6617; CHECK-NEXT: 
vpsravw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6618; CHECK-NEXT: retq 6619 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6620 %2 = call <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16> %a0, <32 x i16> %a1) 6621 ret <32 x i16> %2 6622} 6623declare <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16>, <32 x i16>) nounwind readnone 6624 6625define <32 x i16> @stack_fold_psraw(<32 x i16> %a0, <8 x i16> %a1) { 6626; CHECK-LABEL: stack_fold_psraw: 6627; CHECK: # %bb.0: 6628; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6629; CHECK-NEXT: #APP 6630; CHECK-NEXT: nop 6631; CHECK-NEXT: #NO_APP 6632; CHECK-NEXT: vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload 6633; CHECK-NEXT: retq 6634 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6635 %2 = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1) 6636 ret <32 x i16> %2 6637} 6638declare <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16>, <8 x i16>) nounwind readnone 6639 6640define <32 x i16> @stack_fold_psrawi(<32 x i16> %a0) { 6641; CHECK-LABEL: stack_fold_psrawi: 6642; CHECK: # %bb.0: 6643; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6644; CHECK-NEXT: #APP 6645; CHECK-NEXT: nop 6646; CHECK-NEXT: #NO_APP 6647; CHECK-NEXT: vpsraw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6648; CHECK-NEXT: retq 6649 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6650 %2 = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 1) 6651 ret <32 x i16> %2 6652} 6653declare <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16>, i32) nounwind readnone 6654 6655define <16 x i32> @stack_fold_psrld(<16 x i32> %a0, <4 x i32> %a1) { 6656; CHECK-LABEL: stack_fold_psrld: 6657; CHECK: # %bb.0: 6658; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6659; CHECK-NEXT: #APP 6660; CHECK-NEXT: nop 6661; CHECK-NEXT: #NO_APP 6662; CHECK-NEXT: vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload 6663; CHECK-NEXT: retq 6664 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6665 %2 = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) 6666 ret <16 x i32> %2 6667} 6668declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone 6669 6670define <16 x i32> @stack_fold_psrldi(<16 x i32> %a0) { 6671; CHECK-LABEL: stack_fold_psrldi: 6672; CHECK: # %bb.0: 6673; CHECK-NEXT: vmovups 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6674; CHECK-NEXT: #APP 6675; CHECK-NEXT: nop 6676; CHECK-NEXT: #NO_APP 6677; CHECK-NEXT: vpsrld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6678; CHECK-NEXT: retq 6679 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6680 %2 = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 1) 6681 ret <16 x i32> %2 6682} 6683declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone 6684 6685define <64 x i8> @stack_fold_psrldq(<64 x i8> %a, <64 x i8> %b) { 6686; CHECK-LABEL: stack_fold_psrldq: 6687; CHECK: # %bb.0: 6688; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6689; CHECK-NEXT: #APP 6690; CHECK-NEXT: nop 6691; CHECK-NEXT: #NO_APP 6692; CHECK-NEXT: vpsrldq $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6693; CHECK-NEXT: # zmm0 = mem[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,mem[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,mem[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,mem[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero 6694; CHECK-NEXT: retq 6695 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6696 %2 = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 64, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 64, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 64, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 64> 6697 ret <64 x i8> %2 6698} 6699 6700define <8 x i64> @stack_fold_psrlq(<8 x i64> %a0, <2 x i64> %a1) { 6701; CHECK-LABEL: stack_fold_psrlq: 6702; CHECK: # %bb.0: 6703; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6704; CHECK-NEXT: #APP 6705; CHECK-NEXT: nop 6706; CHECK-NEXT: #NO_APP 6707; CHECK-NEXT: vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload 6708; CHECK-NEXT: retq 6709 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6710 %2 = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) 6711 ret <8 x i64> %2 6712} 6713declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone 6714 6715define <8 x i64> @stack_fold_psrlqi(<8 x i64> %a0) { 6716; CHECK-LABEL: stack_fold_psrlqi: 6717; CHECK: # %bb.0: 6718; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6719; CHECK-NEXT: #APP 6720; CHECK-NEXT: nop 6721; CHECK-NEXT: #NO_APP 6722; CHECK-NEXT: 
vpsrlq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6723; CHECK-NEXT: retq 6724 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6725 %2 = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 1) 6726 ret <8 x i64> %2 6727} 6728declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone 6729 6730define <16 x i32> @stack_fold_psrlvd(<16 x i32> %a0, <16 x i32> %a1) { 6731; CHECK-LABEL: stack_fold_psrlvd: 6732; CHECK: # %bb.0: 6733; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6734; CHECK-NEXT: #APP 6735; CHECK-NEXT: nop 6736; CHECK-NEXT: #NO_APP 6737; CHECK-NEXT: vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6738; CHECK-NEXT: retq 6739 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6740 %2 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1) 6741 ret <16 x i32> %2 6742} 6743declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone 6744 6745define <8 x i64> @stack_fold_psrlvq(<8 x i64> %a0, <8 x i64> %a1) { 6746; CHECK-LABEL: stack_fold_psrlvq: 6747; CHECK: # %bb.0: 6748; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6749; CHECK-NEXT: #APP 6750; CHECK-NEXT: nop 6751; CHECK-NEXT: #NO_APP 6752; CHECK-NEXT: vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6753; CHECK-NEXT: retq 6754 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6755 %2 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1) 6756 ret <8 x i64> %2 6757} 6758declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone 6759 6760define <32 x i16> @stack_fold_psrlvw(<32 x i16> %a0, <32 x i16> %a1) { 6761; CHECK-LABEL: stack_fold_psrlvw: 6762; CHECK: # %bb.0: 6763; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6764; CHECK-NEXT: #APP 6765; CHECK-NEXT: nop 6766; CHECK-NEXT: #NO_APP 6767; CHECK-NEXT: vpsrlvw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6768; CHECK-NEXT: retq 6769 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6770 %2 = call <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16> %a0, <32 x i16> %a1) 6771 ret <32 x i16> %2 6772} 6773declare <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16>, <32 x i16>) nounwind readnone 6774 6775define <32 x i16> @stack_fold_psrlw(<32 x i16> %a0, <8 x i16> %a1) { 6776; CHECK-LABEL: stack_fold_psrlw: 6777; CHECK: # %bb.0: 
6778; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6779; CHECK-NEXT: #APP 6780; CHECK-NEXT: nop 6781; CHECK-NEXT: #NO_APP 6782; CHECK-NEXT: vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload 6783; CHECK-NEXT: retq 6784 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6785 %2 = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1) 6786 ret <32 x i16> %2 6787} 6788declare <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16>, <8 x i16>) nounwind readnone 6789 6790define <32 x i16> @stack_fold_psrlwi(<32 x i16> %a0) { 6791; CHECK-LABEL: stack_fold_psrlwi: 6792; CHECK: # %bb.0: 6793; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6794; CHECK-NEXT: #APP 6795; CHECK-NEXT: nop 6796; CHECK-NEXT: #NO_APP 6797; CHECK-NEXT: vpsrlw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload 6798; CHECK-NEXT: retq 6799 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6800 %2 = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 1) 6801 ret <32 x i16> %2 6802} 6803declare <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16>, i32) nounwind readnone 6804 6805define <64 x i8> @stack_fold_psubb(<64 x i8> %a0, <64 x i8> %a1) { 6806; CHECK-LABEL: stack_fold_psubb: 6807; CHECK: # %bb.0: 6808; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6809; CHECK-NEXT: #APP 6810; CHECK-NEXT: nop 6811; CHECK-NEXT: #NO_APP 6812; CHECK-NEXT: vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6813; CHECK-NEXT: retq 6814 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6815 %2 = sub <64 x i8> %a0, %a1 6816 ret <64 x i8> %2 6817} 6818 6819define <16 x i32> @stack_fold_psubd(<16 x i32> %a0, <16 x i32> %a1) { 6820; CHECK-LABEL: stack_fold_psubd: 6821; CHECK: # %bb.0: 6822; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6823; CHECK-NEXT: #APP 6824; CHECK-NEXT: nop 6825; CHECK-NEXT: #NO_APP 6826; CHECK-NEXT: vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6827; CHECK-NEXT: retq 6828 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6829 %2 = sub <16 x i32> %a0, %a1 6830 ret <16 x i32> %2 6831} 6832 6833define <8 x i64> @stack_fold_psubq(<8 x i64> %a0, <8 x i64> %a1) { 6834; CHECK-LABEL: stack_fold_psubq: 6835; CHECK: # %bb.0: 6836; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6837; CHECK-NEXT: #APP 6838; CHECK-NEXT: nop 6839; CHECK-NEXT: #NO_APP 
6840; CHECK-NEXT: vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6841; CHECK-NEXT: retq 6842 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6843 %2 = sub <8 x i64> %a0, %a1 6844 ret <8 x i64> %2 6845} 6846 6847define <64 x i8> @stack_fold_psubsb(<64 x i8> %a0, <64 x i8> %a1) { 6848; CHECK-LABEL: stack_fold_psubsb: 6849; CHECK: # %bb.0: 6850; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6851; CHECK-NEXT: #APP 6852; CHECK-NEXT: nop 6853; CHECK-NEXT: #NO_APP 6854; CHECK-NEXT: vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6855; CHECK-NEXT: retq 6856 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6857 %2 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) 6858 ret <64 x i8> %2 6859} 6860 6861define <32 x i16> @stack_fold_psubsw(<32 x i16> %a0, <32 x i16> %a1) { 6862; CHECK-LABEL: stack_fold_psubsw: 6863; CHECK: # %bb.0: 6864; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6865; CHECK-NEXT: #APP 6866; CHECK-NEXT: nop 6867; CHECK-NEXT: #NO_APP 6868; CHECK-NEXT: vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6869; CHECK-NEXT: retq 6870 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6871 %2 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) 6872 ret <32 x i16> %2 6873} 6874 6875define <64 x i8> @stack_fold_psubusb(<64 x i8> %a0, <64 x i8> %a1) { 6876; CHECK-LABEL: stack_fold_psubusb: 6877; CHECK: # %bb.0: 6878; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6879; CHECK-NEXT: #APP 6880; CHECK-NEXT: nop 6881; CHECK-NEXT: #NO_APP 6882; CHECK-NEXT: vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6883; CHECK-NEXT: retq 6884 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6885 %2 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) 6886 ret <64 x i8> %2 6887} 6888 6889define <32 x i16> @stack_fold_psubusw(<32 x i16> %a0, <32 x i16> %a1) { 6890; CHECK-LABEL: stack_fold_psubusw: 6891; CHECK: # %bb.0: 6892; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6893; CHECK-NEXT: #APP 6894; CHECK-NEXT: nop 6895; CHECK-NEXT: #NO_APP 6896; CHECK-NEXT: vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6897; CHECK-NEXT: retq 6898 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6899 %2 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) 6900 ret <32 x i16> %2 6901} 6902 6903define <32 x i16> @stack_fold_psubw(<32 x i16> %a0, <32 x i16> %a1) { 6904; CHECK-LABEL: stack_fold_psubw: 6905; CHECK: # %bb.0: 6906; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6907; CHECK-NEXT: #APP 6908; CHECK-NEXT: nop 6909; CHECK-NEXT: #NO_APP 6910; CHECK-NEXT: vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6911; CHECK-NEXT: retq 6912 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6913 %2 = sub <32 x i16> %a0, %a1 6914 ret <32 x i16> %2 6915} 6916 6917define <8 x i64> @stack_fold_shufi64x2(<8 x i64> %a, <8 x i64> %b) { 6918; CHECK-LABEL: stack_fold_shufi64x2: 6919; CHECK: # %bb.0: 6920; CHECK-NEXT: pushq %rax 6921; CHECK-NEXT: .cfi_def_cfa_offset 16 6922; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6923; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6924; CHECK-NEXT: #APP 6925; CHECK-NEXT: nop 6926; CHECK-NEXT: #NO_APP 6927; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 6928; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 6929; CHECK-NEXT: # zmm0 = zmm0[0,1,4,5],mem[2,3,0,1] 6930; CHECK-NEXT: popq %rax 6931; CHECK-NEXT: .cfi_def_cfa_offset 8 6932; CHECK-NEXT: retq 6933 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6934 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9> 6935 ret <8 x i64> %2 6936} 6937 6938define <8 x i64> @stack_fold_shufi64x2_mask(<8 x i64> %a, <8 x i64> %b, i8 %mask, ptr %passthru) { 6939; CHECK-LABEL: stack_fold_shufi64x2_mask: 6940; CHECK: # %bb.0: 6941; CHECK-NEXT: pushq %rax 6942; CHECK-NEXT: .cfi_def_cfa_offset 16 6943; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6944; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6945; CHECK-NEXT: #APP 6946; CHECK-NEXT: nop 6947; CHECK-NEXT: #NO_APP 6948; CHECK-NEXT: kmovd %edi, %k1 6949; CHECK-NEXT: vmovdqa64 (%rsi), %zmm1 6950; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 6951; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload 6952; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1] 6953; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 6954; CHECK-NEXT: popq %rax 6955; CHECK-NEXT: .cfi_def_cfa_offset 8 6956; CHECK-NEXT: retq 6957 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6958 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9> 6959 %3 = bitcast i8 %mask to <8 x i1> 6960 ; load needed to keep the operation from being scheduled above the asm block 6961 %4 = load <8 x i64>, ptr %passthru 6962 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 6963 ret <8 x i64> %5 6964} 6965 6966define <8 x i64> @stack_fold_shufi64x2_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask, ptr %passthru) { 6967; CHECK-LABEL: stack_fold_shufi64x2_maskz: 6968; CHECK: # %bb.0: 6969; CHECK-NEXT: pushq %rax 6970; CHECK-NEXT: .cfi_def_cfa_offset 16 6971; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6972; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6973; CHECK-NEXT: #APP 6974; CHECK-NEXT: nop 6975; CHECK-NEXT: #NO_APP 6976; CHECK-NEXT: kmovd %edi, %k1 6977; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 6978; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 6979; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1] 6980; CHECK-NEXT: popq %rax 6981; CHECK-NEXT: .cfi_def_cfa_offset 8 6982; CHECK-NEXT: retq 6983 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 6984 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9> 6985 %3 = bitcast i8 %mask to <8 x i1> 6986 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 6987 ret <8 x i64> %4 6988} 6989 6990define <16 x i32> @stack_fold_shufi32x4_mask(<16 x i32> %a, <16 x i32> %b, i16 %mask, ptr %passthru) { 6991; CHECK-LABEL: stack_fold_shufi32x4_mask: 6992; CHECK: # %bb.0: 6993; CHECK-NEXT: pushq %rax 6994; CHECK-NEXT: .cfi_def_cfa_offset 16 6995; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6996; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 6997; CHECK-NEXT: #APP 6998; CHECK-NEXT: nop 6999; CHECK-NEXT: #NO_APP 7000; CHECK-NEXT: kmovd %edi, %k1 7001; CHECK-NEXT: vmovdqa64 (%rsi), %zmm1 7002; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 7003; CHECK-NEXT: vshufi32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload 7004; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3] 7005; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 7006; CHECK-NEXT: popq %rax 7007; CHECK-NEXT: .cfi_def_cfa_offset 8 7008; CHECK-NEXT: retq 7009 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7010 %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19> 7011 %3 
= bitcast i16 %mask to <16 x i1> 7012 ; load needed to keep the operation from being scheduled above the asm block 7013 %4 = load <16 x i32>, ptr %passthru 7014 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 7015 ret <16 x i32> %5 7016} 7017 7018define <16 x i32> @stack_fold_shufi32x4_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) { 7019; CHECK-LABEL: stack_fold_shufi32x4_maskz: 7020; CHECK: # %bb.0: 7021; CHECK-NEXT: pushq %rax 7022; CHECK-NEXT: .cfi_def_cfa_offset 16 7023; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7024; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7025; CHECK-NEXT: #APP 7026; CHECK-NEXT: nop 7027; CHECK-NEXT: #NO_APP 7028; CHECK-NEXT: kmovd %edi, %k1 7029; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 7030; CHECK-NEXT: vshufi32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 7031; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3] 7032; CHECK-NEXT: popq %rax 7033; CHECK-NEXT: .cfi_def_cfa_offset 8 7034; CHECK-NEXT: retq 7035 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7036 %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19> 7037 %3 = bitcast i16 %mask to <16 x i1> 7038 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 7039 ret <16 x i32> %4 7040} 7041 7042define <16 x i32> @stack_fold_ternlogd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { 7043; CHECK-LABEL: stack_fold_ternlogd: 7044; CHECK: # %bb.0: 7045; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7046; CHECK-NEXT: #APP 7047; CHECK-NEXT: nop 7048; CHECK-NEXT: #NO_APP 7049; CHECK-NEXT: vpternlogd $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 7050; CHECK-NEXT: # zmm0 = ~(zmm1 | (zmm0 ^ mem)) 7051; CHECK-NEXT: retq 7052 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7053 %2 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33) 7054 ret <16 x i32> %2 7055} 7056declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32) 7057 7058define <8 x i64> @stack_fold_ternlogq(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { 7059; CHECK-LABEL: stack_fold_ternlogq: 7060; CHECK: # %bb.0: 7061; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7062; CHECK-NEXT: #APP 7063; CHECK-NEXT: nop 7064; CHECK-NEXT: #NO_APP 7065; CHECK-NEXT: vpternlogq $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload 7066; CHECK-NEXT: # zmm0 = ~(zmm1 | (zmm0 ^ mem)) 7067; CHECK-NEXT: retq 7068 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7069 %2 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33) 7070 ret <8 x i64> %2 7071} 7072 7073declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32) 7074 7075define <64 x i8> @stack_fold_punpckhbw_zmm(<64 x i8> %a0, <64 x i8> %a1) { 7076; CHECK-LABEL: stack_fold_punpckhbw_zmm: 7077; CHECK: # %bb.0: 7078; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7079; CHECK-NEXT: #APP 7080; CHECK-NEXT: nop 7081; CHECK-NEXT: #NO_APP 7082; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 7083; CHECK-NEXT: # zmm0 = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63] 7084; CHECK-NEXT: retq 7085 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7086 %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127> 7087 ret <64 x i8> %2 7088} 7089 7090define <64 x i8> @stack_fold_punpckhbw_mask_zmm(ptr %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 7091; CHECK-LABEL: stack_fold_punpckhbw_mask_zmm: 7092; CHECK: # %bb.0: 7093; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7094; CHECK-NEXT: #APP 7095; CHECK-NEXT: nop 7096; CHECK-NEXT: #NO_APP 7097; CHECK-NEXT: kmovq %rsi, %k1 7098; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 7099; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload 7100; CHECK-NEXT: # zmm2 {%k1} = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63] 7101; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 
7102; CHECK-NEXT: retq 7103 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7104 %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127> 7105 %3 = bitcast i64 %mask to <64 x i1> 7106 ; load needed to keep the operation from being scheduled above the asm block 7107 %4 = load <64 x i8>, ptr %passthru 7108 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 7109 ret <64 x i8> %5 7110} 7111 7112define <64 x i8> @stack_fold_punpckhbw_maskz_zmm(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { 7113; CHECK-LABEL: stack_fold_punpckhbw_maskz_zmm: 7114; CHECK: # %bb.0: 7115; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7116; CHECK-NEXT: #APP 7117; CHECK-NEXT: nop 7118; CHECK-NEXT: #NO_APP 7119; CHECK-NEXT: kmovq %rdi, %k1 7120; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 7121; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63] 7122; CHECK-NEXT: retq 7123 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7124 %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127> 7125 %3 = bitcast i64 %mask to <64 x i1> 7126 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer 7127 ret <64 x i8> %4 7128} 7129 7130define <16 x i32> @stack_fold_pxord(<16 x i32> %a0, <16 x i32> %a1) { 7131; CHECK-LABEL: stack_fold_pxord: 7132; CHECK: # %bb.0: 7133; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7134; CHECK-NEXT: #APP 7135; 
CHECK-NEXT: nop 7136; CHECK-NEXT: #NO_APP 7137; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 7138; CHECK-NEXT: retq 7139 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7140 %2 = xor <16 x i32> %a0, %a1 7141 ret <16 x i32> %2 7142} 7143 7144define <16 x i32> @stack_fold_pxord_commuted(<16 x i32> %a0, <16 x i32> %a1) { 7145; CHECK-LABEL: stack_fold_pxord_commuted: 7146; CHECK: # %bb.0: 7147; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7148; CHECK-NEXT: #APP 7149; CHECK-NEXT: nop 7150; CHECK-NEXT: #NO_APP 7151; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 7152; CHECK-NEXT: retq 7153 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7154 %2 = xor <16 x i32> %a1, %a0 7155 ret <16 x i32> %2 7156} 7157 7158define <16 x i32> @stack_fold_pxord_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) { 7159; CHECK-LABEL: stack_fold_pxord_mask: 7160; CHECK: # %bb.0: 7161; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7162; CHECK-NEXT: vmovaps %zmm0, %zmm1 7163; CHECK-NEXT: #APP 7164; CHECK-NEXT: nop 7165; CHECK-NEXT: #NO_APP 7166; CHECK-NEXT: kmovd %esi, %k1 7167; CHECK-NEXT: vmovaps (%rdi), %zmm0 7168; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 7169; CHECK-NEXT: retq 7170 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7171 %2 = xor <16 x i32> %a0, %a1 7172 %3 = bitcast i16 %mask to <16 x i1> 7173 ; load needed to keep the operation from being scheduled above the asm block 7174 %4 = load <16 x i32>, ptr %a2 7175 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 7176 ret <16 x i32> %5 7177} 7178 7179define <16 x i32> @stack_fold_pxord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) { 7180; CHECK-LABEL: stack_fold_pxord_mask_commuted: 7181; CHECK: # %bb.0: 7182; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7183; CHECK-NEXT: vmovaps %zmm0, %zmm1 7184; CHECK-NEXT: #APP 7185; CHECK-NEXT: nop 7186; CHECK-NEXT: #NO_APP 7187; CHECK-NEXT: kmovd %esi, %k1 7188; CHECK-NEXT: vmovaps (%rdi), %zmm0 7189; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 7190; CHECK-NEXT: retq 7191 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7192 %2 = xor <16 x i32> %a1, %a0 7193 %3 = bitcast i16 %mask to <16 x i1> 7194 ; load needed to keep the operation from being scheduled above the asm block
7195 %4 = load <16 x i32>, ptr %a2 7196 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 7197 ret <16 x i32> %5 7198} 7199 7200define <16 x i32> @stack_fold_pxord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 7201; CHECK-LABEL: stack_fold_pxord_maskz: 7202; CHECK: # %bb.0: 7203; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7204; CHECK-NEXT: #APP 7205; CHECK-NEXT: nop 7206; CHECK-NEXT: #NO_APP 7207; CHECK-NEXT: kmovd %edi, %k1 7208; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 7209; CHECK-NEXT: retq 7210 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7211 %2 = xor <16 x i32> %a0, %a1 7212 %3 = bitcast i16 %mask to <16 x i1> 7213 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 7214 ret <16 x i32> %4 7215} 7216 7217define <16 x i32> @stack_fold_pxord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { 7218; CHECK-LABEL: stack_fold_pxord_maskz_commuted: 7219; CHECK: # %bb.0: 7220; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7221; CHECK-NEXT: #APP 7222; CHECK-NEXT: nop 7223; CHECK-NEXT: #NO_APP 7224; CHECK-NEXT: kmovd %edi, %k1 7225; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 7226; CHECK-NEXT: retq 7227 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7228 %2 = xor <16 x i32> %a1, %a0 7229 %3 = bitcast i16 %mask to <16 x i1> 7230 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer 7231 ret <16 x i32> %4 7232} 7233 7234define <8 x i64> @stack_fold_pxorq(<8 x i64> %a0, <8 x i64> %a1) { 7235; CHECK-LABEL: stack_fold_pxorq: 7236; CHECK: # %bb.0: 7237; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7238; CHECK-NEXT: #APP 7239; CHECK-NEXT: nop 7240; CHECK-NEXT: #NO_APP 7241; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 7242; CHECK-NEXT: retq 7243 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7244 %2 = xor <8 x i64> %a0, %a1 7245 ret <8 x i64> %2 7246} 7247 7248define <8 x i64> @stack_fold_pxorq_commuted(<8 x i64> %a0, <8 x i64> %a1) { 7249; CHECK-LABEL: stack_fold_pxorq_commuted: 7250; CHECK: # %bb.0: 7251; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7252; CHECK-NEXT: #APP 7253; CHECK-NEXT: nop 7254; CHECK-NEXT: #NO_APP 7255; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload 7256; CHECK-NEXT: retq 7257 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7258 %2 = xor <8 x i64> %a1, %a0 7259 ret <8 x i64> %2 7260} 7261 7262define <8 x i64> @stack_fold_pxorq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) { 7263; CHECK-LABEL: stack_fold_pxorq_mask: 7264; CHECK: # %bb.0: 7265; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7266; CHECK-NEXT: vmovapd %zmm0, %zmm1 7267; CHECK-NEXT: #APP 7268; CHECK-NEXT: nop 7269; CHECK-NEXT: #NO_APP 7270; CHECK-NEXT: kmovd %esi, %k1 7271; CHECK-NEXT: vmovapd (%rdi), %zmm0 7272; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 7273; CHECK-NEXT: retq 7274 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7275 %2 = xor <8 x i64> %a0, %a1 7276 %3 = bitcast i8 %mask to <8 x i1> 7277 ; load needed to keep the operation from being scheduled about the asm block 7278 %4 = load <8 x i64>, ptr %a2 7279 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 7280 ret <8 x i64> %5 7281} 7282 7283define <8 x i64> @stack_fold_pxorq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) { 7284; CHECK-LABEL: stack_fold_pxorq_mask_commuted: 7285; CHECK: # %bb.0: 7286; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7287; CHECK-NEXT: vmovapd %zmm0, %zmm1 7288; CHECK-NEXT: #APP 7289; CHECK-NEXT: nop 7290; CHECK-NEXT: #NO_APP 7291; CHECK-NEXT: kmovd %esi, %k1 7292; CHECK-NEXT: vmovapd (%rdi), %zmm0 7293; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload 7294; CHECK-NEXT: retq 7295 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7296 %2 = xor <8 x i64> %a1, %a0 7297 %3 = bitcast i8 %mask to <8 x i1> 7298 ; load needed to keep the operation from being scheduled about the asm block 7299 %4 = load <8 x i64>, ptr %a2 7300 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4 7301 ret <8 x i64> %5 7302} 7303 7304define <8 x i64> @stack_fold_pxorq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 7305; CHECK-LABEL: stack_fold_pxorq_maskz: 7306; CHECK: # %bb.0: 7307; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7308; CHECK-NEXT: #APP 7309; CHECK-NEXT: nop 7310; CHECK-NEXT: #NO_APP 7311; CHECK-NEXT: kmovd %edi, %k1 7312; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 7313; CHECK-NEXT: retq 7314 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7315 %2 = xor <8 x i64> %a0, %a1 7316 %3 = bitcast i8 %mask to <8 x i1> 7317 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> 
zeroinitializer 7318 ret <8 x i64> %4 7319} 7320 7321define <8 x i64> @stack_fold_pxorq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { 7322; CHECK-LABEL: stack_fold_pxorq_maskz_commuted: 7323; CHECK: # %bb.0: 7324; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7325; CHECK-NEXT: #APP 7326; CHECK-NEXT: nop 7327; CHECK-NEXT: #NO_APP 7328; CHECK-NEXT: kmovd %edi, %k1 7329; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload 7330; CHECK-NEXT: retq 7331 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 7332 %2 = xor <8 x i64> %a1, %a0 7333 %3 = bitcast i8 %mask to <8 x i1> 7334 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 7335 ret <8 x i64> %4 7336} 7337 7338declare <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>) 7339declare <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16>, <32 x i16>) 7340declare <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>) 7341declare <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16>, <32 x i16>) 7342declare <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32>) 7343declare <8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64>) 7344declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1) 7345declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>, i1) 7346declare <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>) 7347declare <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16>, <32 x i16>) 7348declare <64 x i8> @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>) 7349declare <32 x i16> @llvm.usub.sat.v32i16(<32 x i16>, <32 x i16>) 7350
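; Note on the vpternlogd/vpternlogq tests above: the 8-bit immediate is a
; 3-input truth table. For each bit position, bit (a*4 + b*2 + c) of the
; immediate gives the result, where a, b and c are the corresponding bits of
; the first (destination), second and third operands. Immediate 33
; (0b00100001) sets only table entries 0 (a=0,b=0,c=0) and 5 (a=1,b=0,c=1),
; which reduces to ~(b | (a ^ c)); with a = zmm0, b = zmm1 and c = the folded
; memory operand, this is the "~(zmm1 | (zmm0 ^ mem))" form shown in the
; CHECK comments.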