; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-mullq -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE
; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-mullq -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE

; With +false-deps-mullq (ENABLE), a dependency-breaking zeroing idiom
; (vxorps/vpxor) is expected before each VPMULLQ whose destination register
; would otherwise carry a false dependency; with -false-deps-mullq (DISABLE),
; no such instruction should be emitted.

define <2 x i64> @pmullq_128(<2 x i64> %a0, <2 x i64> %a1) {
; ENABLE-LABEL: pmullq_128:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vpmullq %xmm2, %xmm0, %xmm1
; ENABLE-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; ENABLE-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_128:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; DISABLE-NEXT:    vpmullq %xmm2, %xmm0, %xmm1
; DISABLE-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; DISABLE-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
  %3 = add <2 x i64> %a0, %a1
  %res = add <2 x i64> %2, %3
  ret <2 x i64> %res
}

define <2 x i64> @pmullq_mem_128(<2 x i64> %a0, ptr %p1) {
; ENABLE-LABEL: pmullq_mem_128:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vpmullq (%rdi), %xmm0, %xmm1
; ENABLE-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_mem_128:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vpmullq (%rdi), %xmm0, %xmm1
; DISABLE-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <2 x i64>, ptr %p1, align 64
  %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
  %res = add <2 x i64> %2, %a0
  ret <2 x i64> %res
}

define <2 x i64> @pmullq_broadcast_128(<2 x i64> %a0, ptr %p1) {
; ENABLE-LABEL: pmullq_broadcast_128:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vpmullq (%rdi){1to2}, %xmm0, %xmm1
; ENABLE-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_broadcast_128:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vpmullq (%rdi){1to2}, %xmm0, %xmm1
; DISABLE-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load i64, ptr %p1, align 4
  %t0 = insertelement <2 x i64> undef, i64 %v1, i64 0
  %a1 = shufflevector <2 x i64> %t0, <2 x i64> undef, <2 x i32> zeroinitializer
  %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
  %res = add <2 x i64> %2, %a0
  ret <2 x i64> %res
}

define <2 x i64> @pmullq_maskz_128(<2 x i64> %a0, <2 x i64> %a1, ptr %pmask) {
; ENABLE-LABEL: pmullq_maskz_128:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vpmullq %xmm1, %xmm0, %xmm2
; ENABLE-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovb (%rdi), %k1
; ENABLE-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; ENABLE-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} # 16-byte Folded Reload
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_maskz_128:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vpmullq %xmm1, %xmm0, %xmm2
; DISABLE-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovb (%rdi), %k1
; DISABLE-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; DISABLE-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} # 16-byte Folded Reload
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %mask = load i8, ptr %pmask
  %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> zeroinitializer, i8 %mask)
  %3 = add <2 x i64> %a0, %a1
  %res = add <2 x i64> %2, %3
  ret <2 x i64> %res
}

declare <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)

define <4 x i64> @pmullq_256(<4 x i64> %a0, <4 x i64> %a1) {
; ENABLE-LABEL: pmullq_256:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vpmullq %ymm2, %ymm0, %ymm1
; ENABLE-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; ENABLE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_256:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; DISABLE-NEXT:    vpmullq %ymm2, %ymm0, %ymm1
; DISABLE-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; DISABLE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
  %3 = add <4 x i64> %a0, %a1
  %res = add <4 x i64> %2, %3
  ret <4 x i64> %res
}

define <4 x i64> @pmullq_mem_256(<4 x i64> %a0, ptr %p1) {
; ENABLE-LABEL: pmullq_mem_256:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vpmullq (%rdi), %ymm0, %ymm1
; ENABLE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_mem_256:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vpmullq (%rdi), %ymm0, %ymm1
; DISABLE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <4 x i64>, ptr %p1, align 64
  %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
  %res = add <4 x i64> %2, %a0
  ret <4 x i64> %res
}

define <4 x i64> @pmullq_broadcast_256(<4 x i64> %a0, ptr %p1) {
; ENABLE-LABEL: pmullq_broadcast_256:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vpmullq (%rdi){1to4}, %ymm0, %ymm1
; ENABLE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_broadcast_256:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vpmullq (%rdi){1to4}, %ymm0, %ymm1
; DISABLE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load i64, ptr %p1, align 4
  %t0 = insertelement <4 x i64> undef, i64 %v1, i64 0
  %a1 = shufflevector <4 x i64> %t0, <4 x i64> undef, <4 x i32> zeroinitializer
  %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
  %res = add <4 x i64> %2, %a0
  ret <4 x i64> %res
}

define <4 x i64> @pmullq_maskz_256(<4 x i64> %a0, <4 x i64> %a1, ptr %pmask) {
; ENABLE-LABEL: pmullq_maskz_256:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vpmullq %ymm1, %ymm0, %ymm2
; ENABLE-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovb (%rdi), %k1
; ENABLE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; ENABLE-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} # 32-byte Folded Reload
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_maskz_256:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vpmullq %ymm1, %ymm0, %ymm2
; DISABLE-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovb (%rdi), %k1
; DISABLE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; DISABLE-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} # 32-byte Folded Reload
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %mask = load i8, ptr %pmask
  %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> zeroinitializer, i8 %mask)
  %3 = add <4 x i64> %a0, %a1
  %res = add <4 x i64> %2, %3
  ret <4 x i64> %res
}

declare <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)

define <8 x i64> @pmullq_512(<8 x i64> %a0, <8 x i64> %a1) {
; ENABLE-LABEL: pmullq_512:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vpmullq %zmm2, %zmm0, %zmm1
; ENABLE-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; ENABLE-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_512:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; DISABLE-NEXT:    vpmullq %zmm2, %zmm0, %zmm1
; DISABLE-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; DISABLE-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> undef, i8 -1)
  %3 = add <8 x i64> %a0, %a1
  %res = add <8 x i64> %2, %3
  ret <8 x i64> %res
}

define <8 x i64> @pmullq_mem_512(<8 x i64> %a0, ptr %p1) {
; ENABLE-LABEL: pmullq_mem_512:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vpmullq (%rdi), %zmm0, %zmm1
; ENABLE-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_mem_512:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vpmullq (%rdi), %zmm0, %zmm1
; DISABLE-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <8 x i64>, ptr %p1, align 64
  %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> undef, i8 -1)
  %res = add <8 x i64> %2, %a0
  ret <8 x i64> %res
}

define <8 x i64> @pmullq_broadcast_512(<8 x i64> %a0, ptr %p1) {
; ENABLE-LABEL: pmullq_broadcast_512:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vpmullq (%rdi){1to8}, %zmm0, %zmm1
; ENABLE-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_broadcast_512:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vpmullq (%rdi){1to8}, %zmm0, %zmm1
; DISABLE-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load i64, ptr %p1, align 4
  %t0 = insertelement <8 x i64> undef, i64 %v1, i64 0
  %a1 = shufflevector <8 x i64> %t0, <8 x i64> undef, <8 x i32> zeroinitializer
  %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> undef, i8 -1)
  %res = add <8 x i64> %2, %a0
  ret <8 x i64> %res
}

define <8 x i64> @pmullq_maskz_512(<8 x i64> %a0, <8 x i64> %a1, ptr %pmask) {
; ENABLE-LABEL: pmullq_maskz_512:
; ENABLE:       # %bb.0:
; ENABLE-NEXT:    vpmullq %zmm1, %zmm0, %zmm2
; ENABLE-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #APP
; ENABLE-NEXT:    nop
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovb (%rdi), %k1
; ENABLE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; ENABLE-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
; ENABLE-NEXT:    retq
;
; DISABLE-LABEL: pmullq_maskz_512:
; DISABLE:       # %bb.0:
; DISABLE-NEXT:    vpmullq %zmm1, %zmm0, %zmm2
; DISABLE-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #APP
; DISABLE-NEXT:    nop
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovb (%rdi), %k1
; DISABLE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; DISABLE-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
; DISABLE-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %mask = load i8, ptr %pmask
  %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  %3 = add <8 x i64> %a0, %a1
  %res = add <8 x i64> %2, %3
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)