; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,EVEX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,-evex512 | FileCheck %s --check-prefixes=CHECK,EVEX256

; 256-bit

define <32 x i8> @vpaddb256_test(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; CHECK-LABEL: vpaddb256_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %x = add <32 x i8> %i, %j
  ret <32 x i8> %x
}

define <32 x i8> @vpaddb256_fold_test(<32 x i8> %i, ptr %j) nounwind {
; CHECK-LABEL: vpaddb256_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb (%rdi), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %tmp = load <32 x i8>, ptr %j, align 4
  %x = add <32 x i8> %i, %tmp
  ret <32 x i8> %x
}

define <16 x i16> @vpaddw256_test(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; CHECK-LABEL: vpaddw256_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %x = add <16 x i16> %i, %j
  ret <16 x i16> %x
}

define <16 x i16> @vpaddw256_fold_test(<16 x i16> %i, ptr %j) nounwind {
; CHECK-LABEL: vpaddw256_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw (%rdi), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %tmp = load <16 x i16>, ptr %j, align 4
  %x = add <16 x i16> %i, %tmp
  ret <16 x i16> %x
}

define <16 x i16> @vpaddw256_mask_test(<16 x i16> %i, <16 x i16> %j, <16 x i16> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddw256_mask_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %mask = icmp ne <16 x i16> %mask1, zeroinitializer
  %x = add <16 x i16> %i, %j
  %r = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %i
  ret <16 x i16> %r
}

define <16 x i16> @vpaddw256_maskz_test(<16 x i16> %i, <16 x i16> %j, <16 x i16> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddw256_maskz_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %mask = icmp ne <16 x i16> %mask1, zeroinitializer
  %x = add <16 x i16> %i, %j
  %r = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
  ret <16 x i16> %r
}

define <16 x i16> @vpaddw256_mask_fold_test(<16 x i16> %i, ptr %j.ptr, <16 x i16> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddw256_mask_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %mask = icmp ne <16 x i16> %mask1, zeroinitializer
  %j = load <16 x i16>, ptr %j.ptr
  %x = add <16 x i16> %i, %j
  %r = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %i
  ret <16 x i16> %r
}

define <16 x i16> @vpaddw256_maskz_fold_test(<16 x i16> %i, ptr %j.ptr, <16 x i16> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddw256_maskz_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %mask = icmp ne <16 x i16> %mask1, zeroinitializer
  %j = load <16 x i16>, ptr %j.ptr
  %x = add <16 x i16> %i, %j
  %r = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
  ret <16 x i16> %r
}

define <32 x i8> @vpsubb256_test(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; CHECK-LABEL: vpsubb256_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %x = sub <32 x i8> %i, %j
  ret <32 x i8> %x
}

define <16 x i16> @vpsubw256_test(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; CHECK-LABEL: vpsubw256_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %x = sub <16 x i16> %i, %j
  ret <16 x i16> %x
}

define <16 x i16> @vpmullw256_test(<16 x i16> %i, <16 x i16> %j) {
; CHECK-LABEL: vpmullw256_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %x = mul <16 x i16> %i, %j
  ret <16 x i16> %x
}

; 128-bit

define <16 x i8> @vpaddb128_test(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
; CHECK-LABEL: vpaddb128_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %x = add <16 x i8> %i, %j
  ret <16 x i8> %x
}

define <16 x i8> @vpaddb128_fold_test(<16 x i8> %i, ptr %j) nounwind {
; CHECK-LABEL: vpaddb128_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb (%rdi), %xmm0, %xmm0
; CHECK-NEXT:    retq
  %tmp = load <16 x i8>, ptr %j, align 4
  %x = add <16 x i8> %i, %tmp
  ret <16 x i8> %x
}

define <8 x i16> @vpaddw128_test(<8 x i16> %i, <8 x i16> %j) nounwind readnone {
; CHECK-LABEL: vpaddw128_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %x = add <8 x i16> %i, %j
  ret <8 x i16> %x
}

define <8 x i16> @vpaddw128_fold_test(<8 x i16> %i, ptr %j) nounwind {
; CHECK-LABEL: vpaddw128_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw (%rdi), %xmm0, %xmm0
; CHECK-NEXT:    retq
  %tmp = load <8 x i16>, ptr %j, align 4
  %x = add <8 x i16> %i, %tmp
  ret <8 x i16> %x
}

define <8 x i16> @vpaddw128_mask_test(<8 x i16> %i, <8 x i16> %j, <8 x i16> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddw128_mask_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %mask = icmp ne <8 x i16> %mask1, zeroinitializer
  %x = add <8 x i16> %i, %j
  %r = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %i
  ret <8 x i16> %r
}

define <8 x i16> @vpaddw128_maskz_test(<8 x i16> %i, <8 x i16> %j, <8 x i16> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddw128_maskz_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %mask = icmp ne <8 x i16> %mask1, zeroinitializer
  %x = add <8 x i16> %i, %j
  %r = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer
  ret <8 x i16> %r
}

define <8 x i16> @vpaddw128_mask_fold_test(<8 x i16> %i, ptr %j.ptr, <8 x i16> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddw128_mask_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpaddw (%rdi), %xmm0, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %mask = icmp ne <8 x i16> %mask1, zeroinitializer
  %j = load <8 x i16>, ptr %j.ptr
  %x = add <8 x i16> %i, %j
  %r = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %i
  ret <8 x i16> %r
}

define <8 x i16> @vpaddw128_maskz_fold_test(<8 x i16> %i, ptr %j.ptr, <8 x i16> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddw128_maskz_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpaddw (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %mask = icmp ne <8 x i16> %mask1, zeroinitializer
  %j = load <8 x i16>, ptr %j.ptr
  %x = add <8 x i16> %i, %j
  %r = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer
  ret <8 x i16> %r
}

define <16 x i8> @vpsubb128_test(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
; CHECK-LABEL: vpsubb128_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %x = sub <16 x i8> %i, %j
  ret <16 x i8> %x
}

define <8 x i16> @vpsubw128_test(<8 x i16> %i, <8 x i16> %j) nounwind readnone {
; CHECK-LABEL: vpsubw128_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %x = sub <8 x i16> %i, %j
  ret <8 x i16> %x
}

define <8 x i16> @vpmullw128_test(<8 x i16> %i, <8 x i16> %j) {
; CHECK-LABEL: vpmullw128_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %x = mul <8 x i16> %i, %j
  ret <8 x i16> %x
}

define i16 @PR90356(<16 x i1> %a) {
; EVEX512-LABEL: PR90356:
; EVEX512:       # %bb.0:
; EVEX512-NEXT:    vpsllw $7, %xmm0, %xmm0
; EVEX512-NEXT:    vpmovb2m %xmm0, %k1
; EVEX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; EVEX512-NEXT:    movb $63, %al
; EVEX512-NEXT:    kmovd %eax, %k1
; EVEX512-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
; EVEX512-NEXT:    vptestmd %zmm0, %zmm0, %k0
; EVEX512-NEXT:    kmovd %k0, %eax
; EVEX512-NEXT:    # kill: def $ax killed $ax killed $eax
; EVEX512-NEXT:    vzeroupper
; EVEX512-NEXT:    retq
;
; EVEX256-LABEL: PR90356:
; EVEX256:       # %bb.0:
; EVEX256-NEXT:    vpsllw $7, %xmm0, %xmm0
; EVEX256-NEXT:    vpmovb2m %xmm0, %k0
; EVEX256-NEXT:    vpmovm2w %k0, %ymm0
; EVEX256-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; EVEX256-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; EVEX256-NEXT:    vpmovw2m %ymm0, %k0
; EVEX256-NEXT:    kmovd %k0, %eax
; EVEX256-NEXT:    # kill: def $ax killed $ax killed $eax
; EVEX256-NEXT:    vzeroupper
; EVEX256-NEXT:    retq
  %1 = shufflevector <16 x i1> %a, <16 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31>
  %2 = bitcast <16 x i1> %1 to i16
  ret i16 %2
}