1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4 4; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 5; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2 6; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F 7; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ 8; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW 9; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx512f,avx512bw,avx512dq,avx512vl | FileCheck %s --check-prefixes=X86-AVX512 10 11; 12; vXf64 13; 14 15define void @store_v1f64_v1i64(<1 x i64> %trigger, ptr %addr, <1 x double> %val) nounwind { 16; SSE-LABEL: store_v1f64_v1i64: 17; SSE: ## %bb.0: 18; SSE-NEXT: testq %rdi, %rdi 19; SSE-NEXT: jns LBB0_2 20; SSE-NEXT: ## %bb.1: ## %cond.store 21; SSE-NEXT: movsd %xmm0, (%rsi) 22; SSE-NEXT: LBB0_2: ## %else 23; SSE-NEXT: retq 24; 25; AVX-LABEL: store_v1f64_v1i64: 26; AVX: ## %bb.0: 27; AVX-NEXT: testq %rdi, %rdi 28; AVX-NEXT: jns LBB0_2 29; AVX-NEXT: ## %bb.1: ## %cond.store 30; AVX-NEXT: vmovsd %xmm0, (%rsi) 31; AVX-NEXT: LBB0_2: ## %else 32; AVX-NEXT: retq 33; 34; X86-AVX512-LABEL: store_v1f64_v1i64: 35; X86-AVX512: ## %bb.0: 36; X86-AVX512-NEXT: cmpl $0, {{[0-9]+}}(%esp) 37; X86-AVX512-NEXT: jns LBB0_2 38; X86-AVX512-NEXT: ## %bb.1: ## %cond.store 39; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 40; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 41; X86-AVX512-NEXT: vmovsd %xmm0, (%eax) 42; X86-AVX512-NEXT: LBB0_2: ## %else 43; X86-AVX512-NEXT: retl 44 %mask = icmp slt <1 x i64> %trigger, zeroinitializer 45 call void @llvm.masked.store.v1f64.p0(<1 x double> %val, ptr %addr, i32 4, <1 x i1> %mask) 46 ret void 47} 48 49define void @store_v2f64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x double> %val) nounwind { 50; SSE-LABEL: store_v2f64_v2i64: 51; SSE: ## %bb.0: 52; SSE-NEXT: movmskpd %xmm0, %eax 53; SSE-NEXT: testb $1, %al 54; SSE-NEXT: jne LBB1_1 55; SSE-NEXT: ## %bb.2: ## %else 56; SSE-NEXT: testb $2, %al 57; SSE-NEXT: jne LBB1_3 58; SSE-NEXT: LBB1_4: ## %else2 59; SSE-NEXT: retq 60; SSE-NEXT: LBB1_1: ## %cond.store 61; SSE-NEXT: movlps %xmm1, (%rdi) 62; SSE-NEXT: testb $2, %al 63; SSE-NEXT: je LBB1_4 64; SSE-NEXT: LBB1_3: ## %cond.store1 65; SSE-NEXT: movhps %xmm1, 8(%rdi) 66; SSE-NEXT: retq 67; 68; AVX1OR2-LABEL: store_v2f64_v2i64: 69; AVX1OR2: ## %bb.0: 70; AVX1OR2-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) 71; AVX1OR2-NEXT: retq 72; 73; AVX512F-LABEL: store_v2f64_v2i64: 74; AVX512F: ## %bb.0: 75; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 76; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 77; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 78; AVX512F-NEXT: vpcmpgtq %zmm0, %zmm2, %k0 79; AVX512F-NEXT: kshiftlw $14, %k0, %k0 80; AVX512F-NEXT: kshiftrw $14, %k0, %k1 81; AVX512F-NEXT: vmovupd %zmm1, (%rdi) {%k1} 82; AVX512F-NEXT: vzeroupper 83; AVX512F-NEXT: retq 84; 85; AVX512VLDQ-LABEL: store_v2f64_v2i64: 86; AVX512VLDQ: ## %bb.0: 87; AVX512VLDQ-NEXT: vpmovq2m %xmm0, %k1 88; AVX512VLDQ-NEXT: vmovupd %xmm1, (%rdi) {%k1} 89; AVX512VLDQ-NEXT: retq 90; 91; AVX512VLBW-LABEL: store_v2f64_v2i64: 92; AVX512VLBW: ## %bb.0: 93; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 94; AVX512VLBW-NEXT: vpcmpgtq %xmm0, %xmm2, %k1 95; AVX512VLBW-NEXT: vmovupd %xmm1, (%rdi) {%k1} 96; AVX512VLBW-NEXT: retq 97; 98; X86-AVX512-LABEL: store_v2f64_v2i64: 99; X86-AVX512: ## %bb.0: 100; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 101; X86-AVX512-NEXT: vpmovq2m %xmm0, %k1 102; X86-AVX512-NEXT: vmovupd %xmm1, (%eax) {%k1} 103; X86-AVX512-NEXT: retl 104 %mask = icmp slt <2 x i64> %trigger, zeroinitializer 105 call void @llvm.masked.store.v2f64.p0(<2 x double> %val, ptr %addr, i32 4, <2 x i1> %mask) 106 ret void 107} 108 109define void @store_v4f64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x double> %val) nounwind { 110; SSE2-LABEL: store_v4f64_v4i64: 111; SSE2: ## %bb.0: 112; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] 113; SSE2-NEXT: movmskps %xmm0, %eax 114; SSE2-NEXT: testb $1, %al 115; SSE2-NEXT: jne LBB2_1 116; SSE2-NEXT: ## %bb.2: ## %else 117; SSE2-NEXT: testb $2, %al 118; SSE2-NEXT: jne LBB2_3 119; SSE2-NEXT: LBB2_4: ## %else2 120; SSE2-NEXT: testb $4, %al 121; SSE2-NEXT: jne LBB2_5 122; SSE2-NEXT: LBB2_6: ## %else4 123; SSE2-NEXT: testb $8, %al 124; SSE2-NEXT: jne LBB2_7 125; SSE2-NEXT: LBB2_8: ## %else6 126; SSE2-NEXT: retq 127; SSE2-NEXT: LBB2_1: ## %cond.store 128; SSE2-NEXT: movlps %xmm2, (%rdi) 129; SSE2-NEXT: testb $2, %al 130; SSE2-NEXT: je LBB2_4 131; SSE2-NEXT: LBB2_3: ## %cond.store1 132; SSE2-NEXT: movhps %xmm2, 8(%rdi) 133; SSE2-NEXT: testb $4, %al 134; SSE2-NEXT: je LBB2_6 135; SSE2-NEXT: LBB2_5: ## %cond.store3 136; SSE2-NEXT: movlps %xmm3, 16(%rdi) 137; SSE2-NEXT: testb $8, %al 138; SSE2-NEXT: je LBB2_8 139; SSE2-NEXT: LBB2_7: ## %cond.store5 140; SSE2-NEXT: movhps %xmm3, 24(%rdi) 141; SSE2-NEXT: retq 142; 143; SSE4-LABEL: store_v4f64_v4i64: 144; SSE4: ## %bb.0: 145; SSE4-NEXT: packssdw %xmm1, %xmm0 146; SSE4-NEXT: movmskps %xmm0, %eax 147; SSE4-NEXT: testb $1, %al 148; SSE4-NEXT: jne LBB2_1 149; SSE4-NEXT: ## %bb.2: ## %else 150; SSE4-NEXT: testb $2, %al 151; SSE4-NEXT: jne LBB2_3 152; SSE4-NEXT: LBB2_4: ## %else2 153; SSE4-NEXT: testb $4, %al 154; SSE4-NEXT: jne LBB2_5 155; SSE4-NEXT: LBB2_6: ## %else4 156; SSE4-NEXT: testb $8, %al 157; SSE4-NEXT: jne LBB2_7 158; SSE4-NEXT: LBB2_8: ## %else6 159; SSE4-NEXT: retq 160; SSE4-NEXT: LBB2_1: ## %cond.store 161; SSE4-NEXT: movlps %xmm2, (%rdi) 162; SSE4-NEXT: testb $2, %al 163; SSE4-NEXT: je LBB2_4 164; SSE4-NEXT: LBB2_3: ## %cond.store1 165; SSE4-NEXT: movhps %xmm2, 8(%rdi) 166; SSE4-NEXT: testb $4, %al 167; SSE4-NEXT: je LBB2_6 168; SSE4-NEXT: LBB2_5: ## %cond.store3 169; SSE4-NEXT: movlps %xmm3, 16(%rdi) 170; SSE4-NEXT: testb $8, %al 171; SSE4-NEXT: je LBB2_8 172; SSE4-NEXT: LBB2_7: ## %cond.store5 173; SSE4-NEXT: movhps %xmm3, 24(%rdi) 174; SSE4-NEXT: retq 175; 176; AVX1OR2-LABEL: store_v4f64_v4i64: 177; AVX1OR2: ## %bb.0: 178; AVX1OR2-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) 179; AVX1OR2-NEXT: vzeroupper 180; AVX1OR2-NEXT: retq 181; 182; AVX512F-LABEL: store_v4f64_v4i64: 183; AVX512F: ## %bb.0: 184; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 185; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 186; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 187; AVX512F-NEXT: vpcmpgtq %zmm0, %zmm2, %k0 188; AVX512F-NEXT: kshiftlw $12, %k0, %k0 189; AVX512F-NEXT: kshiftrw $12, %k0, %k1 190; AVX512F-NEXT: vmovupd %zmm1, (%rdi) {%k1} 191; AVX512F-NEXT: vzeroupper 192; AVX512F-NEXT: retq 193; 194; AVX512VLDQ-LABEL: store_v4f64_v4i64: 195; AVX512VLDQ: ## %bb.0: 196; AVX512VLDQ-NEXT: vpmovq2m %ymm0, %k1 197; AVX512VLDQ-NEXT: vmovupd %ymm1, (%rdi) {%k1} 198; AVX512VLDQ-NEXT: vzeroupper 199; AVX512VLDQ-NEXT: retq 200; 201; AVX512VLBW-LABEL: store_v4f64_v4i64: 202; AVX512VLBW: ## %bb.0: 203; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 204; AVX512VLBW-NEXT: vpcmpgtq %ymm0, %ymm2, %k1 205; AVX512VLBW-NEXT: vmovupd %ymm1, (%rdi) {%k1} 206; AVX512VLBW-NEXT: vzeroupper 207; AVX512VLBW-NEXT: retq 208; 209; X86-AVX512-LABEL: store_v4f64_v4i64: 210; X86-AVX512: ## %bb.0: 211; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 212; X86-AVX512-NEXT: vpmovq2m %ymm0, %k1 213; X86-AVX512-NEXT: vmovupd %ymm1, (%eax) {%k1} 214; X86-AVX512-NEXT: vzeroupper 215; X86-AVX512-NEXT: retl 216 %mask = icmp slt <4 x i64> %trigger, zeroinitializer 217 call void @llvm.masked.store.v4f64.p0(<4 x double> %val, ptr %addr, i32 4, <4 x i1> %mask) 218 ret void 219} 220 221; 222; vXf32 223; 224 225define void @store_v2f32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x float> %val) nounwind { 226; SSE2-LABEL: store_v2f32_v2i32: 227; SSE2: ## %bb.0: 228; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 229; SSE2-NEXT: pxor %xmm2, %xmm2 230; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 231; SSE2-NEXT: movmskpd %xmm2, %eax 232; SSE2-NEXT: testb $1, %al 233; SSE2-NEXT: jne LBB3_1 234; SSE2-NEXT: ## %bb.2: ## %else 235; SSE2-NEXT: testb $2, %al 236; SSE2-NEXT: jne LBB3_3 237; SSE2-NEXT: LBB3_4: ## %else2 238; SSE2-NEXT: retq 239; SSE2-NEXT: LBB3_1: ## %cond.store 240; SSE2-NEXT: movss %xmm1, (%rdi) 241; SSE2-NEXT: testb $2, %al 242; SSE2-NEXT: je LBB3_4 243; SSE2-NEXT: LBB3_3: ## %cond.store1 244; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] 245; SSE2-NEXT: movss %xmm1, 4(%rdi) 246; SSE2-NEXT: retq 247; 248; SSE4-LABEL: store_v2f32_v2i32: 249; SSE4: ## %bb.0: 250; SSE4-NEXT: pxor %xmm2, %xmm2 251; SSE4-NEXT: pcmpeqd %xmm0, %xmm2 252; SSE4-NEXT: pmovsxdq %xmm2, %xmm0 253; SSE4-NEXT: movmskpd %xmm0, %eax 254; SSE4-NEXT: testb $1, %al 255; SSE4-NEXT: jne LBB3_1 256; SSE4-NEXT: ## %bb.2: ## %else 257; SSE4-NEXT: testb $2, %al 258; SSE4-NEXT: jne LBB3_3 259; SSE4-NEXT: LBB3_4: ## %else2 260; SSE4-NEXT: retq 261; SSE4-NEXT: LBB3_1: ## %cond.store 262; SSE4-NEXT: movss %xmm1, (%rdi) 263; SSE4-NEXT: testb $2, %al 264; SSE4-NEXT: je LBB3_4 265; SSE4-NEXT: LBB3_3: ## %cond.store1 266; SSE4-NEXT: extractps $1, %xmm1, 4(%rdi) 267; SSE4-NEXT: retq 268; 269; AVX1OR2-LABEL: store_v2f32_v2i32: 270; AVX1OR2: ## %bb.0: 271; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 272; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 273; AVX1OR2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 274; AVX1OR2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) 275; AVX1OR2-NEXT: retq 276; 277; AVX512F-LABEL: store_v2f32_v2i32: 278; AVX512F: ## %bb.0: 279; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 280; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 281; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 282; AVX512F-NEXT: kshiftlw $14, %k0, %k0 283; AVX512F-NEXT: kshiftrw $14, %k0, %k1 284; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1} 285; AVX512F-NEXT: vzeroupper 286; AVX512F-NEXT: retq 287; 288; AVX512VLDQ-LABEL: store_v2f32_v2i32: 289; AVX512VLDQ: ## %bb.0: 290; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0 291; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0 292; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1 293; AVX512VLDQ-NEXT: vmovups %xmm1, (%rdi) {%k1} 294; AVX512VLDQ-NEXT: retq 295; 296; AVX512VLBW-LABEL: store_v2f32_v2i32: 297; AVX512VLBW: ## %bb.0: 298; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0 299; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0 300; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1 301; AVX512VLBW-NEXT: vmovups %xmm1, (%rdi) {%k1} 302; AVX512VLBW-NEXT: retq 303; 304; X86-AVX512-LABEL: store_v2f32_v2i32: 305; X86-AVX512: ## %bb.0: 306; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 307; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0 308; X86-AVX512-NEXT: kshiftlb $6, %k0, %k0 309; X86-AVX512-NEXT: kshiftrb $6, %k0, %k1 310; X86-AVX512-NEXT: vmovups %xmm1, (%eax) {%k1} 311; X86-AVX512-NEXT: retl 312 %mask = icmp eq <2 x i32> %trigger, zeroinitializer 313 call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) 314 ret void 315} 316 317define void @store_v4f32_v4i32(<4 x float> %x, ptr %ptr, <4 x float> %y, <4 x i32> %mask) nounwind { 318; SSE2-LABEL: store_v4f32_v4i32: 319; SSE2: ## %bb.0: 320; SSE2-NEXT: movmskps %xmm2, %eax 321; SSE2-NEXT: testb $1, %al 322; SSE2-NEXT: jne LBB4_1 323; SSE2-NEXT: ## %bb.2: ## %else 324; SSE2-NEXT: testb $2, %al 325; SSE2-NEXT: jne LBB4_3 326; SSE2-NEXT: LBB4_4: ## %else2 327; SSE2-NEXT: testb $4, %al 328; SSE2-NEXT: jne LBB4_5 329; SSE2-NEXT: LBB4_6: ## %else4 330; SSE2-NEXT: testb $8, %al 331; SSE2-NEXT: jne LBB4_7 332; SSE2-NEXT: LBB4_8: ## %else6 333; SSE2-NEXT: retq 334; SSE2-NEXT: LBB4_1: ## %cond.store 335; SSE2-NEXT: movss %xmm0, (%rdi) 336; SSE2-NEXT: testb $2, %al 337; SSE2-NEXT: je LBB4_4 338; SSE2-NEXT: LBB4_3: ## %cond.store1 339; SSE2-NEXT: movaps %xmm0, %xmm1 340; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] 341; SSE2-NEXT: movss %xmm1, 4(%rdi) 342; SSE2-NEXT: testb $4, %al 343; SSE2-NEXT: je LBB4_6 344; SSE2-NEXT: LBB4_5: ## %cond.store3 345; SSE2-NEXT: movaps %xmm0, %xmm1 346; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 347; SSE2-NEXT: movss %xmm1, 8(%rdi) 348; SSE2-NEXT: testb $8, %al 349; SSE2-NEXT: je LBB4_8 350; SSE2-NEXT: LBB4_7: ## %cond.store5 351; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 352; SSE2-NEXT: movss %xmm0, 12(%rdi) 353; SSE2-NEXT: retq 354; 355; SSE4-LABEL: store_v4f32_v4i32: 356; SSE4: ## %bb.0: 357; SSE4-NEXT: movmskps %xmm2, %eax 358; SSE4-NEXT: testb $1, %al 359; SSE4-NEXT: jne LBB4_1 360; SSE4-NEXT: ## %bb.2: ## %else 361; SSE4-NEXT: testb $2, %al 362; SSE4-NEXT: jne LBB4_3 363; SSE4-NEXT: LBB4_4: ## %else2 364; SSE4-NEXT: testb $4, %al 365; SSE4-NEXT: jne LBB4_5 366; SSE4-NEXT: LBB4_6: ## %else4 367; SSE4-NEXT: testb $8, %al 368; SSE4-NEXT: jne LBB4_7 369; SSE4-NEXT: LBB4_8: ## %else6 370; SSE4-NEXT: retq 371; SSE4-NEXT: LBB4_1: ## %cond.store 372; SSE4-NEXT: movss %xmm0, (%rdi) 373; SSE4-NEXT: testb $2, %al 374; SSE4-NEXT: je LBB4_4 375; SSE4-NEXT: LBB4_3: ## %cond.store1 376; SSE4-NEXT: extractps $1, %xmm0, 4(%rdi) 377; SSE4-NEXT: testb $4, %al 378; SSE4-NEXT: je LBB4_6 379; SSE4-NEXT: LBB4_5: ## %cond.store3 380; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi) 381; SSE4-NEXT: testb $8, %al 382; SSE4-NEXT: je LBB4_8 383; SSE4-NEXT: LBB4_7: ## %cond.store5 384; SSE4-NEXT: extractps $3, %xmm0, 12(%rdi) 385; SSE4-NEXT: retq 386; 387; AVX1OR2-LABEL: store_v4f32_v4i32: 388; AVX1OR2: ## %bb.0: 389; AVX1OR2-NEXT: vmaskmovps %xmm0, %xmm2, (%rdi) 390; AVX1OR2-NEXT: retq 391; 392; AVX512F-LABEL: store_v4f32_v4i32: 393; AVX512F: ## %bb.0: 394; AVX512F-NEXT: ## kill: def $xmm2 killed $xmm2 def $zmm2 395; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 396; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 397; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm1, %k0 398; AVX512F-NEXT: kshiftlw $12, %k0, %k0 399; AVX512F-NEXT: kshiftrw $12, %k0, %k1 400; AVX512F-NEXT: vmovups %zmm0, (%rdi) {%k1} 401; AVX512F-NEXT: vzeroupper 402; AVX512F-NEXT: retq 403; 404; AVX512VLDQ-LABEL: store_v4f32_v4i32: 405; AVX512VLDQ: ## %bb.0: 406; AVX512VLDQ-NEXT: vpmovd2m %xmm2, %k1 407; AVX512VLDQ-NEXT: vmovups %xmm0, (%rdi) {%k1} 408; AVX512VLDQ-NEXT: retq 409; 410; AVX512VLBW-LABEL: store_v4f32_v4i32: 411; AVX512VLBW: ## %bb.0: 412; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 413; AVX512VLBW-NEXT: vpcmpgtd %xmm2, %xmm1, %k1 414; AVX512VLBW-NEXT: vmovups %xmm0, (%rdi) {%k1} 415; AVX512VLBW-NEXT: retq 416; 417; X86-AVX512-LABEL: store_v4f32_v4i32: 418; X86-AVX512: ## %bb.0: 419; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 420; X86-AVX512-NEXT: vpmovd2m %xmm2, %k1 421; X86-AVX512-NEXT: vmovups %xmm0, (%eax) {%k1} 422; X86-AVX512-NEXT: retl 423 %bool_mask = icmp slt <4 x i32> %mask, zeroinitializer 424 call void @llvm.masked.store.v4f32.p0(<4 x float> %x, ptr %ptr, i32 1, <4 x i1> %bool_mask) 425 ret void 426} 427 428define void @store_v8f32_v8i32(<8 x float> %x, ptr %ptr, <8 x float> %y, <8 x i32> %mask) nounwind { 429; SSE2-LABEL: store_v8f32_v8i32: 430; SSE2: ## %bb.0: 431; SSE2-NEXT: packssdw %xmm5, %xmm4 432; SSE2-NEXT: packsswb %xmm4, %xmm4 433; SSE2-NEXT: pmovmskb %xmm4, %eax 434; SSE2-NEXT: testb $1, %al 435; SSE2-NEXT: jne LBB5_1 436; SSE2-NEXT: ## %bb.2: ## %else 437; SSE2-NEXT: testb $2, %al 438; SSE2-NEXT: jne LBB5_3 439; SSE2-NEXT: LBB5_4: ## %else2 440; SSE2-NEXT: testb $4, %al 441; SSE2-NEXT: jne LBB5_5 442; SSE2-NEXT: LBB5_6: ## %else4 443; SSE2-NEXT: testb $8, %al 444; SSE2-NEXT: jne LBB5_7 445; SSE2-NEXT: LBB5_8: ## %else6 446; SSE2-NEXT: testb $16, %al 447; SSE2-NEXT: jne LBB5_9 448; SSE2-NEXT: LBB5_10: ## %else8 449; SSE2-NEXT: testb $32, %al 450; SSE2-NEXT: jne LBB5_11 451; SSE2-NEXT: LBB5_12: ## %else10 452; SSE2-NEXT: testb $64, %al 453; SSE2-NEXT: jne LBB5_13 454; SSE2-NEXT: LBB5_14: ## %else12 455; SSE2-NEXT: testb $-128, %al 456; SSE2-NEXT: jne LBB5_15 457; SSE2-NEXT: LBB5_16: ## %else14 458; SSE2-NEXT: retq 459; SSE2-NEXT: LBB5_1: ## %cond.store 460; SSE2-NEXT: movd %xmm0, (%rdi) 461; SSE2-NEXT: testb $2, %al 462; SSE2-NEXT: je LBB5_4 463; SSE2-NEXT: LBB5_3: ## %cond.store1 464; SSE2-NEXT: movdqa %xmm0, %xmm2 465; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] 466; SSE2-NEXT: movss %xmm2, 4(%rdi) 467; SSE2-NEXT: testb $4, %al 468; SSE2-NEXT: je LBB5_6 469; SSE2-NEXT: LBB5_5: ## %cond.store3 470; SSE2-NEXT: movaps %xmm0, %xmm2 471; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 472; SSE2-NEXT: movss %xmm2, 8(%rdi) 473; SSE2-NEXT: testb $8, %al 474; SSE2-NEXT: je LBB5_8 475; SSE2-NEXT: LBB5_7: ## %cond.store5 476; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 477; SSE2-NEXT: movss %xmm0, 12(%rdi) 478; SSE2-NEXT: testb $16, %al 479; SSE2-NEXT: je LBB5_10 480; SSE2-NEXT: LBB5_9: ## %cond.store7 481; SSE2-NEXT: movss %xmm1, 16(%rdi) 482; SSE2-NEXT: testb $32, %al 483; SSE2-NEXT: je LBB5_12 484; SSE2-NEXT: LBB5_11: ## %cond.store9 485; SSE2-NEXT: movaps %xmm1, %xmm0 486; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] 487; SSE2-NEXT: movss %xmm0, 20(%rdi) 488; SSE2-NEXT: testb $64, %al 489; SSE2-NEXT: je LBB5_14 490; SSE2-NEXT: LBB5_13: ## %cond.store11 491; SSE2-NEXT: movaps %xmm1, %xmm0 492; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 493; SSE2-NEXT: movss %xmm0, 24(%rdi) 494; SSE2-NEXT: testb $-128, %al 495; SSE2-NEXT: je LBB5_16 496; SSE2-NEXT: LBB5_15: ## %cond.store13 497; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 498; SSE2-NEXT: movss %xmm1, 28(%rdi) 499; SSE2-NEXT: retq 500; 501; SSE4-LABEL: store_v8f32_v8i32: 502; SSE4: ## %bb.0: 503; SSE4-NEXT: packssdw %xmm5, %xmm4 504; SSE4-NEXT: packsswb %xmm4, %xmm4 505; SSE4-NEXT: pmovmskb %xmm4, %eax 506; SSE4-NEXT: testb $1, %al 507; SSE4-NEXT: jne LBB5_1 508; SSE4-NEXT: ## %bb.2: ## %else 509; SSE4-NEXT: testb $2, %al 510; SSE4-NEXT: jne LBB5_3 511; SSE4-NEXT: LBB5_4: ## %else2 512; SSE4-NEXT: testb $4, %al 513; SSE4-NEXT: jne LBB5_5 514; SSE4-NEXT: LBB5_6: ## %else4 515; SSE4-NEXT: testb $8, %al 516; SSE4-NEXT: jne LBB5_7 517; SSE4-NEXT: LBB5_8: ## %else6 518; SSE4-NEXT: testb $16, %al 519; SSE4-NEXT: jne LBB5_9 520; SSE4-NEXT: LBB5_10: ## %else8 521; SSE4-NEXT: testb $32, %al 522; SSE4-NEXT: jne LBB5_11 523; SSE4-NEXT: LBB5_12: ## %else10 524; SSE4-NEXT: testb $64, %al 525; SSE4-NEXT: jne LBB5_13 526; SSE4-NEXT: LBB5_14: ## %else12 527; SSE4-NEXT: testb $-128, %al 528; SSE4-NEXT: jne LBB5_15 529; SSE4-NEXT: LBB5_16: ## %else14 530; SSE4-NEXT: retq 531; SSE4-NEXT: LBB5_1: ## %cond.store 532; SSE4-NEXT: movd %xmm0, (%rdi) 533; SSE4-NEXT: testb $2, %al 534; SSE4-NEXT: je LBB5_4 535; SSE4-NEXT: LBB5_3: ## %cond.store1 536; SSE4-NEXT: pextrd $1, %xmm0, 4(%rdi) 537; SSE4-NEXT: testb $4, %al 538; SSE4-NEXT: je LBB5_6 539; SSE4-NEXT: LBB5_5: ## %cond.store3 540; SSE4-NEXT: pextrd $2, %xmm0, 8(%rdi) 541; SSE4-NEXT: testb $8, %al 542; SSE4-NEXT: je LBB5_8 543; SSE4-NEXT: LBB5_7: ## %cond.store5 544; SSE4-NEXT: pextrd $3, %xmm0, 12(%rdi) 545; SSE4-NEXT: testb $16, %al 546; SSE4-NEXT: je LBB5_10 547; SSE4-NEXT: LBB5_9: ## %cond.store7 548; SSE4-NEXT: movss %xmm1, 16(%rdi) 549; SSE4-NEXT: testb $32, %al 550; SSE4-NEXT: je LBB5_12 551; SSE4-NEXT: LBB5_11: ## %cond.store9 552; SSE4-NEXT: extractps $1, %xmm1, 20(%rdi) 553; SSE4-NEXT: testb $64, %al 554; SSE4-NEXT: je LBB5_14 555; SSE4-NEXT: LBB5_13: ## %cond.store11 556; SSE4-NEXT: extractps $2, %xmm1, 24(%rdi) 557; SSE4-NEXT: testb $-128, %al 558; SSE4-NEXT: je LBB5_16 559; SSE4-NEXT: LBB5_15: ## %cond.store13 560; SSE4-NEXT: extractps $3, %xmm1, 28(%rdi) 561; SSE4-NEXT: retq 562; 563; AVX1OR2-LABEL: store_v8f32_v8i32: 564; AVX1OR2: ## %bb.0: 565; AVX1OR2-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) 566; AVX1OR2-NEXT: vzeroupper 567; AVX1OR2-NEXT: retq 568; 569; AVX512F-LABEL: store_v8f32_v8i32: 570; AVX512F: ## %bb.0: 571; AVX512F-NEXT: ## kill: def $ymm2 killed $ymm2 def $zmm2 572; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 573; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 574; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm1, %k0 575; AVX512F-NEXT: kshiftlw $8, %k0, %k0 576; AVX512F-NEXT: kshiftrw $8, %k0, %k1 577; AVX512F-NEXT: vmovups %zmm0, (%rdi) {%k1} 578; AVX512F-NEXT: vzeroupper 579; AVX512F-NEXT: retq 580; 581; AVX512VLDQ-LABEL: store_v8f32_v8i32: 582; AVX512VLDQ: ## %bb.0: 583; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k1 584; AVX512VLDQ-NEXT: vmovups %ymm0, (%rdi) {%k1} 585; AVX512VLDQ-NEXT: vzeroupper 586; AVX512VLDQ-NEXT: retq 587; 588; AVX512VLBW-LABEL: store_v8f32_v8i32: 589; AVX512VLBW: ## %bb.0: 590; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 591; AVX512VLBW-NEXT: vpcmpgtd %ymm2, %ymm1, %k1 592; AVX512VLBW-NEXT: vmovups %ymm0, (%rdi) {%k1} 593; AVX512VLBW-NEXT: vzeroupper 594; AVX512VLBW-NEXT: retq 595; 596; X86-AVX512-LABEL: store_v8f32_v8i32: 597; X86-AVX512: ## %bb.0: 598; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 599; X86-AVX512-NEXT: vpmovd2m %ymm2, %k1 600; X86-AVX512-NEXT: vmovups %ymm0, (%eax) {%k1} 601; X86-AVX512-NEXT: vzeroupper 602; X86-AVX512-NEXT: retl 603 %bool_mask = icmp slt <8 x i32> %mask, zeroinitializer 604 call void @llvm.masked.store.v8f32.p0(<8 x float> %x, ptr %ptr, i32 1, <8 x i1> %bool_mask) 605 ret void 606} 607 608define void @store_v16f32_v16i32(<16 x float> %x, ptr %ptr, <16 x float> %y, <16 x i32> %mask) nounwind { 609; SSE2-LABEL: store_v16f32_v16i32: 610; SSE2: ## %bb.0: 611; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 612; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 613; SSE2-NEXT: packssdw {{[0-9]+}}(%rsp), %xmm5 614; SSE2-NEXT: packssdw {{[0-9]+}}(%rsp), %xmm4 615; SSE2-NEXT: packsswb %xmm5, %xmm4 616; SSE2-NEXT: pmovmskb %xmm4, %eax 617; SSE2-NEXT: testb $1, %al 618; SSE2-NEXT: jne LBB6_1 619; SSE2-NEXT: ## %bb.2: ## %else 620; SSE2-NEXT: testb $2, %al 621; SSE2-NEXT: jne LBB6_3 622; SSE2-NEXT: LBB6_4: ## %else2 623; SSE2-NEXT: testb $4, %al 624; SSE2-NEXT: jne LBB6_5 625; SSE2-NEXT: LBB6_6: ## %else4 626; SSE2-NEXT: testb $8, %al 627; SSE2-NEXT: jne LBB6_7 628; SSE2-NEXT: LBB6_8: ## %else6 629; SSE2-NEXT: testb $16, %al 630; SSE2-NEXT: jne LBB6_9 631; SSE2-NEXT: LBB6_10: ## %else8 632; SSE2-NEXT: testb $32, %al 633; SSE2-NEXT: jne LBB6_11 634; SSE2-NEXT: LBB6_12: ## %else10 635; SSE2-NEXT: testb $64, %al 636; SSE2-NEXT: jne LBB6_13 637; SSE2-NEXT: LBB6_14: ## %else12 638; SSE2-NEXT: testb %al, %al 639; SSE2-NEXT: js LBB6_15 640; SSE2-NEXT: LBB6_16: ## %else14 641; SSE2-NEXT: testl $256, %eax ## imm = 0x100 642; SSE2-NEXT: jne LBB6_17 643; SSE2-NEXT: LBB6_18: ## %else16 644; SSE2-NEXT: testl $512, %eax ## imm = 0x200 645; SSE2-NEXT: jne LBB6_19 646; SSE2-NEXT: LBB6_20: ## %else18 647; SSE2-NEXT: testl $1024, %eax ## imm = 0x400 648; SSE2-NEXT: jne LBB6_21 649; SSE2-NEXT: LBB6_22: ## %else20 650; SSE2-NEXT: testl $2048, %eax ## imm = 0x800 651; SSE2-NEXT: jne LBB6_23 652; SSE2-NEXT: LBB6_24: ## %else22 653; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 654; SSE2-NEXT: jne LBB6_25 655; SSE2-NEXT: LBB6_26: ## %else24 656; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000 657; SSE2-NEXT: jne LBB6_27 658; SSE2-NEXT: LBB6_28: ## %else26 659; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000 660; SSE2-NEXT: jne LBB6_29 661; SSE2-NEXT: LBB6_30: ## %else28 662; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000 663; SSE2-NEXT: jne LBB6_31 664; SSE2-NEXT: LBB6_32: ## %else30 665; SSE2-NEXT: retq 666; SSE2-NEXT: LBB6_1: ## %cond.store 667; SSE2-NEXT: movss %xmm0, (%rdi) 668; SSE2-NEXT: testb $2, %al 669; SSE2-NEXT: je LBB6_4 670; SSE2-NEXT: LBB6_3: ## %cond.store1 671; SSE2-NEXT: movaps %xmm0, %xmm4 672; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1] 673; SSE2-NEXT: movss %xmm4, 4(%rdi) 674; SSE2-NEXT: testb $4, %al 675; SSE2-NEXT: je LBB6_6 676; SSE2-NEXT: LBB6_5: ## %cond.store3 677; SSE2-NEXT: movaps %xmm0, %xmm4 678; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] 679; SSE2-NEXT: movss %xmm4, 8(%rdi) 680; SSE2-NEXT: testb $8, %al 681; SSE2-NEXT: je LBB6_8 682; SSE2-NEXT: LBB6_7: ## %cond.store5 683; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 684; SSE2-NEXT: movss %xmm0, 12(%rdi) 685; SSE2-NEXT: testb $16, %al 686; SSE2-NEXT: je LBB6_10 687; SSE2-NEXT: LBB6_9: ## %cond.store7 688; SSE2-NEXT: movss %xmm1, 16(%rdi) 689; SSE2-NEXT: testb $32, %al 690; SSE2-NEXT: je LBB6_12 691; SSE2-NEXT: LBB6_11: ## %cond.store9 692; SSE2-NEXT: movaps %xmm1, %xmm0 693; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] 694; SSE2-NEXT: movss %xmm0, 20(%rdi) 695; SSE2-NEXT: testb $64, %al 696; SSE2-NEXT: je LBB6_14 697; SSE2-NEXT: LBB6_13: ## %cond.store11 698; SSE2-NEXT: movaps %xmm1, %xmm0 699; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 700; SSE2-NEXT: movss %xmm0, 24(%rdi) 701; SSE2-NEXT: testb %al, %al 702; SSE2-NEXT: jns LBB6_16 703; SSE2-NEXT: LBB6_15: ## %cond.store13 704; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 705; SSE2-NEXT: movss %xmm1, 28(%rdi) 706; SSE2-NEXT: testl $256, %eax ## imm = 0x100 707; SSE2-NEXT: je LBB6_18 708; SSE2-NEXT: LBB6_17: ## %cond.store15 709; SSE2-NEXT: movss %xmm2, 32(%rdi) 710; SSE2-NEXT: testl $512, %eax ## imm = 0x200 711; SSE2-NEXT: je LBB6_20 712; SSE2-NEXT: LBB6_19: ## %cond.store17 713; SSE2-NEXT: movaps %xmm2, %xmm0 714; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] 715; SSE2-NEXT: movss %xmm0, 36(%rdi) 716; SSE2-NEXT: testl $1024, %eax ## imm = 0x400 717; SSE2-NEXT: je LBB6_22 718; SSE2-NEXT: LBB6_21: ## %cond.store19 719; SSE2-NEXT: movaps %xmm2, %xmm0 720; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] 721; SSE2-NEXT: movss %xmm0, 40(%rdi) 722; SSE2-NEXT: testl $2048, %eax ## imm = 0x800 723; SSE2-NEXT: je LBB6_24 724; SSE2-NEXT: LBB6_23: ## %cond.store21 725; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 726; SSE2-NEXT: movss %xmm2, 44(%rdi) 727; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 728; SSE2-NEXT: je LBB6_26 729; SSE2-NEXT: LBB6_25: ## %cond.store23 730; SSE2-NEXT: movss %xmm3, 48(%rdi) 731; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000 732; SSE2-NEXT: je LBB6_28 733; SSE2-NEXT: LBB6_27: ## %cond.store25 734; SSE2-NEXT: movaps %xmm3, %xmm0 735; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] 736; SSE2-NEXT: movss %xmm0, 52(%rdi) 737; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000 738; SSE2-NEXT: je LBB6_30 739; SSE2-NEXT: LBB6_29: ## %cond.store27 740; SSE2-NEXT: movaps %xmm3, %xmm0 741; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] 742; SSE2-NEXT: movss %xmm0, 56(%rdi) 743; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000 744; SSE2-NEXT: je LBB6_32 745; SSE2-NEXT: LBB6_31: ## %cond.store29 746; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 747; SSE2-NEXT: movss %xmm3, 60(%rdi) 748; SSE2-NEXT: retq 749; 750; SSE4-LABEL: store_v16f32_v16i32: 751; SSE4: ## %bb.0: 752; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 753; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 754; SSE4-NEXT: packssdw {{[0-9]+}}(%rsp), %xmm5 755; SSE4-NEXT: packssdw {{[0-9]+}}(%rsp), %xmm4 756; SSE4-NEXT: packsswb %xmm5, %xmm4 757; SSE4-NEXT: pmovmskb %xmm4, %eax 758; SSE4-NEXT: testb $1, %al 759; SSE4-NEXT: jne LBB6_1 760; SSE4-NEXT: ## %bb.2: ## %else 761; SSE4-NEXT: testb $2, %al 762; SSE4-NEXT: jne LBB6_3 763; SSE4-NEXT: LBB6_4: ## %else2 764; SSE4-NEXT: testb $4, %al 765; SSE4-NEXT: jne LBB6_5 766; SSE4-NEXT: LBB6_6: ## %else4 767; SSE4-NEXT: testb $8, %al 768; SSE4-NEXT: jne LBB6_7 769; SSE4-NEXT: LBB6_8: ## %else6 770; SSE4-NEXT: testb $16, %al 771; SSE4-NEXT: jne LBB6_9 772; SSE4-NEXT: LBB6_10: ## %else8 773; SSE4-NEXT: testb $32, %al 774; SSE4-NEXT: jne LBB6_11 775; SSE4-NEXT: LBB6_12: ## %else10 776; SSE4-NEXT: testb $64, %al 777; SSE4-NEXT: jne LBB6_13 778; SSE4-NEXT: LBB6_14: ## %else12 779; SSE4-NEXT: testb %al, %al 780; SSE4-NEXT: js LBB6_15 781; SSE4-NEXT: LBB6_16: ## %else14 782; SSE4-NEXT: testl $256, %eax ## imm = 0x100 783; SSE4-NEXT: jne LBB6_17 784; SSE4-NEXT: LBB6_18: ## %else16 785; SSE4-NEXT: testl $512, %eax ## imm = 0x200 786; SSE4-NEXT: jne LBB6_19 787; SSE4-NEXT: LBB6_20: ## %else18 788; SSE4-NEXT: testl $1024, %eax ## imm = 0x400 789; SSE4-NEXT: jne LBB6_21 790; SSE4-NEXT: LBB6_22: ## %else20 791; SSE4-NEXT: testl $2048, %eax ## imm = 0x800 792; SSE4-NEXT: jne LBB6_23 793; SSE4-NEXT: LBB6_24: ## %else22 794; SSE4-NEXT: testl $4096, %eax ## imm = 0x1000 795; SSE4-NEXT: jne LBB6_25 796; SSE4-NEXT: LBB6_26: ## %else24 797; SSE4-NEXT: testl $8192, %eax ## imm = 0x2000 798; SSE4-NEXT: jne LBB6_27 799; SSE4-NEXT: LBB6_28: ## %else26 800; SSE4-NEXT: testl $16384, %eax ## imm = 0x4000 801; SSE4-NEXT: jne LBB6_29 802; SSE4-NEXT: LBB6_30: ## %else28 803; SSE4-NEXT: testl $32768, %eax ## imm = 0x8000 804; SSE4-NEXT: jne LBB6_31 805; SSE4-NEXT: LBB6_32: ## %else30 806; SSE4-NEXT: retq 807; SSE4-NEXT: LBB6_1: ## %cond.store 808; SSE4-NEXT: movss %xmm0, (%rdi) 809; SSE4-NEXT: testb $2, %al 810; SSE4-NEXT: je LBB6_4 811; SSE4-NEXT: LBB6_3: ## %cond.store1 812; SSE4-NEXT: extractps $1, %xmm0, 4(%rdi) 813; SSE4-NEXT: testb $4, %al 814; SSE4-NEXT: je LBB6_6 815; SSE4-NEXT: LBB6_5: ## %cond.store3 816; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi) 817; SSE4-NEXT: testb $8, %al 818; SSE4-NEXT: je LBB6_8 819; SSE4-NEXT: LBB6_7: ## %cond.store5 820; SSE4-NEXT: extractps $3, %xmm0, 12(%rdi) 821; SSE4-NEXT: testb $16, %al 822; SSE4-NEXT: je LBB6_10 823; SSE4-NEXT: LBB6_9: ## %cond.store7 824; SSE4-NEXT: movss %xmm1, 16(%rdi) 825; SSE4-NEXT: testb $32, %al 826; SSE4-NEXT: je LBB6_12 827; SSE4-NEXT: LBB6_11: ## %cond.store9 828; SSE4-NEXT: extractps $1, %xmm1, 20(%rdi) 829; SSE4-NEXT: testb $64, %al 830; SSE4-NEXT: je LBB6_14 831; SSE4-NEXT: LBB6_13: ## %cond.store11 832; SSE4-NEXT: extractps $2, %xmm1, 24(%rdi) 833; SSE4-NEXT: testb %al, %al 834; SSE4-NEXT: jns LBB6_16 835; SSE4-NEXT: LBB6_15: ## %cond.store13 836; SSE4-NEXT: extractps $3, %xmm1, 28(%rdi) 837; SSE4-NEXT: testl $256, %eax ## imm = 0x100 838; SSE4-NEXT: je LBB6_18 839; SSE4-NEXT: LBB6_17: ## %cond.store15 840; SSE4-NEXT: movss %xmm2, 32(%rdi) 841; SSE4-NEXT: testl $512, %eax ## imm = 0x200 842; SSE4-NEXT: je LBB6_20 843; SSE4-NEXT: LBB6_19: ## %cond.store17 844; SSE4-NEXT: extractps $1, %xmm2, 36(%rdi) 845; SSE4-NEXT: testl $1024, %eax ## imm = 0x400 846; SSE4-NEXT: je LBB6_22 847; SSE4-NEXT: LBB6_21: ## %cond.store19 848; SSE4-NEXT: extractps $2, %xmm2, 40(%rdi) 849; SSE4-NEXT: testl $2048, %eax ## imm = 0x800 850; SSE4-NEXT: je LBB6_24 851; SSE4-NEXT: LBB6_23: ## %cond.store21 852; SSE4-NEXT: extractps $3, %xmm2, 44(%rdi) 853; SSE4-NEXT: testl $4096, %eax ## imm = 0x1000 854; SSE4-NEXT: je LBB6_26 855; SSE4-NEXT: LBB6_25: ## %cond.store23 856; SSE4-NEXT: movss %xmm3, 48(%rdi) 857; SSE4-NEXT: testl $8192, %eax ## imm = 0x2000 858; SSE4-NEXT: je LBB6_28 859; SSE4-NEXT: LBB6_27: ## %cond.store25 860; SSE4-NEXT: extractps $1, %xmm3, 52(%rdi) 861; SSE4-NEXT: testl $16384, %eax ## imm = 0x4000 862; SSE4-NEXT: je LBB6_30 863; SSE4-NEXT: LBB6_29: ## %cond.store27 864; SSE4-NEXT: extractps $2, %xmm3, 56(%rdi) 865; SSE4-NEXT: testl $32768, %eax ## imm = 0x8000 866; SSE4-NEXT: je LBB6_32 867; SSE4-NEXT: LBB6_31: ## %cond.store29 868; SSE4-NEXT: extractps $3, %xmm3, 60(%rdi) 869; SSE4-NEXT: retq 870; 871; AVX1OR2-LABEL: store_v16f32_v16i32: 872; AVX1OR2: ## %bb.0: 873; AVX1OR2-NEXT: vmaskmovps %ymm1, %ymm5, 32(%rdi) 874; AVX1OR2-NEXT: vmaskmovps %ymm0, %ymm4, (%rdi) 875; AVX1OR2-NEXT: vzeroupper 876; AVX1OR2-NEXT: retq 877; 878; AVX512F-LABEL: store_v16f32_v16i32: 879; AVX512F: ## %bb.0: 880; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 881; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm1, %k1 882; AVX512F-NEXT: vmovups %zmm0, (%rdi) {%k1} 883; AVX512F-NEXT: vzeroupper 884; AVX512F-NEXT: retq 885; 886; AVX512VLDQ-LABEL: store_v16f32_v16i32: 887; AVX512VLDQ: ## %bb.0: 888; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k1 889; AVX512VLDQ-NEXT: vmovups %zmm0, (%rdi) {%k1} 890; AVX512VLDQ-NEXT: vzeroupper 891; AVX512VLDQ-NEXT: retq 892; 893; AVX512VLBW-LABEL: store_v16f32_v16i32: 894; AVX512VLBW: ## %bb.0: 895; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 896; AVX512VLBW-NEXT: vpcmpgtd %zmm2, %zmm1, %k1 897; AVX512VLBW-NEXT: vmovups %zmm0, (%rdi) {%k1} 898; AVX512VLBW-NEXT: vzeroupper 899; AVX512VLBW-NEXT: retq 900; 901; X86-AVX512-LABEL: store_v16f32_v16i32: 902; X86-AVX512: ## %bb.0: 903; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 904; X86-AVX512-NEXT: vpmovd2m %zmm2, %k1 905; X86-AVX512-NEXT: vmovups %zmm0, (%eax) {%k1} 906; X86-AVX512-NEXT: vzeroupper 907; X86-AVX512-NEXT: retl 908 %bool_mask = icmp slt <16 x i32> %mask, zeroinitializer 909 call void @llvm.masked.store.v16f32.p0(<16 x float> %x, ptr %ptr, i32 1, <16 x i1> %bool_mask) 910 ret void 911} 912 913; 914; vXi64 915; 916 917define void @store_v2i64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x i64> %val) nounwind { 918; SSE2-LABEL: store_v2i64_v2i64: 919; SSE2: ## %bb.0: 920; SSE2-NEXT: movmskpd %xmm0, %eax 921; SSE2-NEXT: testb $1, %al 922; SSE2-NEXT: jne LBB7_1 923; SSE2-NEXT: ## %bb.2: ## %else 924; SSE2-NEXT: testb $2, %al 925; SSE2-NEXT: jne LBB7_3 926; SSE2-NEXT: LBB7_4: ## %else2 927; SSE2-NEXT: retq 928; SSE2-NEXT: LBB7_1: ## %cond.store 929; SSE2-NEXT: movq %xmm1, (%rdi) 930; SSE2-NEXT: testb $2, %al 931; SSE2-NEXT: je LBB7_4 932; SSE2-NEXT: LBB7_3: ## %cond.store1 933; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 934; SSE2-NEXT: movq %xmm0, 8(%rdi) 935; SSE2-NEXT: retq 936; 937; SSE4-LABEL: store_v2i64_v2i64: 938; SSE4: ## %bb.0: 939; SSE4-NEXT: movmskpd %xmm0, %eax 940; SSE4-NEXT: testb $1, %al 941; SSE4-NEXT: jne LBB7_1 942; SSE4-NEXT: ## %bb.2: ## %else 943; SSE4-NEXT: testb $2, %al 944; SSE4-NEXT: jne LBB7_3 945; SSE4-NEXT: LBB7_4: ## %else2 946; SSE4-NEXT: retq 947; SSE4-NEXT: LBB7_1: ## %cond.store 948; SSE4-NEXT: movq %xmm1, (%rdi) 949; SSE4-NEXT: testb $2, %al 950; SSE4-NEXT: je LBB7_4 951; SSE4-NEXT: LBB7_3: ## %cond.store1 952; SSE4-NEXT: pextrq $1, %xmm1, 8(%rdi) 953; SSE4-NEXT: retq 954; 955; AVX1-LABEL: store_v2i64_v2i64: 956; AVX1: ## %bb.0: 957; AVX1-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) 958; AVX1-NEXT: retq 959; 960; AVX2-LABEL: store_v2i64_v2i64: 961; AVX2: ## %bb.0: 962; AVX2-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi) 963; AVX2-NEXT: retq 964; 965; AVX512F-LABEL: store_v2i64_v2i64: 966; AVX512F: ## %bb.0: 967; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 968; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 969; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 970; AVX512F-NEXT: vpcmpgtq %zmm0, %zmm2, %k0 971; AVX512F-NEXT: kshiftlw $14, %k0, %k0 972; AVX512F-NEXT: kshiftrw $14, %k0, %k1 973; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1} 974; AVX512F-NEXT: vzeroupper 975; AVX512F-NEXT: retq 976; 977; AVX512VLDQ-LABEL: store_v2i64_v2i64: 978; AVX512VLDQ: ## %bb.0: 979; AVX512VLDQ-NEXT: vpmovq2m %xmm0, %k1 980; AVX512VLDQ-NEXT: vmovdqu64 %xmm1, (%rdi) {%k1} 981; AVX512VLDQ-NEXT: retq 982; 983; AVX512VLBW-LABEL: store_v2i64_v2i64: 984; AVX512VLBW: ## %bb.0: 985; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 986; AVX512VLBW-NEXT: vpcmpgtq %xmm0, %xmm2, %k1 987; AVX512VLBW-NEXT: vmovdqu64 %xmm1, (%rdi) {%k1} 988; AVX512VLBW-NEXT: retq 989; 990; X86-AVX512-LABEL: store_v2i64_v2i64: 991; X86-AVX512: ## %bb.0: 992; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 993; X86-AVX512-NEXT: vpmovq2m %xmm0, %k1 994; X86-AVX512-NEXT: vmovdqu64 %xmm1, (%eax) {%k1} 995; X86-AVX512-NEXT: retl 996 %mask = icmp slt <2 x i64> %trigger, zeroinitializer 997 call void @llvm.masked.store.v2i64.p0(<2 x i64> %val, ptr %addr, i32 4, <2 x i1> %mask) 998 ret void 999} 1000 1001define void @store_v4i64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x i64> %val) nounwind { 1002; SSE2-LABEL: store_v4i64_v4i64: 1003; SSE2: ## %bb.0: 1004; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] 1005; SSE2-NEXT: movmskps %xmm0, %eax 1006; SSE2-NEXT: testb $1, %al 1007; SSE2-NEXT: jne LBB8_1 1008; SSE2-NEXT: ## %bb.2: ## %else 1009; SSE2-NEXT: testb $2, %al 1010; SSE2-NEXT: jne LBB8_3 1011; SSE2-NEXT: LBB8_4: ## %else2 1012; SSE2-NEXT: testb $4, %al 1013; SSE2-NEXT: jne LBB8_5 1014; SSE2-NEXT: LBB8_6: ## %else4 1015; SSE2-NEXT: testb $8, %al 1016; SSE2-NEXT: jne LBB8_7 1017; SSE2-NEXT: LBB8_8: ## %else6 1018; SSE2-NEXT: retq 1019; SSE2-NEXT: LBB8_1: ## %cond.store 1020; SSE2-NEXT: movq %xmm2, (%rdi) 1021; SSE2-NEXT: testb $2, %al 1022; SSE2-NEXT: je LBB8_4 1023; SSE2-NEXT: LBB8_3: ## %cond.store1 1024; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] 1025; SSE2-NEXT: movq %xmm0, 8(%rdi) 1026; SSE2-NEXT: testb $4, %al 1027; SSE2-NEXT: je LBB8_6 1028; SSE2-NEXT: LBB8_5: ## %cond.store3 1029; SSE2-NEXT: movq %xmm3, 16(%rdi) 1030; SSE2-NEXT: testb $8, %al 1031; SSE2-NEXT: je LBB8_8 1032; SSE2-NEXT: LBB8_7: ## %cond.store5 1033; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] 1034; SSE2-NEXT: movq %xmm0, 24(%rdi) 1035; SSE2-NEXT: retq 1036; 1037; SSE4-LABEL: store_v4i64_v4i64: 1038; SSE4: ## %bb.0: 1039; SSE4-NEXT: packssdw %xmm1, %xmm0 1040; SSE4-NEXT: movmskps %xmm0, %eax 1041; SSE4-NEXT: testb $1, %al 1042; SSE4-NEXT: jne LBB8_1 1043; SSE4-NEXT: ## %bb.2: ## %else 1044; SSE4-NEXT: testb $2, %al 1045; SSE4-NEXT: jne LBB8_3 1046; SSE4-NEXT: LBB8_4: ## %else2 1047; SSE4-NEXT: testb $4, %al 1048; SSE4-NEXT: jne LBB8_5 1049; SSE4-NEXT: LBB8_6: ## %else4 1050; SSE4-NEXT: testb $8, %al 1051; SSE4-NEXT: jne LBB8_7 1052; SSE4-NEXT: LBB8_8: ## %else6 1053; SSE4-NEXT: retq 1054; SSE4-NEXT: LBB8_1: ## %cond.store 1055; SSE4-NEXT: movq %xmm2, (%rdi) 1056; SSE4-NEXT: testb $2, %al 1057; SSE4-NEXT: je LBB8_4 1058; SSE4-NEXT: LBB8_3: ## %cond.store1 1059; SSE4-NEXT: pextrq $1, %xmm2, 8(%rdi) 1060; SSE4-NEXT: testb $4, %al 1061; SSE4-NEXT: je LBB8_6 1062; SSE4-NEXT: LBB8_5: ## %cond.store3 1063; SSE4-NEXT: movq %xmm3, 16(%rdi) 1064; SSE4-NEXT: testb $8, %al 1065; SSE4-NEXT: je LBB8_8 1066; SSE4-NEXT: LBB8_7: ## %cond.store5 1067; SSE4-NEXT: pextrq $1, %xmm3, 24(%rdi) 1068; SSE4-NEXT: retq 1069; 1070; AVX1-LABEL: store_v4i64_v4i64: 1071; AVX1: ## %bb.0: 1072; AVX1-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) 1073; AVX1-NEXT: vzeroupper 1074; AVX1-NEXT: retq 1075; 1076; AVX2-LABEL: store_v4i64_v4i64: 1077; AVX2: ## %bb.0: 1078; AVX2-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi) 1079; AVX2-NEXT: vzeroupper 1080; AVX2-NEXT: retq 1081; 1082; AVX512F-LABEL: store_v4i64_v4i64: 1083; AVX512F: ## %bb.0: 1084; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 1085; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 1086; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 1087; AVX512F-NEXT: vpcmpgtq %zmm0, %zmm2, %k0 1088; AVX512F-NEXT: kshiftlw $12, %k0, %k0 1089; AVX512F-NEXT: kshiftrw $12, %k0, %k1 1090; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1} 1091; AVX512F-NEXT: vzeroupper 1092; AVX512F-NEXT: retq 1093; 1094; AVX512VLDQ-LABEL: store_v4i64_v4i64: 1095; AVX512VLDQ: ## %bb.0: 1096; AVX512VLDQ-NEXT: vpmovq2m %ymm0, %k1 1097; AVX512VLDQ-NEXT: vmovdqu64 %ymm1, (%rdi) {%k1} 1098; AVX512VLDQ-NEXT: vzeroupper 1099; AVX512VLDQ-NEXT: retq 1100; 1101; AVX512VLBW-LABEL: store_v4i64_v4i64: 1102; AVX512VLBW: ## %bb.0: 1103; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 1104; AVX512VLBW-NEXT: vpcmpgtq %ymm0, %ymm2, %k1 1105; AVX512VLBW-NEXT: vmovdqu64 %ymm1, (%rdi) {%k1} 1106; AVX512VLBW-NEXT: vzeroupper 1107; AVX512VLBW-NEXT: retq 1108; 1109; X86-AVX512-LABEL: store_v4i64_v4i64: 1110; X86-AVX512: ## %bb.0: 1111; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 1112; X86-AVX512-NEXT: vpmovq2m %ymm0, %k1 1113; X86-AVX512-NEXT: vmovdqu64 %ymm1, (%eax) {%k1} 1114; X86-AVX512-NEXT: vzeroupper 1115; X86-AVX512-NEXT: retl 1116 %mask = icmp slt <4 x i64> %trigger, zeroinitializer 1117 call void @llvm.masked.store.v4i64.p0(<4 x i64> %val, ptr %addr, i32 4, <4 x i1> %mask) 1118 ret void 1119} 1120 1121; 1122; vXi32 1123; 1124 1125define void @store_v1i32_v1i32(<1 x i32> %trigger, ptr %addr, <1 x i32> %val) nounwind { 1126; SSE-LABEL: store_v1i32_v1i32: 1127; SSE: ## %bb.0: 1128; SSE-NEXT: testl %edi, %edi 1129; SSE-NEXT: jne LBB9_2 1130; SSE-NEXT: ## %bb.1: ## %cond.store 1131; SSE-NEXT: movl %edx, (%rsi) 1132; SSE-NEXT: LBB9_2: ## %else 1133; SSE-NEXT: retq 1134; 1135; AVX-LABEL: store_v1i32_v1i32: 1136; AVX: ## %bb.0: 1137; AVX-NEXT: testl %edi, %edi 1138; AVX-NEXT: jne LBB9_2 1139; AVX-NEXT: ## %bb.1: ## %cond.store 1140; AVX-NEXT: movl %edx, (%rsi) 1141; AVX-NEXT: LBB9_2: ## %else 1142; AVX-NEXT: retq 1143; 1144; X86-AVX512-LABEL: store_v1i32_v1i32: 1145; X86-AVX512: ## %bb.0: 1146; X86-AVX512-NEXT: cmpl $0, {{[0-9]+}}(%esp) 1147; X86-AVX512-NEXT: jne LBB9_2 1148; X86-AVX512-NEXT: ## %bb.1: ## %cond.store 1149; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 1150; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx 1151; X86-AVX512-NEXT: movl %eax, (%ecx) 1152; X86-AVX512-NEXT: LBB9_2: ## %else 1153; X86-AVX512-NEXT: retl 1154 %mask = icmp eq <1 x i32> %trigger, zeroinitializer 1155 call void @llvm.masked.store.v1i32.p0(<1 x i32> %val, ptr %addr, i32 4, <1 x i1> %mask) 1156 ret void 1157} 1158 1159define void @store_v2i32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) nounwind { 1160; SSE2-LABEL: store_v2i32_v2i32: 1161; SSE2: ## %bb.0: 1162; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 1163; SSE2-NEXT: pxor %xmm2, %xmm2 1164; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 1165; SSE2-NEXT: movmskpd %xmm2, %eax 1166; SSE2-NEXT: testb $1, %al 1167; SSE2-NEXT: jne LBB10_1 1168; SSE2-NEXT: ## %bb.2: ## %else 1169; SSE2-NEXT: testb $2, %al 1170; SSE2-NEXT: jne LBB10_3 1171; SSE2-NEXT: LBB10_4: ## %else2 1172; SSE2-NEXT: retq 1173; SSE2-NEXT: LBB10_1: ## %cond.store 1174; SSE2-NEXT: movd %xmm1, (%rdi) 1175; SSE2-NEXT: testb $2, %al 1176; SSE2-NEXT: je LBB10_4 1177; SSE2-NEXT: LBB10_3: ## %cond.store1 1178; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 1179; SSE2-NEXT: movd %xmm0, 4(%rdi) 1180; SSE2-NEXT: retq 1181; 1182; SSE4-LABEL: store_v2i32_v2i32: 1183; SSE4: ## %bb.0: 1184; SSE4-NEXT: pxor %xmm2, %xmm2 1185; SSE4-NEXT: pcmpeqd %xmm0, %xmm2 1186; SSE4-NEXT: pmovsxdq %xmm2, %xmm0 1187; SSE4-NEXT: movmskpd %xmm0, %eax 1188; SSE4-NEXT: testb $1, %al 1189; SSE4-NEXT: jne LBB10_1 1190; SSE4-NEXT: ## %bb.2: ## %else 1191; SSE4-NEXT: testb $2, %al 1192; SSE4-NEXT: jne LBB10_3 1193; SSE4-NEXT: LBB10_4: ## %else2 1194; SSE4-NEXT: retq 1195; SSE4-NEXT: LBB10_1: ## %cond.store 1196; SSE4-NEXT: movss %xmm1, (%rdi) 1197; SSE4-NEXT: testb $2, %al 1198; SSE4-NEXT: je LBB10_4 1199; SSE4-NEXT: LBB10_3: ## %cond.store1 1200; SSE4-NEXT: extractps $1, %xmm1, 4(%rdi) 1201; SSE4-NEXT: retq 1202; 1203; AVX1-LABEL: store_v2i32_v2i32: 1204; AVX1: ## %bb.0: 1205; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1206; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 1207; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 1208; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) 1209; AVX1-NEXT: retq 1210; 1211; AVX2-LABEL: store_v2i32_v2i32: 1212; AVX2: ## %bb.0: 1213; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1214; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 1215; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 1216; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) 1217; AVX2-NEXT: retq 1218; 1219; AVX512F-LABEL: store_v2i32_v2i32: 1220; AVX512F: ## %bb.0: 1221; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 1222; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 1223; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 1224; AVX512F-NEXT: kshiftlw $14, %k0, %k0 1225; AVX512F-NEXT: kshiftrw $14, %k0, %k1 1226; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} 1227; AVX512F-NEXT: vzeroupper 1228; AVX512F-NEXT: retq 1229; 1230; AVX512VLDQ-LABEL: store_v2i32_v2i32: 1231; AVX512VLDQ: ## %bb.0: 1232; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0 1233; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0 1234; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1 1235; AVX512VLDQ-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} 1236; AVX512VLDQ-NEXT: retq 1237; 1238; AVX512VLBW-LABEL: store_v2i32_v2i32: 1239; AVX512VLBW: ## %bb.0: 1240; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0 1241; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0 1242; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1 1243; AVX512VLBW-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} 1244; AVX512VLBW-NEXT: retq 1245; 1246; X86-AVX512-LABEL: store_v2i32_v2i32: 1247; X86-AVX512: ## %bb.0: 1248; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 1249; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0 1250; X86-AVX512-NEXT: kshiftlb $6, %k0, %k0 1251; X86-AVX512-NEXT: kshiftrb $6, %k0, %k1 1252; X86-AVX512-NEXT: vmovdqu32 %xmm1, (%eax) {%k1} 1253; X86-AVX512-NEXT: retl 1254 %mask = icmp eq <2 x i32> %trigger, zeroinitializer 1255 call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) 1256 ret void 1257} 1258 1259define void @store_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) nounwind { 1260; SSE2-LABEL: store_v4i32_v4i32: 1261; SSE2: ## %bb.0: 1262; SSE2-NEXT: pxor %xmm2, %xmm2 1263; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 1264; SSE2-NEXT: movmskps %xmm2, %eax 1265; SSE2-NEXT: testb $1, %al 1266; SSE2-NEXT: jne LBB11_1 1267; SSE2-NEXT: ## %bb.2: ## %else 1268; SSE2-NEXT: testb $2, %al 1269; SSE2-NEXT: jne LBB11_3 1270; SSE2-NEXT: LBB11_4: ## %else2 1271; SSE2-NEXT: testb $4, %al 1272; SSE2-NEXT: jne LBB11_5 1273; SSE2-NEXT: LBB11_6: ## %else4 1274; SSE2-NEXT: testb $8, %al 1275; SSE2-NEXT: jne LBB11_7 1276; SSE2-NEXT: LBB11_8: ## %else6 1277; SSE2-NEXT: retq 1278; SSE2-NEXT: LBB11_1: ## %cond.store 1279; SSE2-NEXT: movd %xmm1, (%rdi) 1280; SSE2-NEXT: testb $2, %al 1281; SSE2-NEXT: je LBB11_4 1282; SSE2-NEXT: LBB11_3: ## %cond.store1 1283; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 1284; SSE2-NEXT: movd %xmm0, 4(%rdi) 1285; SSE2-NEXT: testb $4, %al 1286; SSE2-NEXT: je LBB11_6 1287; SSE2-NEXT: LBB11_5: ## %cond.store3 1288; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 1289; SSE2-NEXT: movd %xmm0, 8(%rdi) 1290; SSE2-NEXT: testb $8, %al 1291; SSE2-NEXT: je LBB11_8 1292; SSE2-NEXT: LBB11_7: ## %cond.store5 1293; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] 1294; SSE2-NEXT: movd %xmm0, 12(%rdi) 1295; SSE2-NEXT: retq 1296; 1297; SSE4-LABEL: store_v4i32_v4i32: 1298; SSE4: ## %bb.0: 1299; SSE4-NEXT: pxor %xmm2, %xmm2 1300; SSE4-NEXT: pcmpeqd %xmm0, %xmm2 1301; SSE4-NEXT: movmskps %xmm2, %eax 1302; SSE4-NEXT: testb $1, %al 1303; SSE4-NEXT: jne LBB11_1 1304; SSE4-NEXT: ## %bb.2: ## %else 1305; SSE4-NEXT: testb $2, %al 1306; SSE4-NEXT: jne LBB11_3 1307; SSE4-NEXT: LBB11_4: ## %else2 1308; SSE4-NEXT: testb $4, %al 1309; SSE4-NEXT: jne LBB11_5 1310; SSE4-NEXT: LBB11_6: ## %else4 1311; SSE4-NEXT: testb $8, %al 1312; SSE4-NEXT: jne LBB11_7 1313; SSE4-NEXT: LBB11_8: ## %else6 1314; SSE4-NEXT: retq 1315; SSE4-NEXT: LBB11_1: ## %cond.store 1316; SSE4-NEXT: movss %xmm1, (%rdi) 1317; SSE4-NEXT: testb $2, %al 1318; SSE4-NEXT: je LBB11_4 1319; SSE4-NEXT: LBB11_3: ## %cond.store1 1320; SSE4-NEXT: extractps $1, %xmm1, 4(%rdi) 1321; SSE4-NEXT: testb $4, %al 1322; SSE4-NEXT: je LBB11_6 1323; SSE4-NEXT: LBB11_5: ## %cond.store3 1324; SSE4-NEXT: extractps $2, %xmm1, 8(%rdi) 1325; SSE4-NEXT: testb $8, %al 1326; SSE4-NEXT: je LBB11_8 1327; SSE4-NEXT: LBB11_7: ## %cond.store5 1328; SSE4-NEXT: extractps $3, %xmm1, 12(%rdi) 1329; SSE4-NEXT: retq 1330; 1331; AVX1-LABEL: store_v4i32_v4i32: 1332; AVX1: ## %bb.0: 1333; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1334; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 1335; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) 1336; AVX1-NEXT: retq 1337; 1338; AVX2-LABEL: store_v4i32_v4i32: 1339; AVX2: ## %bb.0: 1340; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1341; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 1342; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) 1343; AVX2-NEXT: retq 1344; 1345; AVX512F-LABEL: store_v4i32_v4i32: 1346; AVX512F: ## %bb.0: 1347; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 1348; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 1349; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 1350; AVX512F-NEXT: kshiftlw $12, %k0, %k0 1351; AVX512F-NEXT: kshiftrw $12, %k0, %k1 1352; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} 1353; AVX512F-NEXT: vzeroupper 1354; AVX512F-NEXT: retq 1355; 1356; AVX512VL-LABEL: store_v4i32_v4i32: 1357; AVX512VL: ## %bb.0: 1358; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1 1359; AVX512VL-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} 1360; AVX512VL-NEXT: retq 1361; 1362; X86-AVX512-LABEL: store_v4i32_v4i32: 1363; X86-AVX512: ## %bb.0: 1364; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 1365; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 1366; X86-AVX512-NEXT: vmovdqu32 %xmm1, (%eax) {%k1} 1367; X86-AVX512-NEXT: retl 1368 %mask = icmp eq <4 x i32> %trigger, zeroinitializer 1369 call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) 1370 ret void 1371} 1372 1373define void @store_v8i32_v8i32(<8 x i32> %trigger, ptr %addr, <8 x i32> %val) nounwind { 1374; SSE2-LABEL: store_v8i32_v8i32: 1375; SSE2: ## %bb.0: 1376; SSE2-NEXT: pxor %xmm4, %xmm4 1377; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 1378; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 1379; SSE2-NEXT: packssdw %xmm1, %xmm0 1380; SSE2-NEXT: packsswb %xmm0, %xmm0 1381; SSE2-NEXT: pmovmskb %xmm0, %eax 1382; SSE2-NEXT: testb $1, %al 1383; SSE2-NEXT: jne LBB12_1 1384; SSE2-NEXT: ## %bb.2: ## %else 1385; SSE2-NEXT: testb $2, %al 1386; SSE2-NEXT: jne LBB12_3 1387; SSE2-NEXT: LBB12_4: ## %else2 1388; SSE2-NEXT: testb $4, %al 1389; SSE2-NEXT: jne LBB12_5 1390; SSE2-NEXT: LBB12_6: ## %else4 1391; SSE2-NEXT: testb $8, %al 1392; SSE2-NEXT: jne LBB12_7 1393; SSE2-NEXT: LBB12_8: ## %else6 1394; SSE2-NEXT: testb $16, %al 1395; SSE2-NEXT: jne LBB12_9 1396; SSE2-NEXT: LBB12_10: ## %else8 1397; SSE2-NEXT: testb $32, %al 1398; SSE2-NEXT: jne LBB12_11 1399; SSE2-NEXT: LBB12_12: ## %else10 1400; SSE2-NEXT: testb $64, %al 1401; SSE2-NEXT: jne LBB12_13 1402; SSE2-NEXT: LBB12_14: ## %else12 1403; SSE2-NEXT: testb $-128, %al 1404; SSE2-NEXT: jne LBB12_15 1405; SSE2-NEXT: LBB12_16: ## %else14 1406; SSE2-NEXT: retq 1407; SSE2-NEXT: LBB12_1: ## %cond.store 1408; SSE2-NEXT: movd %xmm2, (%rdi) 1409; SSE2-NEXT: testb $2, %al 1410; SSE2-NEXT: je LBB12_4 1411; SSE2-NEXT: LBB12_3: ## %cond.store1 1412; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] 1413; SSE2-NEXT: movd %xmm0, 4(%rdi) 1414; SSE2-NEXT: testb $4, %al 1415; SSE2-NEXT: je LBB12_6 1416; SSE2-NEXT: LBB12_5: ## %cond.store3 1417; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] 1418; SSE2-NEXT: movd %xmm0, 8(%rdi) 1419; SSE2-NEXT: testb $8, %al 1420; SSE2-NEXT: je LBB12_8 1421; SSE2-NEXT: LBB12_7: ## %cond.store5 1422; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] 1423; SSE2-NEXT: movd %xmm0, 12(%rdi) 1424; SSE2-NEXT: testb $16, %al 1425; SSE2-NEXT: je LBB12_10 1426; SSE2-NEXT: LBB12_9: ## %cond.store7 1427; SSE2-NEXT: movd %xmm3, 16(%rdi) 1428; SSE2-NEXT: testb $32, %al 1429; SSE2-NEXT: je LBB12_12 1430; SSE2-NEXT: LBB12_11: ## %cond.store9 1431; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] 1432; SSE2-NEXT: movd %xmm0, 20(%rdi) 1433; SSE2-NEXT: testb $64, %al 1434; SSE2-NEXT: je LBB12_14 1435; SSE2-NEXT: LBB12_13: ## %cond.store11 1436; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] 1437; SSE2-NEXT: movd %xmm0, 24(%rdi) 1438; SSE2-NEXT: testb $-128, %al 1439; SSE2-NEXT: je LBB12_16 1440; SSE2-NEXT: LBB12_15: ## %cond.store13 1441; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] 1442; SSE2-NEXT: movd %xmm0, 28(%rdi) 1443; SSE2-NEXT: retq 1444; 1445; SSE4-LABEL: store_v8i32_v8i32: 1446; SSE4: ## %bb.0: 1447; SSE4-NEXT: pxor %xmm4, %xmm4 1448; SSE4-NEXT: pcmpeqd %xmm4, %xmm1 1449; SSE4-NEXT: pcmpeqd %xmm4, %xmm0 1450; SSE4-NEXT: packssdw %xmm1, %xmm0 1451; SSE4-NEXT: packsswb %xmm0, %xmm0 1452; SSE4-NEXT: pmovmskb %xmm0, %eax 1453; SSE4-NEXT: testb $1, %al 1454; SSE4-NEXT: jne LBB12_1 1455; SSE4-NEXT: ## %bb.2: ## %else 1456; SSE4-NEXT: testb $2, %al 1457; SSE4-NEXT: jne LBB12_3 1458; SSE4-NEXT: LBB12_4: ## %else2 1459; SSE4-NEXT: testb $4, %al 1460; SSE4-NEXT: jne LBB12_5 1461; SSE4-NEXT: LBB12_6: ## %else4 1462; SSE4-NEXT: testb $8, %al 1463; SSE4-NEXT: jne LBB12_7 1464; SSE4-NEXT: LBB12_8: ## %else6 1465; SSE4-NEXT: testb $16, %al 1466; SSE4-NEXT: jne LBB12_9 1467; SSE4-NEXT: LBB12_10: ## %else8 1468; SSE4-NEXT: testb $32, %al 1469; SSE4-NEXT: jne LBB12_11 1470; SSE4-NEXT: LBB12_12: ## %else10 1471; SSE4-NEXT: testb $64, %al 1472; SSE4-NEXT: jne LBB12_13 1473; SSE4-NEXT: LBB12_14: ## %else12 1474; SSE4-NEXT: testb $-128, %al 1475; SSE4-NEXT: jne LBB12_15 1476; SSE4-NEXT: LBB12_16: ## %else14 1477; SSE4-NEXT: retq 1478; SSE4-NEXT: LBB12_1: ## %cond.store 1479; SSE4-NEXT: movss %xmm2, (%rdi) 1480; SSE4-NEXT: testb $2, %al 1481; SSE4-NEXT: je LBB12_4 1482; SSE4-NEXT: LBB12_3: ## %cond.store1 1483; SSE4-NEXT: extractps $1, %xmm2, 4(%rdi) 1484; SSE4-NEXT: testb $4, %al 1485; SSE4-NEXT: je LBB12_6 1486; SSE4-NEXT: LBB12_5: ## %cond.store3 1487; SSE4-NEXT: extractps $2, %xmm2, 8(%rdi) 1488; SSE4-NEXT: testb $8, %al 1489; SSE4-NEXT: je LBB12_8 1490; SSE4-NEXT: LBB12_7: ## %cond.store5 1491; SSE4-NEXT: extractps $3, %xmm2, 12(%rdi) 1492; SSE4-NEXT: testb $16, %al 1493; SSE4-NEXT: je LBB12_10 1494; SSE4-NEXT: LBB12_9: ## %cond.store7 1495; SSE4-NEXT: movss %xmm3, 16(%rdi) 1496; SSE4-NEXT: testb $32, %al 1497; SSE4-NEXT: je LBB12_12 1498; SSE4-NEXT: LBB12_11: ## %cond.store9 1499; SSE4-NEXT: extractps $1, %xmm3, 20(%rdi) 1500; SSE4-NEXT: testb $64, %al 1501; SSE4-NEXT: je LBB12_14 1502; SSE4-NEXT: LBB12_13: ## %cond.store11 1503; SSE4-NEXT: extractps $2, %xmm3, 24(%rdi) 1504; SSE4-NEXT: testb $-128, %al 1505; SSE4-NEXT: je LBB12_16 1506; SSE4-NEXT: LBB12_15: ## %cond.store13 1507; SSE4-NEXT: extractps $3, %xmm3, 28(%rdi) 1508; SSE4-NEXT: retq 1509; 1510; AVX1-LABEL: store_v8i32_v8i32: 1511; AVX1: ## %bb.0: 1512; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1513; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 1514; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 1515; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 1516; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1517; AVX1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) 1518; AVX1-NEXT: vzeroupper 1519; AVX1-NEXT: retq 1520; 1521; AVX2-LABEL: store_v8i32_v8i32: 1522; AVX2: ## %bb.0: 1523; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1524; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 1525; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi) 1526; AVX2-NEXT: vzeroupper 1527; AVX2-NEXT: retq 1528; 1529; AVX512F-LABEL: store_v8i32_v8i32: 1530; AVX512F: ## %bb.0: 1531; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 1532; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 1533; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 1534; AVX512F-NEXT: kshiftlw $8, %k0, %k0 1535; AVX512F-NEXT: kshiftrw $8, %k0, %k1 1536; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} 1537; AVX512F-NEXT: vzeroupper 1538; AVX512F-NEXT: retq 1539; 1540; AVX512VL-LABEL: store_v8i32_v8i32: 1541; AVX512VL: ## %bb.0: 1542; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 1543; AVX512VL-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1} 1544; AVX512VL-NEXT: vzeroupper 1545; AVX512VL-NEXT: retq 1546; 1547; X86-AVX512-LABEL: store_v8i32_v8i32: 1548; X86-AVX512: ## %bb.0: 1549; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 1550; X86-AVX512-NEXT: vptestnmd %ymm0, %ymm0, %k1 1551; X86-AVX512-NEXT: vmovdqu32 %ymm1, (%eax) {%k1} 1552; X86-AVX512-NEXT: vzeroupper 1553; X86-AVX512-NEXT: retl 1554 %mask = icmp eq <8 x i32> %trigger, zeroinitializer 1555 call void @llvm.masked.store.v8i32.p0(<8 x i32> %val, ptr %addr, i32 4, <8 x i1> %mask) 1556 ret void 1557} 1558 1559; 1560; vXi16 1561; 1562 1563define void @store_v8i16_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i16> %val) nounwind { 1564; SSE2-LABEL: store_v8i16_v8i16: 1565; SSE2: ## %bb.0: 1566; SSE2-NEXT: pxor %xmm2, %xmm2 1567; SSE2-NEXT: pcmpeqw %xmm0, %xmm2 1568; SSE2-NEXT: packsswb %xmm2, %xmm2 1569; SSE2-NEXT: pmovmskb %xmm2, %eax 1570; SSE2-NEXT: testb $1, %al 1571; SSE2-NEXT: jne LBB13_1 1572; SSE2-NEXT: ## %bb.2: ## %else 1573; SSE2-NEXT: testb $2, %al 1574; SSE2-NEXT: jne LBB13_3 1575; SSE2-NEXT: LBB13_4: ## %else2 1576; SSE2-NEXT: testb $4, %al 1577; SSE2-NEXT: jne LBB13_5 1578; SSE2-NEXT: LBB13_6: ## %else4 1579; SSE2-NEXT: testb $8, %al 1580; SSE2-NEXT: jne LBB13_7 1581; SSE2-NEXT: LBB13_8: ## %else6 1582; SSE2-NEXT: testb $16, %al 1583; SSE2-NEXT: jne LBB13_9 1584; SSE2-NEXT: LBB13_10: ## %else8 1585; SSE2-NEXT: testb $32, %al 1586; SSE2-NEXT: jne LBB13_11 1587; SSE2-NEXT: LBB13_12: ## %else10 1588; SSE2-NEXT: testb $64, %al 1589; SSE2-NEXT: jne LBB13_13 1590; SSE2-NEXT: LBB13_14: ## %else12 1591; SSE2-NEXT: testb $-128, %al 1592; SSE2-NEXT: jne LBB13_15 1593; SSE2-NEXT: LBB13_16: ## %else14 1594; SSE2-NEXT: retq 1595; SSE2-NEXT: LBB13_1: ## %cond.store 1596; SSE2-NEXT: movd %xmm1, %ecx 1597; SSE2-NEXT: movw %cx, (%rdi) 1598; SSE2-NEXT: testb $2, %al 1599; SSE2-NEXT: je LBB13_4 1600; SSE2-NEXT: LBB13_3: ## %cond.store1 1601; SSE2-NEXT: pextrw $1, %xmm1, %ecx 1602; SSE2-NEXT: movw %cx, 2(%rdi) 1603; SSE2-NEXT: testb $4, %al 1604; SSE2-NEXT: je LBB13_6 1605; SSE2-NEXT: LBB13_5: ## %cond.store3 1606; SSE2-NEXT: pextrw $2, %xmm1, %ecx 1607; SSE2-NEXT: movw %cx, 4(%rdi) 1608; SSE2-NEXT: testb $8, %al 1609; SSE2-NEXT: je LBB13_8 1610; SSE2-NEXT: LBB13_7: ## %cond.store5 1611; SSE2-NEXT: pextrw $3, %xmm1, %ecx 1612; SSE2-NEXT: movw %cx, 6(%rdi) 1613; SSE2-NEXT: testb $16, %al 1614; SSE2-NEXT: je LBB13_10 1615; SSE2-NEXT: LBB13_9: ## %cond.store7 1616; SSE2-NEXT: pextrw $4, %xmm1, %ecx 1617; SSE2-NEXT: movw %cx, 8(%rdi) 1618; SSE2-NEXT: testb $32, %al 1619; SSE2-NEXT: je LBB13_12 1620; SSE2-NEXT: LBB13_11: ## %cond.store9 1621; SSE2-NEXT: pextrw $5, %xmm1, %ecx 1622; SSE2-NEXT: movw %cx, 10(%rdi) 1623; SSE2-NEXT: testb $64, %al 1624; SSE2-NEXT: je LBB13_14 1625; SSE2-NEXT: LBB13_13: ## %cond.store11 1626; SSE2-NEXT: pextrw $6, %xmm1, %ecx 1627; SSE2-NEXT: movw %cx, 12(%rdi) 1628; SSE2-NEXT: testb $-128, %al 1629; SSE2-NEXT: je LBB13_16 1630; SSE2-NEXT: LBB13_15: ## %cond.store13 1631; SSE2-NEXT: pextrw $7, %xmm1, %eax 1632; SSE2-NEXT: movw %ax, 14(%rdi) 1633; SSE2-NEXT: retq 1634; 1635; SSE4-LABEL: store_v8i16_v8i16: 1636; SSE4: ## %bb.0: 1637; SSE4-NEXT: pxor %xmm2, %xmm2 1638; SSE4-NEXT: pcmpeqw %xmm0, %xmm2 1639; SSE4-NEXT: packsswb %xmm2, %xmm2 1640; SSE4-NEXT: pmovmskb %xmm2, %eax 1641; SSE4-NEXT: testb $1, %al 1642; SSE4-NEXT: jne LBB13_1 1643; SSE4-NEXT: ## %bb.2: ## %else 1644; SSE4-NEXT: testb $2, %al 1645; SSE4-NEXT: jne LBB13_3 1646; SSE4-NEXT: LBB13_4: ## %else2 1647; SSE4-NEXT: testb $4, %al 1648; SSE4-NEXT: jne LBB13_5 1649; SSE4-NEXT: LBB13_6: ## %else4 1650; SSE4-NEXT: testb $8, %al 1651; SSE4-NEXT: jne LBB13_7 1652; SSE4-NEXT: LBB13_8: ## %else6 1653; SSE4-NEXT: testb $16, %al 1654; SSE4-NEXT: jne LBB13_9 1655; SSE4-NEXT: LBB13_10: ## %else8 1656; SSE4-NEXT: testb $32, %al 1657; SSE4-NEXT: jne LBB13_11 1658; SSE4-NEXT: LBB13_12: ## %else10 1659; SSE4-NEXT: testb $64, %al 1660; SSE4-NEXT: jne LBB13_13 1661; SSE4-NEXT: LBB13_14: ## %else12 1662; SSE4-NEXT: testb $-128, %al 1663; SSE4-NEXT: jne LBB13_15 1664; SSE4-NEXT: LBB13_16: ## %else14 1665; SSE4-NEXT: retq 1666; SSE4-NEXT: LBB13_1: ## %cond.store 1667; SSE4-NEXT: pextrw $0, %xmm1, (%rdi) 1668; SSE4-NEXT: testb $2, %al 1669; SSE4-NEXT: je LBB13_4 1670; SSE4-NEXT: LBB13_3: ## %cond.store1 1671; SSE4-NEXT: pextrw $1, %xmm1, 2(%rdi) 1672; SSE4-NEXT: testb $4, %al 1673; SSE4-NEXT: je LBB13_6 1674; SSE4-NEXT: LBB13_5: ## %cond.store3 1675; SSE4-NEXT: pextrw $2, %xmm1, 4(%rdi) 1676; SSE4-NEXT: testb $8, %al 1677; SSE4-NEXT: je LBB13_8 1678; SSE4-NEXT: LBB13_7: ## %cond.store5 1679; SSE4-NEXT: pextrw $3, %xmm1, 6(%rdi) 1680; SSE4-NEXT: testb $16, %al 1681; SSE4-NEXT: je LBB13_10 1682; SSE4-NEXT: LBB13_9: ## %cond.store7 1683; SSE4-NEXT: pextrw $4, %xmm1, 8(%rdi) 1684; SSE4-NEXT: testb $32, %al 1685; SSE4-NEXT: je LBB13_12 1686; SSE4-NEXT: LBB13_11: ## %cond.store9 1687; SSE4-NEXT: pextrw $5, %xmm1, 10(%rdi) 1688; SSE4-NEXT: testb $64, %al 1689; SSE4-NEXT: je LBB13_14 1690; SSE4-NEXT: LBB13_13: ## %cond.store11 1691; SSE4-NEXT: pextrw $6, %xmm1, 12(%rdi) 1692; SSE4-NEXT: testb $-128, %al 1693; SSE4-NEXT: je LBB13_16 1694; SSE4-NEXT: LBB13_15: ## %cond.store13 1695; SSE4-NEXT: pextrw $7, %xmm1, 14(%rdi) 1696; SSE4-NEXT: retq 1697; 1698; AVX1OR2-LABEL: store_v8i16_v8i16: 1699; AVX1OR2: ## %bb.0: 1700; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1701; AVX1OR2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 1702; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 1703; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax 1704; AVX1OR2-NEXT: testb $1, %al 1705; AVX1OR2-NEXT: jne LBB13_1 1706; AVX1OR2-NEXT: ## %bb.2: ## %else 1707; AVX1OR2-NEXT: testb $2, %al 1708; AVX1OR2-NEXT: jne LBB13_3 1709; AVX1OR2-NEXT: LBB13_4: ## %else2 1710; AVX1OR2-NEXT: testb $4, %al 1711; AVX1OR2-NEXT: jne LBB13_5 1712; AVX1OR2-NEXT: LBB13_6: ## %else4 1713; AVX1OR2-NEXT: testb $8, %al 1714; AVX1OR2-NEXT: jne LBB13_7 1715; AVX1OR2-NEXT: LBB13_8: ## %else6 1716; AVX1OR2-NEXT: testb $16, %al 1717; AVX1OR2-NEXT: jne LBB13_9 1718; AVX1OR2-NEXT: LBB13_10: ## %else8 1719; AVX1OR2-NEXT: testb $32, %al 1720; AVX1OR2-NEXT: jne LBB13_11 1721; AVX1OR2-NEXT: LBB13_12: ## %else10 1722; AVX1OR2-NEXT: testb $64, %al 1723; AVX1OR2-NEXT: jne LBB13_13 1724; AVX1OR2-NEXT: LBB13_14: ## %else12 1725; AVX1OR2-NEXT: testb $-128, %al 1726; AVX1OR2-NEXT: jne LBB13_15 1727; AVX1OR2-NEXT: LBB13_16: ## %else14 1728; AVX1OR2-NEXT: retq 1729; AVX1OR2-NEXT: LBB13_1: ## %cond.store 1730; AVX1OR2-NEXT: vpextrw $0, %xmm1, (%rdi) 1731; AVX1OR2-NEXT: testb $2, %al 1732; AVX1OR2-NEXT: je LBB13_4 1733; AVX1OR2-NEXT: LBB13_3: ## %cond.store1 1734; AVX1OR2-NEXT: vpextrw $1, %xmm1, 2(%rdi) 1735; AVX1OR2-NEXT: testb $4, %al 1736; AVX1OR2-NEXT: je LBB13_6 1737; AVX1OR2-NEXT: LBB13_5: ## %cond.store3 1738; AVX1OR2-NEXT: vpextrw $2, %xmm1, 4(%rdi) 1739; AVX1OR2-NEXT: testb $8, %al 1740; AVX1OR2-NEXT: je LBB13_8 1741; AVX1OR2-NEXT: LBB13_7: ## %cond.store5 1742; AVX1OR2-NEXT: vpextrw $3, %xmm1, 6(%rdi) 1743; AVX1OR2-NEXT: testb $16, %al 1744; AVX1OR2-NEXT: je LBB13_10 1745; AVX1OR2-NEXT: LBB13_9: ## %cond.store7 1746; AVX1OR2-NEXT: vpextrw $4, %xmm1, 8(%rdi) 1747; AVX1OR2-NEXT: testb $32, %al 1748; AVX1OR2-NEXT: je LBB13_12 1749; AVX1OR2-NEXT: LBB13_11: ## %cond.store9 1750; AVX1OR2-NEXT: vpextrw $5, %xmm1, 10(%rdi) 1751; AVX1OR2-NEXT: testb $64, %al 1752; AVX1OR2-NEXT: je LBB13_14 1753; AVX1OR2-NEXT: LBB13_13: ## %cond.store11 1754; AVX1OR2-NEXT: vpextrw $6, %xmm1, 12(%rdi) 1755; AVX1OR2-NEXT: testb $-128, %al 1756; AVX1OR2-NEXT: je LBB13_16 1757; AVX1OR2-NEXT: LBB13_15: ## %cond.store13 1758; AVX1OR2-NEXT: vpextrw $7, %xmm1, 14(%rdi) 1759; AVX1OR2-NEXT: retq 1760; 1761; AVX512F-LABEL: store_v8i16_v8i16: 1762; AVX512F: ## %bb.0: 1763; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 1764; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 1765; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 1766; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 1767; AVX512F-NEXT: kmovw %k0, %eax 1768; AVX512F-NEXT: testb $1, %al 1769; AVX512F-NEXT: jne LBB13_1 1770; AVX512F-NEXT: ## %bb.2: ## %else 1771; AVX512F-NEXT: testb $2, %al 1772; AVX512F-NEXT: jne LBB13_3 1773; AVX512F-NEXT: LBB13_4: ## %else2 1774; AVX512F-NEXT: testb $4, %al 1775; AVX512F-NEXT: jne LBB13_5 1776; AVX512F-NEXT: LBB13_6: ## %else4 1777; AVX512F-NEXT: testb $8, %al 1778; AVX512F-NEXT: jne LBB13_7 1779; AVX512F-NEXT: LBB13_8: ## %else6 1780; AVX512F-NEXT: testb $16, %al 1781; AVX512F-NEXT: jne LBB13_9 1782; AVX512F-NEXT: LBB13_10: ## %else8 1783; AVX512F-NEXT: testb $32, %al 1784; AVX512F-NEXT: jne LBB13_11 1785; AVX512F-NEXT: LBB13_12: ## %else10 1786; AVX512F-NEXT: testb $64, %al 1787; AVX512F-NEXT: jne LBB13_13 1788; AVX512F-NEXT: LBB13_14: ## %else12 1789; AVX512F-NEXT: testb $-128, %al 1790; AVX512F-NEXT: jne LBB13_15 1791; AVX512F-NEXT: LBB13_16: ## %else14 1792; AVX512F-NEXT: vzeroupper 1793; AVX512F-NEXT: retq 1794; AVX512F-NEXT: LBB13_1: ## %cond.store 1795; AVX512F-NEXT: vpextrw $0, %xmm1, (%rdi) 1796; AVX512F-NEXT: testb $2, %al 1797; AVX512F-NEXT: je LBB13_4 1798; AVX512F-NEXT: LBB13_3: ## %cond.store1 1799; AVX512F-NEXT: vpextrw $1, %xmm1, 2(%rdi) 1800; AVX512F-NEXT: testb $4, %al 1801; AVX512F-NEXT: je LBB13_6 1802; AVX512F-NEXT: LBB13_5: ## %cond.store3 1803; AVX512F-NEXT: vpextrw $2, %xmm1, 4(%rdi) 1804; AVX512F-NEXT: testb $8, %al 1805; AVX512F-NEXT: je LBB13_8 1806; AVX512F-NEXT: LBB13_7: ## %cond.store5 1807; AVX512F-NEXT: vpextrw $3, %xmm1, 6(%rdi) 1808; AVX512F-NEXT: testb $16, %al 1809; AVX512F-NEXT: je LBB13_10 1810; AVX512F-NEXT: LBB13_9: ## %cond.store7 1811; AVX512F-NEXT: vpextrw $4, %xmm1, 8(%rdi) 1812; AVX512F-NEXT: testb $32, %al 1813; AVX512F-NEXT: je LBB13_12 1814; AVX512F-NEXT: LBB13_11: ## %cond.store9 1815; AVX512F-NEXT: vpextrw $5, %xmm1, 10(%rdi) 1816; AVX512F-NEXT: testb $64, %al 1817; AVX512F-NEXT: je LBB13_14 1818; AVX512F-NEXT: LBB13_13: ## %cond.store11 1819; AVX512F-NEXT: vpextrw $6, %xmm1, 12(%rdi) 1820; AVX512F-NEXT: testb $-128, %al 1821; AVX512F-NEXT: je LBB13_16 1822; AVX512F-NEXT: LBB13_15: ## %cond.store13 1823; AVX512F-NEXT: vpextrw $7, %xmm1, 14(%rdi) 1824; AVX512F-NEXT: vzeroupper 1825; AVX512F-NEXT: retq 1826; 1827; AVX512VLDQ-LABEL: store_v8i16_v8i16: 1828; AVX512VLDQ: ## %bb.0: 1829; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 1830; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 1831; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0 1832; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k0 1833; AVX512VLDQ-NEXT: kmovw %k0, %eax 1834; AVX512VLDQ-NEXT: testb $1, %al 1835; AVX512VLDQ-NEXT: jne LBB13_1 1836; AVX512VLDQ-NEXT: ## %bb.2: ## %else 1837; AVX512VLDQ-NEXT: testb $2, %al 1838; AVX512VLDQ-NEXT: jne LBB13_3 1839; AVX512VLDQ-NEXT: LBB13_4: ## %else2 1840; AVX512VLDQ-NEXT: testb $4, %al 1841; AVX512VLDQ-NEXT: jne LBB13_5 1842; AVX512VLDQ-NEXT: LBB13_6: ## %else4 1843; AVX512VLDQ-NEXT: testb $8, %al 1844; AVX512VLDQ-NEXT: jne LBB13_7 1845; AVX512VLDQ-NEXT: LBB13_8: ## %else6 1846; AVX512VLDQ-NEXT: testb $16, %al 1847; AVX512VLDQ-NEXT: jne LBB13_9 1848; AVX512VLDQ-NEXT: LBB13_10: ## %else8 1849; AVX512VLDQ-NEXT: testb $32, %al 1850; AVX512VLDQ-NEXT: jne LBB13_11 1851; AVX512VLDQ-NEXT: LBB13_12: ## %else10 1852; AVX512VLDQ-NEXT: testb $64, %al 1853; AVX512VLDQ-NEXT: jne LBB13_13 1854; AVX512VLDQ-NEXT: LBB13_14: ## %else12 1855; AVX512VLDQ-NEXT: testb $-128, %al 1856; AVX512VLDQ-NEXT: jne LBB13_15 1857; AVX512VLDQ-NEXT: LBB13_16: ## %else14 1858; AVX512VLDQ-NEXT: vzeroupper 1859; AVX512VLDQ-NEXT: retq 1860; AVX512VLDQ-NEXT: LBB13_1: ## %cond.store 1861; AVX512VLDQ-NEXT: vpextrw $0, %xmm1, (%rdi) 1862; AVX512VLDQ-NEXT: testb $2, %al 1863; AVX512VLDQ-NEXT: je LBB13_4 1864; AVX512VLDQ-NEXT: LBB13_3: ## %cond.store1 1865; AVX512VLDQ-NEXT: vpextrw $1, %xmm1, 2(%rdi) 1866; AVX512VLDQ-NEXT: testb $4, %al 1867; AVX512VLDQ-NEXT: je LBB13_6 1868; AVX512VLDQ-NEXT: LBB13_5: ## %cond.store3 1869; AVX512VLDQ-NEXT: vpextrw $2, %xmm1, 4(%rdi) 1870; AVX512VLDQ-NEXT: testb $8, %al 1871; AVX512VLDQ-NEXT: je LBB13_8 1872; AVX512VLDQ-NEXT: LBB13_7: ## %cond.store5 1873; AVX512VLDQ-NEXT: vpextrw $3, %xmm1, 6(%rdi) 1874; AVX512VLDQ-NEXT: testb $16, %al 1875; AVX512VLDQ-NEXT: je LBB13_10 1876; AVX512VLDQ-NEXT: LBB13_9: ## %cond.store7 1877; AVX512VLDQ-NEXT: vpextrw $4, %xmm1, 8(%rdi) 1878; AVX512VLDQ-NEXT: testb $32, %al 1879; AVX512VLDQ-NEXT: je LBB13_12 1880; AVX512VLDQ-NEXT: LBB13_11: ## %cond.store9 1881; AVX512VLDQ-NEXT: vpextrw $5, %xmm1, 10(%rdi) 1882; AVX512VLDQ-NEXT: testb $64, %al 1883; AVX512VLDQ-NEXT: je LBB13_14 1884; AVX512VLDQ-NEXT: LBB13_13: ## %cond.store11 1885; AVX512VLDQ-NEXT: vpextrw $6, %xmm1, 12(%rdi) 1886; AVX512VLDQ-NEXT: testb $-128, %al 1887; AVX512VLDQ-NEXT: je LBB13_16 1888; AVX512VLDQ-NEXT: LBB13_15: ## %cond.store13 1889; AVX512VLDQ-NEXT: vpextrw $7, %xmm1, 14(%rdi) 1890; AVX512VLDQ-NEXT: vzeroupper 1891; AVX512VLDQ-NEXT: retq 1892; 1893; AVX512VLBW-LABEL: store_v8i16_v8i16: 1894; AVX512VLBW: ## %bb.0: 1895; AVX512VLBW-NEXT: vptestnmw %xmm0, %xmm0, %k1 1896; AVX512VLBW-NEXT: vmovdqu16 %xmm1, (%rdi) {%k1} 1897; AVX512VLBW-NEXT: retq 1898; 1899; X86-AVX512-LABEL: store_v8i16_v8i16: 1900; X86-AVX512: ## %bb.0: 1901; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 1902; X86-AVX512-NEXT: vptestnmw %xmm0, %xmm0, %k1 1903; X86-AVX512-NEXT: vmovdqu16 %xmm1, (%eax) {%k1} 1904; X86-AVX512-NEXT: retl 1905 %mask = icmp eq <8 x i16> %trigger, zeroinitializer 1906 call void @llvm.masked.store.v8i16.p0(<8 x i16> %val, ptr %addr, i32 4, <8 x i1> %mask) 1907 ret void 1908} 1909 1910define void @store_v16i16_v16i16(<16 x i16> %trigger, ptr %addr, <16 x i16> %val) nounwind { 1911; SSE2-LABEL: store_v16i16_v16i16: 1912; SSE2: ## %bb.0: 1913; SSE2-NEXT: pxor %xmm4, %xmm4 1914; SSE2-NEXT: pcmpeqw %xmm4, %xmm1 1915; SSE2-NEXT: pcmpeqw %xmm4, %xmm0 1916; SSE2-NEXT: packsswb %xmm1, %xmm0 1917; SSE2-NEXT: pmovmskb %xmm0, %eax 1918; SSE2-NEXT: testb $1, %al 1919; SSE2-NEXT: jne LBB14_1 1920; SSE2-NEXT: ## %bb.2: ## %else 1921; SSE2-NEXT: testb $2, %al 1922; SSE2-NEXT: jne LBB14_3 1923; SSE2-NEXT: LBB14_4: ## %else2 1924; SSE2-NEXT: testb $4, %al 1925; SSE2-NEXT: jne LBB14_5 1926; SSE2-NEXT: LBB14_6: ## %else4 1927; SSE2-NEXT: testb $8, %al 1928; SSE2-NEXT: jne LBB14_7 1929; SSE2-NEXT: LBB14_8: ## %else6 1930; SSE2-NEXT: testb $16, %al 1931; SSE2-NEXT: jne LBB14_9 1932; SSE2-NEXT: LBB14_10: ## %else8 1933; SSE2-NEXT: testb $32, %al 1934; SSE2-NEXT: jne LBB14_11 1935; SSE2-NEXT: LBB14_12: ## %else10 1936; SSE2-NEXT: testb $64, %al 1937; SSE2-NEXT: jne LBB14_13 1938; SSE2-NEXT: LBB14_14: ## %else12 1939; SSE2-NEXT: testb %al, %al 1940; SSE2-NEXT: js LBB14_15 1941; SSE2-NEXT: LBB14_16: ## %else14 1942; SSE2-NEXT: testl $256, %eax ## imm = 0x100 1943; SSE2-NEXT: jne LBB14_17 1944; SSE2-NEXT: LBB14_18: ## %else16 1945; SSE2-NEXT: testl $512, %eax ## imm = 0x200 1946; SSE2-NEXT: jne LBB14_19 1947; SSE2-NEXT: LBB14_20: ## %else18 1948; SSE2-NEXT: testl $1024, %eax ## imm = 0x400 1949; SSE2-NEXT: jne LBB14_21 1950; SSE2-NEXT: LBB14_22: ## %else20 1951; SSE2-NEXT: testl $2048, %eax ## imm = 0x800 1952; SSE2-NEXT: jne LBB14_23 1953; SSE2-NEXT: LBB14_24: ## %else22 1954; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 1955; SSE2-NEXT: jne LBB14_25 1956; SSE2-NEXT: LBB14_26: ## %else24 1957; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000 1958; SSE2-NEXT: jne LBB14_27 1959; SSE2-NEXT: LBB14_28: ## %else26 1960; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000 1961; SSE2-NEXT: jne LBB14_29 1962; SSE2-NEXT: LBB14_30: ## %else28 1963; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000 1964; SSE2-NEXT: jne LBB14_31 1965; SSE2-NEXT: LBB14_32: ## %else30 1966; SSE2-NEXT: retq 1967; SSE2-NEXT: LBB14_1: ## %cond.store 1968; SSE2-NEXT: movd %xmm2, %ecx 1969; SSE2-NEXT: movw %cx, (%rdi) 1970; SSE2-NEXT: testb $2, %al 1971; SSE2-NEXT: je LBB14_4 1972; SSE2-NEXT: LBB14_3: ## %cond.store1 1973; SSE2-NEXT: pextrw $1, %xmm2, %ecx 1974; SSE2-NEXT: movw %cx, 2(%rdi) 1975; SSE2-NEXT: testb $4, %al 1976; SSE2-NEXT: je LBB14_6 1977; SSE2-NEXT: LBB14_5: ## %cond.store3 1978; SSE2-NEXT: pextrw $2, %xmm2, %ecx 1979; SSE2-NEXT: movw %cx, 4(%rdi) 1980; SSE2-NEXT: testb $8, %al 1981; SSE2-NEXT: je LBB14_8 1982; SSE2-NEXT: LBB14_7: ## %cond.store5 1983; SSE2-NEXT: pextrw $3, %xmm2, %ecx 1984; SSE2-NEXT: movw %cx, 6(%rdi) 1985; SSE2-NEXT: testb $16, %al 1986; SSE2-NEXT: je LBB14_10 1987; SSE2-NEXT: LBB14_9: ## %cond.store7 1988; SSE2-NEXT: pextrw $4, %xmm2, %ecx 1989; SSE2-NEXT: movw %cx, 8(%rdi) 1990; SSE2-NEXT: testb $32, %al 1991; SSE2-NEXT: je LBB14_12 1992; SSE2-NEXT: LBB14_11: ## %cond.store9 1993; SSE2-NEXT: pextrw $5, %xmm2, %ecx 1994; SSE2-NEXT: movw %cx, 10(%rdi) 1995; SSE2-NEXT: testb $64, %al 1996; SSE2-NEXT: je LBB14_14 1997; SSE2-NEXT: LBB14_13: ## %cond.store11 1998; SSE2-NEXT: pextrw $6, %xmm2, %ecx 1999; SSE2-NEXT: movw %cx, 12(%rdi) 2000; SSE2-NEXT: testb %al, %al 2001; SSE2-NEXT: jns LBB14_16 2002; SSE2-NEXT: LBB14_15: ## %cond.store13 2003; SSE2-NEXT: pextrw $7, %xmm2, %ecx 2004; SSE2-NEXT: movw %cx, 14(%rdi) 2005; SSE2-NEXT: testl $256, %eax ## imm = 0x100 2006; SSE2-NEXT: je LBB14_18 2007; SSE2-NEXT: LBB14_17: ## %cond.store15 2008; SSE2-NEXT: movd %xmm3, %ecx 2009; SSE2-NEXT: movw %cx, 16(%rdi) 2010; SSE2-NEXT: testl $512, %eax ## imm = 0x200 2011; SSE2-NEXT: je LBB14_20 2012; SSE2-NEXT: LBB14_19: ## %cond.store17 2013; SSE2-NEXT: pextrw $1, %xmm3, %ecx 2014; SSE2-NEXT: movw %cx, 18(%rdi) 2015; SSE2-NEXT: testl $1024, %eax ## imm = 0x400 2016; SSE2-NEXT: je LBB14_22 2017; SSE2-NEXT: LBB14_21: ## %cond.store19 2018; SSE2-NEXT: pextrw $2, %xmm3, %ecx 2019; SSE2-NEXT: movw %cx, 20(%rdi) 2020; SSE2-NEXT: testl $2048, %eax ## imm = 0x800 2021; SSE2-NEXT: je LBB14_24 2022; SSE2-NEXT: LBB14_23: ## %cond.store21 2023; SSE2-NEXT: pextrw $3, %xmm3, %ecx 2024; SSE2-NEXT: movw %cx, 22(%rdi) 2025; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 2026; SSE2-NEXT: je LBB14_26 2027; SSE2-NEXT: LBB14_25: ## %cond.store23 2028; SSE2-NEXT: pextrw $4, %xmm3, %ecx 2029; SSE2-NEXT: movw %cx, 24(%rdi) 2030; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000 2031; SSE2-NEXT: je LBB14_28 2032; SSE2-NEXT: LBB14_27: ## %cond.store25 2033; SSE2-NEXT: pextrw $5, %xmm3, %ecx 2034; SSE2-NEXT: movw %cx, 26(%rdi) 2035; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000 2036; SSE2-NEXT: je LBB14_30 2037; SSE2-NEXT: LBB14_29: ## %cond.store27 2038; SSE2-NEXT: pextrw $6, %xmm3, %ecx 2039; SSE2-NEXT: movw %cx, 28(%rdi) 2040; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000 2041; SSE2-NEXT: je LBB14_32 2042; SSE2-NEXT: LBB14_31: ## %cond.store29 2043; SSE2-NEXT: pextrw $7, %xmm3, %eax 2044; SSE2-NEXT: movw %ax, 30(%rdi) 2045; SSE2-NEXT: retq 2046; 2047; SSE4-LABEL: store_v16i16_v16i16: 2048; SSE4: ## %bb.0: 2049; SSE4-NEXT: pxor %xmm4, %xmm4 2050; SSE4-NEXT: pcmpeqw %xmm4, %xmm1 2051; SSE4-NEXT: pcmpeqw %xmm4, %xmm0 2052; SSE4-NEXT: packsswb %xmm1, %xmm0 2053; SSE4-NEXT: pmovmskb %xmm0, %eax 2054; SSE4-NEXT: testb $1, %al 2055; SSE4-NEXT: jne LBB14_1 2056; SSE4-NEXT: ## %bb.2: ## %else 2057; SSE4-NEXT: testb $2, %al 2058; SSE4-NEXT: jne LBB14_3 2059; SSE4-NEXT: LBB14_4: ## %else2 2060; SSE4-NEXT: testb $4, %al 2061; SSE4-NEXT: jne LBB14_5 2062; SSE4-NEXT: LBB14_6: ## %else4 2063; SSE4-NEXT: testb $8, %al 2064; SSE4-NEXT: jne LBB14_7 2065; SSE4-NEXT: LBB14_8: ## %else6 2066; SSE4-NEXT: testb $16, %al 2067; SSE4-NEXT: jne LBB14_9 2068; SSE4-NEXT: LBB14_10: ## %else8 2069; SSE4-NEXT: testb $32, %al 2070; SSE4-NEXT: jne LBB14_11 2071; SSE4-NEXT: LBB14_12: ## %else10 2072; SSE4-NEXT: testb $64, %al 2073; SSE4-NEXT: jne LBB14_13 2074; SSE4-NEXT: LBB14_14: ## %else12 2075; SSE4-NEXT: testb %al, %al 2076; SSE4-NEXT: js LBB14_15 2077; SSE4-NEXT: LBB14_16: ## %else14 2078; SSE4-NEXT: testl $256, %eax ## imm = 0x100 2079; SSE4-NEXT: jne LBB14_17 2080; SSE4-NEXT: LBB14_18: ## %else16 2081; SSE4-NEXT: testl $512, %eax ## imm = 0x200 2082; SSE4-NEXT: jne LBB14_19 2083; SSE4-NEXT: LBB14_20: ## %else18 2084; SSE4-NEXT: testl $1024, %eax ## imm = 0x400 2085; SSE4-NEXT: jne LBB14_21 2086; SSE4-NEXT: LBB14_22: ## %else20 2087; SSE4-NEXT: testl $2048, %eax ## imm = 0x800 2088; SSE4-NEXT: jne LBB14_23 2089; SSE4-NEXT: LBB14_24: ## %else22 2090; SSE4-NEXT: testl $4096, %eax ## imm = 0x1000 2091; SSE4-NEXT: jne LBB14_25 2092; SSE4-NEXT: LBB14_26: ## %else24 2093; SSE4-NEXT: testl $8192, %eax ## imm = 0x2000 2094; SSE4-NEXT: jne LBB14_27 2095; SSE4-NEXT: LBB14_28: ## %else26 2096; SSE4-NEXT: testl $16384, %eax ## imm = 0x4000 2097; SSE4-NEXT: jne LBB14_29 2098; SSE4-NEXT: LBB14_30: ## %else28 2099; SSE4-NEXT: testl $32768, %eax ## imm = 0x8000 2100; SSE4-NEXT: jne LBB14_31 2101; SSE4-NEXT: LBB14_32: ## %else30 2102; SSE4-NEXT: retq 2103; SSE4-NEXT: LBB14_1: ## %cond.store 2104; SSE4-NEXT: pextrw $0, %xmm2, (%rdi) 2105; SSE4-NEXT: testb $2, %al 2106; SSE4-NEXT: je LBB14_4 2107; SSE4-NEXT: LBB14_3: ## %cond.store1 2108; SSE4-NEXT: pextrw $1, %xmm2, 2(%rdi) 2109; SSE4-NEXT: testb $4, %al 2110; SSE4-NEXT: je LBB14_6 2111; SSE4-NEXT: LBB14_5: ## %cond.store3 2112; SSE4-NEXT: pextrw $2, %xmm2, 4(%rdi) 2113; SSE4-NEXT: testb $8, %al 2114; SSE4-NEXT: je LBB14_8 2115; SSE4-NEXT: LBB14_7: ## %cond.store5 2116; SSE4-NEXT: pextrw $3, %xmm2, 6(%rdi) 2117; SSE4-NEXT: testb $16, %al 2118; SSE4-NEXT: je LBB14_10 2119; SSE4-NEXT: LBB14_9: ## %cond.store7 2120; SSE4-NEXT: pextrw $4, %xmm2, 8(%rdi) 2121; SSE4-NEXT: testb $32, %al 2122; SSE4-NEXT: je LBB14_12 2123; SSE4-NEXT: LBB14_11: ## %cond.store9 2124; SSE4-NEXT: pextrw $5, %xmm2, 10(%rdi) 2125; SSE4-NEXT: testb $64, %al 2126; SSE4-NEXT: je LBB14_14 2127; SSE4-NEXT: LBB14_13: ## %cond.store11 2128; SSE4-NEXT: pextrw $6, %xmm2, 12(%rdi) 2129; SSE4-NEXT: testb %al, %al 2130; SSE4-NEXT: jns LBB14_16 2131; SSE4-NEXT: LBB14_15: ## %cond.store13 2132; SSE4-NEXT: pextrw $7, %xmm2, 14(%rdi) 2133; SSE4-NEXT: testl $256, %eax ## imm = 0x100 2134; SSE4-NEXT: je LBB14_18 2135; SSE4-NEXT: LBB14_17: ## %cond.store15 2136; SSE4-NEXT: pextrw $0, %xmm3, 16(%rdi) 2137; SSE4-NEXT: testl $512, %eax ## imm = 0x200 2138; SSE4-NEXT: je LBB14_20 2139; SSE4-NEXT: LBB14_19: ## %cond.store17 2140; SSE4-NEXT: pextrw $1, %xmm3, 18(%rdi) 2141; SSE4-NEXT: testl $1024, %eax ## imm = 0x400 2142; SSE4-NEXT: je LBB14_22 2143; SSE4-NEXT: LBB14_21: ## %cond.store19 2144; SSE4-NEXT: pextrw $2, %xmm3, 20(%rdi) 2145; SSE4-NEXT: testl $2048, %eax ## imm = 0x800 2146; SSE4-NEXT: je LBB14_24 2147; SSE4-NEXT: LBB14_23: ## %cond.store21 2148; SSE4-NEXT: pextrw $3, %xmm3, 22(%rdi) 2149; SSE4-NEXT: testl $4096, %eax ## imm = 0x1000 2150; SSE4-NEXT: je LBB14_26 2151; SSE4-NEXT: LBB14_25: ## %cond.store23 2152; SSE4-NEXT: pextrw $4, %xmm3, 24(%rdi) 2153; SSE4-NEXT: testl $8192, %eax ## imm = 0x2000 2154; SSE4-NEXT: je LBB14_28 2155; SSE4-NEXT: LBB14_27: ## %cond.store25 2156; SSE4-NEXT: pextrw $5, %xmm3, 26(%rdi) 2157; SSE4-NEXT: testl $16384, %eax ## imm = 0x4000 2158; SSE4-NEXT: je LBB14_30 2159; SSE4-NEXT: LBB14_29: ## %cond.store27 2160; SSE4-NEXT: pextrw $6, %xmm3, 28(%rdi) 2161; SSE4-NEXT: testl $32768, %eax ## imm = 0x8000 2162; SSE4-NEXT: je LBB14_32 2163; SSE4-NEXT: LBB14_31: ## %cond.store29 2164; SSE4-NEXT: pextrw $7, %xmm3, 30(%rdi) 2165; SSE4-NEXT: retq 2166; 2167; AVX1-LABEL: store_v16i16_v16i16: 2168; AVX1: ## %bb.0: 2169; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2170; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 2171; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2 2172; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 2173; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 2174; AVX1-NEXT: vpmovmskb %xmm0, %eax 2175; AVX1-NEXT: testb $1, %al 2176; AVX1-NEXT: jne LBB14_1 2177; AVX1-NEXT: ## %bb.2: ## %else 2178; AVX1-NEXT: testb $2, %al 2179; AVX1-NEXT: jne LBB14_3 2180; AVX1-NEXT: LBB14_4: ## %else2 2181; AVX1-NEXT: testb $4, %al 2182; AVX1-NEXT: jne LBB14_5 2183; AVX1-NEXT: LBB14_6: ## %else4 2184; AVX1-NEXT: testb $8, %al 2185; AVX1-NEXT: jne LBB14_7 2186; AVX1-NEXT: LBB14_8: ## %else6 2187; AVX1-NEXT: testb $16, %al 2188; AVX1-NEXT: jne LBB14_9 2189; AVX1-NEXT: LBB14_10: ## %else8 2190; AVX1-NEXT: testb $32, %al 2191; AVX1-NEXT: jne LBB14_11 2192; AVX1-NEXT: LBB14_12: ## %else10 2193; AVX1-NEXT: testb $64, %al 2194; AVX1-NEXT: jne LBB14_13 2195; AVX1-NEXT: LBB14_14: ## %else12 2196; AVX1-NEXT: testb %al, %al 2197; AVX1-NEXT: jns LBB14_16 2198; AVX1-NEXT: LBB14_15: ## %cond.store13 2199; AVX1-NEXT: vpextrw $7, %xmm1, 14(%rdi) 2200; AVX1-NEXT: LBB14_16: ## %else14 2201; AVX1-NEXT: testl $256, %eax ## imm = 0x100 2202; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 2203; AVX1-NEXT: jne LBB14_17 2204; AVX1-NEXT: ## %bb.18: ## %else16 2205; AVX1-NEXT: testl $512, %eax ## imm = 0x200 2206; AVX1-NEXT: jne LBB14_19 2207; AVX1-NEXT: LBB14_20: ## %else18 2208; AVX1-NEXT: testl $1024, %eax ## imm = 0x400 2209; AVX1-NEXT: jne LBB14_21 2210; AVX1-NEXT: LBB14_22: ## %else20 2211; AVX1-NEXT: testl $2048, %eax ## imm = 0x800 2212; AVX1-NEXT: jne LBB14_23 2213; AVX1-NEXT: LBB14_24: ## %else22 2214; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 2215; AVX1-NEXT: jne LBB14_25 2216; AVX1-NEXT: LBB14_26: ## %else24 2217; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 2218; AVX1-NEXT: jne LBB14_27 2219; AVX1-NEXT: LBB14_28: ## %else26 2220; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 2221; AVX1-NEXT: jne LBB14_29 2222; AVX1-NEXT: LBB14_30: ## %else28 2223; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000 2224; AVX1-NEXT: jne LBB14_31 2225; AVX1-NEXT: LBB14_32: ## %else30 2226; AVX1-NEXT: vzeroupper 2227; AVX1-NEXT: retq 2228; AVX1-NEXT: LBB14_1: ## %cond.store 2229; AVX1-NEXT: vpextrw $0, %xmm1, (%rdi) 2230; AVX1-NEXT: testb $2, %al 2231; AVX1-NEXT: je LBB14_4 2232; AVX1-NEXT: LBB14_3: ## %cond.store1 2233; AVX1-NEXT: vpextrw $1, %xmm1, 2(%rdi) 2234; AVX1-NEXT: testb $4, %al 2235; AVX1-NEXT: je LBB14_6 2236; AVX1-NEXT: LBB14_5: ## %cond.store3 2237; AVX1-NEXT: vpextrw $2, %xmm1, 4(%rdi) 2238; AVX1-NEXT: testb $8, %al 2239; AVX1-NEXT: je LBB14_8 2240; AVX1-NEXT: LBB14_7: ## %cond.store5 2241; AVX1-NEXT: vpextrw $3, %xmm1, 6(%rdi) 2242; AVX1-NEXT: testb $16, %al 2243; AVX1-NEXT: je LBB14_10 2244; AVX1-NEXT: LBB14_9: ## %cond.store7 2245; AVX1-NEXT: vpextrw $4, %xmm1, 8(%rdi) 2246; AVX1-NEXT: testb $32, %al 2247; AVX1-NEXT: je LBB14_12 2248; AVX1-NEXT: LBB14_11: ## %cond.store9 2249; AVX1-NEXT: vpextrw $5, %xmm1, 10(%rdi) 2250; AVX1-NEXT: testb $64, %al 2251; AVX1-NEXT: je LBB14_14 2252; AVX1-NEXT: LBB14_13: ## %cond.store11 2253; AVX1-NEXT: vpextrw $6, %xmm1, 12(%rdi) 2254; AVX1-NEXT: testb %al, %al 2255; AVX1-NEXT: js LBB14_15 2256; AVX1-NEXT: jmp LBB14_16 2257; AVX1-NEXT: LBB14_17: ## %cond.store15 2258; AVX1-NEXT: vpextrw $0, %xmm0, 16(%rdi) 2259; AVX1-NEXT: testl $512, %eax ## imm = 0x200 2260; AVX1-NEXT: je LBB14_20 2261; AVX1-NEXT: LBB14_19: ## %cond.store17 2262; AVX1-NEXT: vpextrw $1, %xmm0, 18(%rdi) 2263; AVX1-NEXT: testl $1024, %eax ## imm = 0x400 2264; AVX1-NEXT: je LBB14_22 2265; AVX1-NEXT: LBB14_21: ## %cond.store19 2266; AVX1-NEXT: vpextrw $2, %xmm0, 20(%rdi) 2267; AVX1-NEXT: testl $2048, %eax ## imm = 0x800 2268; AVX1-NEXT: je LBB14_24 2269; AVX1-NEXT: LBB14_23: ## %cond.store21 2270; AVX1-NEXT: vpextrw $3, %xmm0, 22(%rdi) 2271; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 2272; AVX1-NEXT: je LBB14_26 2273; AVX1-NEXT: LBB14_25: ## %cond.store23 2274; AVX1-NEXT: vpextrw $4, %xmm0, 24(%rdi) 2275; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 2276; AVX1-NEXT: je LBB14_28 2277; AVX1-NEXT: LBB14_27: ## %cond.store25 2278; AVX1-NEXT: vpextrw $5, %xmm0, 26(%rdi) 2279; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 2280; AVX1-NEXT: je LBB14_30 2281; AVX1-NEXT: LBB14_29: ## %cond.store27 2282; AVX1-NEXT: vpextrw $6, %xmm0, 28(%rdi) 2283; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000 2284; AVX1-NEXT: je LBB14_32 2285; AVX1-NEXT: LBB14_31: ## %cond.store29 2286; AVX1-NEXT: vpextrw $7, %xmm0, 30(%rdi) 2287; AVX1-NEXT: vzeroupper 2288; AVX1-NEXT: retq 2289; 2290; AVX2-LABEL: store_v16i16_v16i16: 2291; AVX2: ## %bb.0: 2292; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2293; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 2294; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2295; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 2296; AVX2-NEXT: vpmovmskb %xmm0, %eax 2297; AVX2-NEXT: testb $1, %al 2298; AVX2-NEXT: jne LBB14_1 2299; AVX2-NEXT: ## %bb.2: ## %else 2300; AVX2-NEXT: testb $2, %al 2301; AVX2-NEXT: jne LBB14_3 2302; AVX2-NEXT: LBB14_4: ## %else2 2303; AVX2-NEXT: testb $4, %al 2304; AVX2-NEXT: jne LBB14_5 2305; AVX2-NEXT: LBB14_6: ## %else4 2306; AVX2-NEXT: testb $8, %al 2307; AVX2-NEXT: jne LBB14_7 2308; AVX2-NEXT: LBB14_8: ## %else6 2309; AVX2-NEXT: testb $16, %al 2310; AVX2-NEXT: jne LBB14_9 2311; AVX2-NEXT: LBB14_10: ## %else8 2312; AVX2-NEXT: testb $32, %al 2313; AVX2-NEXT: jne LBB14_11 2314; AVX2-NEXT: LBB14_12: ## %else10 2315; AVX2-NEXT: testb $64, %al 2316; AVX2-NEXT: jne LBB14_13 2317; AVX2-NEXT: LBB14_14: ## %else12 2318; AVX2-NEXT: testb %al, %al 2319; AVX2-NEXT: jns LBB14_16 2320; AVX2-NEXT: LBB14_15: ## %cond.store13 2321; AVX2-NEXT: vpextrw $7, %xmm1, 14(%rdi) 2322; AVX2-NEXT: LBB14_16: ## %else14 2323; AVX2-NEXT: testl $256, %eax ## imm = 0x100 2324; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 2325; AVX2-NEXT: jne LBB14_17 2326; AVX2-NEXT: ## %bb.18: ## %else16 2327; AVX2-NEXT: testl $512, %eax ## imm = 0x200 2328; AVX2-NEXT: jne LBB14_19 2329; AVX2-NEXT: LBB14_20: ## %else18 2330; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 2331; AVX2-NEXT: jne LBB14_21 2332; AVX2-NEXT: LBB14_22: ## %else20 2333; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 2334; AVX2-NEXT: jne LBB14_23 2335; AVX2-NEXT: LBB14_24: ## %else22 2336; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 2337; AVX2-NEXT: jne LBB14_25 2338; AVX2-NEXT: LBB14_26: ## %else24 2339; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 2340; AVX2-NEXT: jne LBB14_27 2341; AVX2-NEXT: LBB14_28: ## %else26 2342; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 2343; AVX2-NEXT: jne LBB14_29 2344; AVX2-NEXT: LBB14_30: ## %else28 2345; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000 2346; AVX2-NEXT: jne LBB14_31 2347; AVX2-NEXT: LBB14_32: ## %else30 2348; AVX2-NEXT: vzeroupper 2349; AVX2-NEXT: retq 2350; AVX2-NEXT: LBB14_1: ## %cond.store 2351; AVX2-NEXT: vpextrw $0, %xmm1, (%rdi) 2352; AVX2-NEXT: testb $2, %al 2353; AVX2-NEXT: je LBB14_4 2354; AVX2-NEXT: LBB14_3: ## %cond.store1 2355; AVX2-NEXT: vpextrw $1, %xmm1, 2(%rdi) 2356; AVX2-NEXT: testb $4, %al 2357; AVX2-NEXT: je LBB14_6 2358; AVX2-NEXT: LBB14_5: ## %cond.store3 2359; AVX2-NEXT: vpextrw $2, %xmm1, 4(%rdi) 2360; AVX2-NEXT: testb $8, %al 2361; AVX2-NEXT: je LBB14_8 2362; AVX2-NEXT: LBB14_7: ## %cond.store5 2363; AVX2-NEXT: vpextrw $3, %xmm1, 6(%rdi) 2364; AVX2-NEXT: testb $16, %al 2365; AVX2-NEXT: je LBB14_10 2366; AVX2-NEXT: LBB14_9: ## %cond.store7 2367; AVX2-NEXT: vpextrw $4, %xmm1, 8(%rdi) 2368; AVX2-NEXT: testb $32, %al 2369; AVX2-NEXT: je LBB14_12 2370; AVX2-NEXT: LBB14_11: ## %cond.store9 2371; AVX2-NEXT: vpextrw $5, %xmm1, 10(%rdi) 2372; AVX2-NEXT: testb $64, %al 2373; AVX2-NEXT: je LBB14_14 2374; AVX2-NEXT: LBB14_13: ## %cond.store11 2375; AVX2-NEXT: vpextrw $6, %xmm1, 12(%rdi) 2376; AVX2-NEXT: testb %al, %al 2377; AVX2-NEXT: js LBB14_15 2378; AVX2-NEXT: jmp LBB14_16 2379; AVX2-NEXT: LBB14_17: ## %cond.store15 2380; AVX2-NEXT: vpextrw $0, %xmm0, 16(%rdi) 2381; AVX2-NEXT: testl $512, %eax ## imm = 0x200 2382; AVX2-NEXT: je LBB14_20 2383; AVX2-NEXT: LBB14_19: ## %cond.store17 2384; AVX2-NEXT: vpextrw $1, %xmm0, 18(%rdi) 2385; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 2386; AVX2-NEXT: je LBB14_22 2387; AVX2-NEXT: LBB14_21: ## %cond.store19 2388; AVX2-NEXT: vpextrw $2, %xmm0, 20(%rdi) 2389; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 2390; AVX2-NEXT: je LBB14_24 2391; AVX2-NEXT: LBB14_23: ## %cond.store21 2392; AVX2-NEXT: vpextrw $3, %xmm0, 22(%rdi) 2393; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 2394; AVX2-NEXT: je LBB14_26 2395; AVX2-NEXT: LBB14_25: ## %cond.store23 2396; AVX2-NEXT: vpextrw $4, %xmm0, 24(%rdi) 2397; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 2398; AVX2-NEXT: je LBB14_28 2399; AVX2-NEXT: LBB14_27: ## %cond.store25 2400; AVX2-NEXT: vpextrw $5, %xmm0, 26(%rdi) 2401; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 2402; AVX2-NEXT: je LBB14_30 2403; AVX2-NEXT: LBB14_29: ## %cond.store27 2404; AVX2-NEXT: vpextrw $6, %xmm0, 28(%rdi) 2405; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000 2406; AVX2-NEXT: je LBB14_32 2407; AVX2-NEXT: LBB14_31: ## %cond.store29 2408; AVX2-NEXT: vpextrw $7, %xmm0, 30(%rdi) 2409; AVX2-NEXT: vzeroupper 2410; AVX2-NEXT: retq 2411; 2412; AVX512F-LABEL: store_v16i16_v16i16: 2413; AVX512F: ## %bb.0: 2414; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 2415; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 2416; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 2417; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 2418; AVX512F-NEXT: kmovw %k0, %eax 2419; AVX512F-NEXT: testb $1, %al 2420; AVX512F-NEXT: jne LBB14_1 2421; AVX512F-NEXT: ## %bb.2: ## %else 2422; AVX512F-NEXT: testb $2, %al 2423; AVX512F-NEXT: jne LBB14_3 2424; AVX512F-NEXT: LBB14_4: ## %else2 2425; AVX512F-NEXT: testb $4, %al 2426; AVX512F-NEXT: jne LBB14_5 2427; AVX512F-NEXT: LBB14_6: ## %else4 2428; AVX512F-NEXT: testb $8, %al 2429; AVX512F-NEXT: jne LBB14_7 2430; AVX512F-NEXT: LBB14_8: ## %else6 2431; AVX512F-NEXT: testb $16, %al 2432; AVX512F-NEXT: jne LBB14_9 2433; AVX512F-NEXT: LBB14_10: ## %else8 2434; AVX512F-NEXT: testb $32, %al 2435; AVX512F-NEXT: jne LBB14_11 2436; AVX512F-NEXT: LBB14_12: ## %else10 2437; AVX512F-NEXT: testb $64, %al 2438; AVX512F-NEXT: jne LBB14_13 2439; AVX512F-NEXT: LBB14_14: ## %else12 2440; AVX512F-NEXT: testb %al, %al 2441; AVX512F-NEXT: jns LBB14_16 2442; AVX512F-NEXT: LBB14_15: ## %cond.store13 2443; AVX512F-NEXT: vpextrw $7, %xmm1, 14(%rdi) 2444; AVX512F-NEXT: LBB14_16: ## %else14 2445; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 2446; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 2447; AVX512F-NEXT: jne LBB14_17 2448; AVX512F-NEXT: ## %bb.18: ## %else16 2449; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 2450; AVX512F-NEXT: jne LBB14_19 2451; AVX512F-NEXT: LBB14_20: ## %else18 2452; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 2453; AVX512F-NEXT: jne LBB14_21 2454; AVX512F-NEXT: LBB14_22: ## %else20 2455; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800 2456; AVX512F-NEXT: jne LBB14_23 2457; AVX512F-NEXT: LBB14_24: ## %else22 2458; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 2459; AVX512F-NEXT: jne LBB14_25 2460; AVX512F-NEXT: LBB14_26: ## %else24 2461; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 2462; AVX512F-NEXT: jne LBB14_27 2463; AVX512F-NEXT: LBB14_28: ## %else26 2464; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 2465; AVX512F-NEXT: jne LBB14_29 2466; AVX512F-NEXT: LBB14_30: ## %else28 2467; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000 2468; AVX512F-NEXT: jne LBB14_31 2469; AVX512F-NEXT: LBB14_32: ## %else30 2470; AVX512F-NEXT: vzeroupper 2471; AVX512F-NEXT: retq 2472; AVX512F-NEXT: LBB14_1: ## %cond.store 2473; AVX512F-NEXT: vpextrw $0, %xmm1, (%rdi) 2474; AVX512F-NEXT: testb $2, %al 2475; AVX512F-NEXT: je LBB14_4 2476; AVX512F-NEXT: LBB14_3: ## %cond.store1 2477; AVX512F-NEXT: vpextrw $1, %xmm1, 2(%rdi) 2478; AVX512F-NEXT: testb $4, %al 2479; AVX512F-NEXT: je LBB14_6 2480; AVX512F-NEXT: LBB14_5: ## %cond.store3 2481; AVX512F-NEXT: vpextrw $2, %xmm1, 4(%rdi) 2482; AVX512F-NEXT: testb $8, %al 2483; AVX512F-NEXT: je LBB14_8 2484; AVX512F-NEXT: LBB14_7: ## %cond.store5 2485; AVX512F-NEXT: vpextrw $3, %xmm1, 6(%rdi) 2486; AVX512F-NEXT: testb $16, %al 2487; AVX512F-NEXT: je LBB14_10 2488; AVX512F-NEXT: LBB14_9: ## %cond.store7 2489; AVX512F-NEXT: vpextrw $4, %xmm1, 8(%rdi) 2490; AVX512F-NEXT: testb $32, %al 2491; AVX512F-NEXT: je LBB14_12 2492; AVX512F-NEXT: LBB14_11: ## %cond.store9 2493; AVX512F-NEXT: vpextrw $5, %xmm1, 10(%rdi) 2494; AVX512F-NEXT: testb $64, %al 2495; AVX512F-NEXT: je LBB14_14 2496; AVX512F-NEXT: LBB14_13: ## %cond.store11 2497; AVX512F-NEXT: vpextrw $6, %xmm1, 12(%rdi) 2498; AVX512F-NEXT: testb %al, %al 2499; AVX512F-NEXT: js LBB14_15 2500; AVX512F-NEXT: jmp LBB14_16 2501; AVX512F-NEXT: LBB14_17: ## %cond.store15 2502; AVX512F-NEXT: vpextrw $0, %xmm0, 16(%rdi) 2503; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 2504; AVX512F-NEXT: je LBB14_20 2505; AVX512F-NEXT: LBB14_19: ## %cond.store17 2506; AVX512F-NEXT: vpextrw $1, %xmm0, 18(%rdi) 2507; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 2508; AVX512F-NEXT: je LBB14_22 2509; AVX512F-NEXT: LBB14_21: ## %cond.store19 2510; AVX512F-NEXT: vpextrw $2, %xmm0, 20(%rdi) 2511; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800 2512; AVX512F-NEXT: je LBB14_24 2513; AVX512F-NEXT: LBB14_23: ## %cond.store21 2514; AVX512F-NEXT: vpextrw $3, %xmm0, 22(%rdi) 2515; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 2516; AVX512F-NEXT: je LBB14_26 2517; AVX512F-NEXT: LBB14_25: ## %cond.store23 2518; AVX512F-NEXT: vpextrw $4, %xmm0, 24(%rdi) 2519; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 2520; AVX512F-NEXT: je LBB14_28 2521; AVX512F-NEXT: LBB14_27: ## %cond.store25 2522; AVX512F-NEXT: vpextrw $5, %xmm0, 26(%rdi) 2523; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 2524; AVX512F-NEXT: je LBB14_30 2525; AVX512F-NEXT: LBB14_29: ## %cond.store27 2526; AVX512F-NEXT: vpextrw $6, %xmm0, 28(%rdi) 2527; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000 2528; AVX512F-NEXT: je LBB14_32 2529; AVX512F-NEXT: LBB14_31: ## %cond.store29 2530; AVX512F-NEXT: vpextrw $7, %xmm0, 30(%rdi) 2531; AVX512F-NEXT: vzeroupper 2532; AVX512F-NEXT: retq 2533; 2534; AVX512VLDQ-LABEL: store_v16i16_v16i16: 2535; AVX512VLDQ: ## %bb.0: 2536; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 2537; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 2538; AVX512VLDQ-NEXT: vpmovsxwd %ymm0, %zmm0 2539; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0 2540; AVX512VLDQ-NEXT: kmovw %k0, %eax 2541; AVX512VLDQ-NEXT: testb $1, %al 2542; AVX512VLDQ-NEXT: jne LBB14_1 2543; AVX512VLDQ-NEXT: ## %bb.2: ## %else 2544; AVX512VLDQ-NEXT: testb $2, %al 2545; AVX512VLDQ-NEXT: jne LBB14_3 2546; AVX512VLDQ-NEXT: LBB14_4: ## %else2 2547; AVX512VLDQ-NEXT: testb $4, %al 2548; AVX512VLDQ-NEXT: jne LBB14_5 2549; AVX512VLDQ-NEXT: LBB14_6: ## %else4 2550; AVX512VLDQ-NEXT: testb $8, %al 2551; AVX512VLDQ-NEXT: jne LBB14_7 2552; AVX512VLDQ-NEXT: LBB14_8: ## %else6 2553; AVX512VLDQ-NEXT: testb $16, %al 2554; AVX512VLDQ-NEXT: jne LBB14_9 2555; AVX512VLDQ-NEXT: LBB14_10: ## %else8 2556; AVX512VLDQ-NEXT: testb $32, %al 2557; AVX512VLDQ-NEXT: jne LBB14_11 2558; AVX512VLDQ-NEXT: LBB14_12: ## %else10 2559; AVX512VLDQ-NEXT: testb $64, %al 2560; AVX512VLDQ-NEXT: jne LBB14_13 2561; AVX512VLDQ-NEXT: LBB14_14: ## %else12 2562; AVX512VLDQ-NEXT: testb %al, %al 2563; AVX512VLDQ-NEXT: jns LBB14_16 2564; AVX512VLDQ-NEXT: LBB14_15: ## %cond.store13 2565; AVX512VLDQ-NEXT: vpextrw $7, %xmm1, 14(%rdi) 2566; AVX512VLDQ-NEXT: LBB14_16: ## %else14 2567; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100 2568; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 2569; AVX512VLDQ-NEXT: jne LBB14_17 2570; AVX512VLDQ-NEXT: ## %bb.18: ## %else16 2571; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200 2572; AVX512VLDQ-NEXT: jne LBB14_19 2573; AVX512VLDQ-NEXT: LBB14_20: ## %else18 2574; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400 2575; AVX512VLDQ-NEXT: jne LBB14_21 2576; AVX512VLDQ-NEXT: LBB14_22: ## %else20 2577; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800 2578; AVX512VLDQ-NEXT: jne LBB14_23 2579; AVX512VLDQ-NEXT: LBB14_24: ## %else22 2580; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000 2581; AVX512VLDQ-NEXT: jne LBB14_25 2582; AVX512VLDQ-NEXT: LBB14_26: ## %else24 2583; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000 2584; AVX512VLDQ-NEXT: jne LBB14_27 2585; AVX512VLDQ-NEXT: LBB14_28: ## %else26 2586; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000 2587; AVX512VLDQ-NEXT: jne LBB14_29 2588; AVX512VLDQ-NEXT: LBB14_30: ## %else28 2589; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000 2590; AVX512VLDQ-NEXT: jne LBB14_31 2591; AVX512VLDQ-NEXT: LBB14_32: ## %else30 2592; AVX512VLDQ-NEXT: vzeroupper 2593; AVX512VLDQ-NEXT: retq 2594; AVX512VLDQ-NEXT: LBB14_1: ## %cond.store 2595; AVX512VLDQ-NEXT: vpextrw $0, %xmm1, (%rdi) 2596; AVX512VLDQ-NEXT: testb $2, %al 2597; AVX512VLDQ-NEXT: je LBB14_4 2598; AVX512VLDQ-NEXT: LBB14_3: ## %cond.store1 2599; AVX512VLDQ-NEXT: vpextrw $1, %xmm1, 2(%rdi) 2600; AVX512VLDQ-NEXT: testb $4, %al 2601; AVX512VLDQ-NEXT: je LBB14_6 2602; AVX512VLDQ-NEXT: LBB14_5: ## %cond.store3 2603; AVX512VLDQ-NEXT: vpextrw $2, %xmm1, 4(%rdi) 2604; AVX512VLDQ-NEXT: testb $8, %al 2605; AVX512VLDQ-NEXT: je LBB14_8 2606; AVX512VLDQ-NEXT: LBB14_7: ## %cond.store5 2607; AVX512VLDQ-NEXT: vpextrw $3, %xmm1, 6(%rdi) 2608; AVX512VLDQ-NEXT: testb $16, %al 2609; AVX512VLDQ-NEXT: je LBB14_10 2610; AVX512VLDQ-NEXT: LBB14_9: ## %cond.store7 2611; AVX512VLDQ-NEXT: vpextrw $4, %xmm1, 8(%rdi) 2612; AVX512VLDQ-NEXT: testb $32, %al 2613; AVX512VLDQ-NEXT: je LBB14_12 2614; AVX512VLDQ-NEXT: LBB14_11: ## %cond.store9 2615; AVX512VLDQ-NEXT: vpextrw $5, %xmm1, 10(%rdi) 2616; AVX512VLDQ-NEXT: testb $64, %al 2617; AVX512VLDQ-NEXT: je LBB14_14 2618; AVX512VLDQ-NEXT: LBB14_13: ## %cond.store11 2619; AVX512VLDQ-NEXT: vpextrw $6, %xmm1, 12(%rdi) 2620; AVX512VLDQ-NEXT: testb %al, %al 2621; AVX512VLDQ-NEXT: js LBB14_15 2622; AVX512VLDQ-NEXT: jmp LBB14_16 2623; AVX512VLDQ-NEXT: LBB14_17: ## %cond.store15 2624; AVX512VLDQ-NEXT: vpextrw $0, %xmm0, 16(%rdi) 2625; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200 2626; AVX512VLDQ-NEXT: je LBB14_20 2627; AVX512VLDQ-NEXT: LBB14_19: ## %cond.store17 2628; AVX512VLDQ-NEXT: vpextrw $1, %xmm0, 18(%rdi) 2629; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400 2630; AVX512VLDQ-NEXT: je LBB14_22 2631; AVX512VLDQ-NEXT: LBB14_21: ## %cond.store19 2632; AVX512VLDQ-NEXT: vpextrw $2, %xmm0, 20(%rdi) 2633; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800 2634; AVX512VLDQ-NEXT: je LBB14_24 2635; AVX512VLDQ-NEXT: LBB14_23: ## %cond.store21 2636; AVX512VLDQ-NEXT: vpextrw $3, %xmm0, 22(%rdi) 2637; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000 2638; AVX512VLDQ-NEXT: je LBB14_26 2639; AVX512VLDQ-NEXT: LBB14_25: ## %cond.store23 2640; AVX512VLDQ-NEXT: vpextrw $4, %xmm0, 24(%rdi) 2641; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000 2642; AVX512VLDQ-NEXT: je LBB14_28 2643; AVX512VLDQ-NEXT: LBB14_27: ## %cond.store25 2644; AVX512VLDQ-NEXT: vpextrw $5, %xmm0, 26(%rdi) 2645; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000 2646; AVX512VLDQ-NEXT: je LBB14_30 2647; AVX512VLDQ-NEXT: LBB14_29: ## %cond.store27 2648; AVX512VLDQ-NEXT: vpextrw $6, %xmm0, 28(%rdi) 2649; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000 2650; AVX512VLDQ-NEXT: je LBB14_32 2651; AVX512VLDQ-NEXT: LBB14_31: ## %cond.store29 2652; AVX512VLDQ-NEXT: vpextrw $7, %xmm0, 30(%rdi) 2653; AVX512VLDQ-NEXT: vzeroupper 2654; AVX512VLDQ-NEXT: retq 2655; 2656; AVX512VLBW-LABEL: store_v16i16_v16i16: 2657; AVX512VLBW: ## %bb.0: 2658; AVX512VLBW-NEXT: vptestnmw %ymm0, %ymm0, %k1 2659; AVX512VLBW-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1} 2660; AVX512VLBW-NEXT: vzeroupper 2661; AVX512VLBW-NEXT: retq 2662; 2663; X86-AVX512-LABEL: store_v16i16_v16i16: 2664; X86-AVX512: ## %bb.0: 2665; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 2666; X86-AVX512-NEXT: vptestnmw %ymm0, %ymm0, %k1 2667; X86-AVX512-NEXT: vmovdqu16 %ymm1, (%eax) {%k1} 2668; X86-AVX512-NEXT: vzeroupper 2669; X86-AVX512-NEXT: retl 2670 %mask = icmp eq <16 x i16> %trigger, zeroinitializer 2671 call void @llvm.masked.store.v16i16.p0(<16 x i16> %val, ptr %addr, i32 4, <16 x i1> %mask) 2672 ret void 2673} 2674 2675; 2676; vXi8 2677; 2678 2679define void @store_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, <16 x i8> %val) nounwind { 2680; SSE2-LABEL: store_v16i8_v16i8: 2681; SSE2: ## %bb.0: 2682; SSE2-NEXT: pxor %xmm2, %xmm2 2683; SSE2-NEXT: pcmpeqb %xmm0, %xmm2 2684; SSE2-NEXT: pmovmskb %xmm2, %eax 2685; SSE2-NEXT: testb $1, %al 2686; SSE2-NEXT: movd %xmm1, %ecx 2687; SSE2-NEXT: jne LBB15_1 2688; SSE2-NEXT: ## %bb.2: ## %else 2689; SSE2-NEXT: testb $2, %al 2690; SSE2-NEXT: jne LBB15_3 2691; SSE2-NEXT: LBB15_4: ## %else2 2692; SSE2-NEXT: testb $4, %al 2693; SSE2-NEXT: jne LBB15_5 2694; SSE2-NEXT: LBB15_6: ## %else4 2695; SSE2-NEXT: testb $8, %al 2696; SSE2-NEXT: je LBB15_8 2697; SSE2-NEXT: LBB15_7: ## %cond.store5 2698; SSE2-NEXT: shrl $24, %ecx 2699; SSE2-NEXT: movb %cl, 3(%rdi) 2700; SSE2-NEXT: LBB15_8: ## %else6 2701; SSE2-NEXT: testb $16, %al 2702; SSE2-NEXT: pextrw $2, %xmm1, %ecx 2703; SSE2-NEXT: je LBB15_10 2704; SSE2-NEXT: ## %bb.9: ## %cond.store7 2705; SSE2-NEXT: movb %cl, 4(%rdi) 2706; SSE2-NEXT: LBB15_10: ## %else8 2707; SSE2-NEXT: testb $32, %al 2708; SSE2-NEXT: je LBB15_12 2709; SSE2-NEXT: ## %bb.11: ## %cond.store9 2710; SSE2-NEXT: movb %ch, 5(%rdi) 2711; SSE2-NEXT: LBB15_12: ## %else10 2712; SSE2-NEXT: testb $64, %al 2713; SSE2-NEXT: pextrw $3, %xmm1, %ecx 2714; SSE2-NEXT: je LBB15_14 2715; SSE2-NEXT: ## %bb.13: ## %cond.store11 2716; SSE2-NEXT: movb %cl, 6(%rdi) 2717; SSE2-NEXT: LBB15_14: ## %else12 2718; SSE2-NEXT: testb %al, %al 2719; SSE2-NEXT: jns LBB15_16 2720; SSE2-NEXT: ## %bb.15: ## %cond.store13 2721; SSE2-NEXT: movb %ch, 7(%rdi) 2722; SSE2-NEXT: LBB15_16: ## %else14 2723; SSE2-NEXT: testl $256, %eax ## imm = 0x100 2724; SSE2-NEXT: pextrw $4, %xmm1, %ecx 2725; SSE2-NEXT: je LBB15_18 2726; SSE2-NEXT: ## %bb.17: ## %cond.store15 2727; SSE2-NEXT: movb %cl, 8(%rdi) 2728; SSE2-NEXT: LBB15_18: ## %else16 2729; SSE2-NEXT: testl $512, %eax ## imm = 0x200 2730; SSE2-NEXT: je LBB15_20 2731; SSE2-NEXT: ## %bb.19: ## %cond.store17 2732; SSE2-NEXT: movb %ch, 9(%rdi) 2733; SSE2-NEXT: LBB15_20: ## %else18 2734; SSE2-NEXT: testl $1024, %eax ## imm = 0x400 2735; SSE2-NEXT: pextrw $5, %xmm1, %ecx 2736; SSE2-NEXT: je LBB15_22 2737; SSE2-NEXT: ## %bb.21: ## %cond.store19 2738; SSE2-NEXT: movb %cl, 10(%rdi) 2739; SSE2-NEXT: LBB15_22: ## %else20 2740; SSE2-NEXT: testl $2048, %eax ## imm = 0x800 2741; SSE2-NEXT: je LBB15_24 2742; SSE2-NEXT: ## %bb.23: ## %cond.store21 2743; SSE2-NEXT: movb %ch, 11(%rdi) 2744; SSE2-NEXT: LBB15_24: ## %else22 2745; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 2746; SSE2-NEXT: pextrw $6, %xmm1, %ecx 2747; SSE2-NEXT: je LBB15_26 2748; SSE2-NEXT: ## %bb.25: ## %cond.store23 2749; SSE2-NEXT: movb %cl, 12(%rdi) 2750; SSE2-NEXT: LBB15_26: ## %else24 2751; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000 2752; SSE2-NEXT: je LBB15_28 2753; SSE2-NEXT: ## %bb.27: ## %cond.store25 2754; SSE2-NEXT: movb %ch, 13(%rdi) 2755; SSE2-NEXT: LBB15_28: ## %else26 2756; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000 2757; SSE2-NEXT: pextrw $7, %xmm1, %ecx 2758; SSE2-NEXT: jne LBB15_29 2759; SSE2-NEXT: ## %bb.30: ## %else28 2760; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000 2761; SSE2-NEXT: jne LBB15_31 2762; SSE2-NEXT: LBB15_32: ## %else30 2763; SSE2-NEXT: retq 2764; SSE2-NEXT: LBB15_1: ## %cond.store 2765; SSE2-NEXT: movb %cl, (%rdi) 2766; SSE2-NEXT: testb $2, %al 2767; SSE2-NEXT: je LBB15_4 2768; SSE2-NEXT: LBB15_3: ## %cond.store1 2769; SSE2-NEXT: movb %ch, 1(%rdi) 2770; SSE2-NEXT: testb $4, %al 2771; SSE2-NEXT: je LBB15_6 2772; SSE2-NEXT: LBB15_5: ## %cond.store3 2773; SSE2-NEXT: movl %ecx, %edx 2774; SSE2-NEXT: shrl $16, %edx 2775; SSE2-NEXT: movb %dl, 2(%rdi) 2776; SSE2-NEXT: testb $8, %al 2777; SSE2-NEXT: jne LBB15_7 2778; SSE2-NEXT: jmp LBB15_8 2779; SSE2-NEXT: LBB15_29: ## %cond.store27 2780; SSE2-NEXT: movb %cl, 14(%rdi) 2781; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000 2782; SSE2-NEXT: je LBB15_32 2783; SSE2-NEXT: LBB15_31: ## %cond.store29 2784; SSE2-NEXT: movb %ch, 15(%rdi) 2785; SSE2-NEXT: retq 2786; 2787; SSE4-LABEL: store_v16i8_v16i8: 2788; SSE4: ## %bb.0: 2789; SSE4-NEXT: pxor %xmm2, %xmm2 2790; SSE4-NEXT: pcmpeqb %xmm0, %xmm2 2791; SSE4-NEXT: pmovmskb %xmm2, %eax 2792; SSE4-NEXT: testb $1, %al 2793; SSE4-NEXT: jne LBB15_1 2794; SSE4-NEXT: ## %bb.2: ## %else 2795; SSE4-NEXT: testb $2, %al 2796; SSE4-NEXT: jne LBB15_3 2797; SSE4-NEXT: LBB15_4: ## %else2 2798; SSE4-NEXT: testb $4, %al 2799; SSE4-NEXT: jne LBB15_5 2800; SSE4-NEXT: LBB15_6: ## %else4 2801; SSE4-NEXT: testb $8, %al 2802; SSE4-NEXT: jne LBB15_7 2803; SSE4-NEXT: LBB15_8: ## %else6 2804; SSE4-NEXT: testb $16, %al 2805; SSE4-NEXT: jne LBB15_9 2806; SSE4-NEXT: LBB15_10: ## %else8 2807; SSE4-NEXT: testb $32, %al 2808; SSE4-NEXT: jne LBB15_11 2809; SSE4-NEXT: LBB15_12: ## %else10 2810; SSE4-NEXT: testb $64, %al 2811; SSE4-NEXT: jne LBB15_13 2812; SSE4-NEXT: LBB15_14: ## %else12 2813; SSE4-NEXT: testb %al, %al 2814; SSE4-NEXT: js LBB15_15 2815; SSE4-NEXT: LBB15_16: ## %else14 2816; SSE4-NEXT: testl $256, %eax ## imm = 0x100 2817; SSE4-NEXT: jne LBB15_17 2818; SSE4-NEXT: LBB15_18: ## %else16 2819; SSE4-NEXT: testl $512, %eax ## imm = 0x200 2820; SSE4-NEXT: jne LBB15_19 2821; SSE4-NEXT: LBB15_20: ## %else18 2822; SSE4-NEXT: testl $1024, %eax ## imm = 0x400 2823; SSE4-NEXT: jne LBB15_21 2824; SSE4-NEXT: LBB15_22: ## %else20 2825; SSE4-NEXT: testl $2048, %eax ## imm = 0x800 2826; SSE4-NEXT: jne LBB15_23 2827; SSE4-NEXT: LBB15_24: ## %else22 2828; SSE4-NEXT: testl $4096, %eax ## imm = 0x1000 2829; SSE4-NEXT: jne LBB15_25 2830; SSE4-NEXT: LBB15_26: ## %else24 2831; SSE4-NEXT: testl $8192, %eax ## imm = 0x2000 2832; SSE4-NEXT: jne LBB15_27 2833; SSE4-NEXT: LBB15_28: ## %else26 2834; SSE4-NEXT: testl $16384, %eax ## imm = 0x4000 2835; SSE4-NEXT: jne LBB15_29 2836; SSE4-NEXT: LBB15_30: ## %else28 2837; SSE4-NEXT: testl $32768, %eax ## imm = 0x8000 2838; SSE4-NEXT: jne LBB15_31 2839; SSE4-NEXT: LBB15_32: ## %else30 2840; SSE4-NEXT: retq 2841; SSE4-NEXT: LBB15_1: ## %cond.store 2842; SSE4-NEXT: pextrb $0, %xmm1, (%rdi) 2843; SSE4-NEXT: testb $2, %al 2844; SSE4-NEXT: je LBB15_4 2845; SSE4-NEXT: LBB15_3: ## %cond.store1 2846; SSE4-NEXT: pextrb $1, %xmm1, 1(%rdi) 2847; SSE4-NEXT: testb $4, %al 2848; SSE4-NEXT: je LBB15_6 2849; SSE4-NEXT: LBB15_5: ## %cond.store3 2850; SSE4-NEXT: pextrb $2, %xmm1, 2(%rdi) 2851; SSE4-NEXT: testb $8, %al 2852; SSE4-NEXT: je LBB15_8 2853; SSE4-NEXT: LBB15_7: ## %cond.store5 2854; SSE4-NEXT: pextrb $3, %xmm1, 3(%rdi) 2855; SSE4-NEXT: testb $16, %al 2856; SSE4-NEXT: je LBB15_10 2857; SSE4-NEXT: LBB15_9: ## %cond.store7 2858; SSE4-NEXT: pextrb $4, %xmm1, 4(%rdi) 2859; SSE4-NEXT: testb $32, %al 2860; SSE4-NEXT: je LBB15_12 2861; SSE4-NEXT: LBB15_11: ## %cond.store9 2862; SSE4-NEXT: pextrb $5, %xmm1, 5(%rdi) 2863; SSE4-NEXT: testb $64, %al 2864; SSE4-NEXT: je LBB15_14 2865; SSE4-NEXT: LBB15_13: ## %cond.store11 2866; SSE4-NEXT: pextrb $6, %xmm1, 6(%rdi) 2867; SSE4-NEXT: testb %al, %al 2868; SSE4-NEXT: jns LBB15_16 2869; SSE4-NEXT: LBB15_15: ## %cond.store13 2870; SSE4-NEXT: pextrb $7, %xmm1, 7(%rdi) 2871; SSE4-NEXT: testl $256, %eax ## imm = 0x100 2872; SSE4-NEXT: je LBB15_18 2873; SSE4-NEXT: LBB15_17: ## %cond.store15 2874; SSE4-NEXT: pextrb $8, %xmm1, 8(%rdi) 2875; SSE4-NEXT: testl $512, %eax ## imm = 0x200 2876; SSE4-NEXT: je LBB15_20 2877; SSE4-NEXT: LBB15_19: ## %cond.store17 2878; SSE4-NEXT: pextrb $9, %xmm1, 9(%rdi) 2879; SSE4-NEXT: testl $1024, %eax ## imm = 0x400 2880; SSE4-NEXT: je LBB15_22 2881; SSE4-NEXT: LBB15_21: ## %cond.store19 2882; SSE4-NEXT: pextrb $10, %xmm1, 10(%rdi) 2883; SSE4-NEXT: testl $2048, %eax ## imm = 0x800 2884; SSE4-NEXT: je LBB15_24 2885; SSE4-NEXT: LBB15_23: ## %cond.store21 2886; SSE4-NEXT: pextrb $11, %xmm1, 11(%rdi) 2887; SSE4-NEXT: testl $4096, %eax ## imm = 0x1000 2888; SSE4-NEXT: je LBB15_26 2889; SSE4-NEXT: LBB15_25: ## %cond.store23 2890; SSE4-NEXT: pextrb $12, %xmm1, 12(%rdi) 2891; SSE4-NEXT: testl $8192, %eax ## imm = 0x2000 2892; SSE4-NEXT: je LBB15_28 2893; SSE4-NEXT: LBB15_27: ## %cond.store25 2894; SSE4-NEXT: pextrb $13, %xmm1, 13(%rdi) 2895; SSE4-NEXT: testl $16384, %eax ## imm = 0x4000 2896; SSE4-NEXT: je LBB15_30 2897; SSE4-NEXT: LBB15_29: ## %cond.store27 2898; SSE4-NEXT: pextrb $14, %xmm1, 14(%rdi) 2899; SSE4-NEXT: testl $32768, %eax ## imm = 0x8000 2900; SSE4-NEXT: je LBB15_32 2901; SSE4-NEXT: LBB15_31: ## %cond.store29 2902; SSE4-NEXT: pextrb $15, %xmm1, 15(%rdi) 2903; SSE4-NEXT: retq 2904; 2905; AVX1OR2-LABEL: store_v16i8_v16i8: 2906; AVX1OR2: ## %bb.0: 2907; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2908; AVX1OR2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 2909; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax 2910; AVX1OR2-NEXT: testb $1, %al 2911; AVX1OR2-NEXT: jne LBB15_1 2912; AVX1OR2-NEXT: ## %bb.2: ## %else 2913; AVX1OR2-NEXT: testb $2, %al 2914; AVX1OR2-NEXT: jne LBB15_3 2915; AVX1OR2-NEXT: LBB15_4: ## %else2 2916; AVX1OR2-NEXT: testb $4, %al 2917; AVX1OR2-NEXT: jne LBB15_5 2918; AVX1OR2-NEXT: LBB15_6: ## %else4 2919; AVX1OR2-NEXT: testb $8, %al 2920; AVX1OR2-NEXT: jne LBB15_7 2921; AVX1OR2-NEXT: LBB15_8: ## %else6 2922; AVX1OR2-NEXT: testb $16, %al 2923; AVX1OR2-NEXT: jne LBB15_9 2924; AVX1OR2-NEXT: LBB15_10: ## %else8 2925; AVX1OR2-NEXT: testb $32, %al 2926; AVX1OR2-NEXT: jne LBB15_11 2927; AVX1OR2-NEXT: LBB15_12: ## %else10 2928; AVX1OR2-NEXT: testb $64, %al 2929; AVX1OR2-NEXT: jne LBB15_13 2930; AVX1OR2-NEXT: LBB15_14: ## %else12 2931; AVX1OR2-NEXT: testb %al, %al 2932; AVX1OR2-NEXT: js LBB15_15 2933; AVX1OR2-NEXT: LBB15_16: ## %else14 2934; AVX1OR2-NEXT: testl $256, %eax ## imm = 0x100 2935; AVX1OR2-NEXT: jne LBB15_17 2936; AVX1OR2-NEXT: LBB15_18: ## %else16 2937; AVX1OR2-NEXT: testl $512, %eax ## imm = 0x200 2938; AVX1OR2-NEXT: jne LBB15_19 2939; AVX1OR2-NEXT: LBB15_20: ## %else18 2940; AVX1OR2-NEXT: testl $1024, %eax ## imm = 0x400 2941; AVX1OR2-NEXT: jne LBB15_21 2942; AVX1OR2-NEXT: LBB15_22: ## %else20 2943; AVX1OR2-NEXT: testl $2048, %eax ## imm = 0x800 2944; AVX1OR2-NEXT: jne LBB15_23 2945; AVX1OR2-NEXT: LBB15_24: ## %else22 2946; AVX1OR2-NEXT: testl $4096, %eax ## imm = 0x1000 2947; AVX1OR2-NEXT: jne LBB15_25 2948; AVX1OR2-NEXT: LBB15_26: ## %else24 2949; AVX1OR2-NEXT: testl $8192, %eax ## imm = 0x2000 2950; AVX1OR2-NEXT: jne LBB15_27 2951; AVX1OR2-NEXT: LBB15_28: ## %else26 2952; AVX1OR2-NEXT: testl $16384, %eax ## imm = 0x4000 2953; AVX1OR2-NEXT: jne LBB15_29 2954; AVX1OR2-NEXT: LBB15_30: ## %else28 2955; AVX1OR2-NEXT: testl $32768, %eax ## imm = 0x8000 2956; AVX1OR2-NEXT: jne LBB15_31 2957; AVX1OR2-NEXT: LBB15_32: ## %else30 2958; AVX1OR2-NEXT: retq 2959; AVX1OR2-NEXT: LBB15_1: ## %cond.store 2960; AVX1OR2-NEXT: vpextrb $0, %xmm1, (%rdi) 2961; AVX1OR2-NEXT: testb $2, %al 2962; AVX1OR2-NEXT: je LBB15_4 2963; AVX1OR2-NEXT: LBB15_3: ## %cond.store1 2964; AVX1OR2-NEXT: vpextrb $1, %xmm1, 1(%rdi) 2965; AVX1OR2-NEXT: testb $4, %al 2966; AVX1OR2-NEXT: je LBB15_6 2967; AVX1OR2-NEXT: LBB15_5: ## %cond.store3 2968; AVX1OR2-NEXT: vpextrb $2, %xmm1, 2(%rdi) 2969; AVX1OR2-NEXT: testb $8, %al 2970; AVX1OR2-NEXT: je LBB15_8 2971; AVX1OR2-NEXT: LBB15_7: ## %cond.store5 2972; AVX1OR2-NEXT: vpextrb $3, %xmm1, 3(%rdi) 2973; AVX1OR2-NEXT: testb $16, %al 2974; AVX1OR2-NEXT: je LBB15_10 2975; AVX1OR2-NEXT: LBB15_9: ## %cond.store7 2976; AVX1OR2-NEXT: vpextrb $4, %xmm1, 4(%rdi) 2977; AVX1OR2-NEXT: testb $32, %al 2978; AVX1OR2-NEXT: je LBB15_12 2979; AVX1OR2-NEXT: LBB15_11: ## %cond.store9 2980; AVX1OR2-NEXT: vpextrb $5, %xmm1, 5(%rdi) 2981; AVX1OR2-NEXT: testb $64, %al 2982; AVX1OR2-NEXT: je LBB15_14 2983; AVX1OR2-NEXT: LBB15_13: ## %cond.store11 2984; AVX1OR2-NEXT: vpextrb $6, %xmm1, 6(%rdi) 2985; AVX1OR2-NEXT: testb %al, %al 2986; AVX1OR2-NEXT: jns LBB15_16 2987; AVX1OR2-NEXT: LBB15_15: ## %cond.store13 2988; AVX1OR2-NEXT: vpextrb $7, %xmm1, 7(%rdi) 2989; AVX1OR2-NEXT: testl $256, %eax ## imm = 0x100 2990; AVX1OR2-NEXT: je LBB15_18 2991; AVX1OR2-NEXT: LBB15_17: ## %cond.store15 2992; AVX1OR2-NEXT: vpextrb $8, %xmm1, 8(%rdi) 2993; AVX1OR2-NEXT: testl $512, %eax ## imm = 0x200 2994; AVX1OR2-NEXT: je LBB15_20 2995; AVX1OR2-NEXT: LBB15_19: ## %cond.store17 2996; AVX1OR2-NEXT: vpextrb $9, %xmm1, 9(%rdi) 2997; AVX1OR2-NEXT: testl $1024, %eax ## imm = 0x400 2998; AVX1OR2-NEXT: je LBB15_22 2999; AVX1OR2-NEXT: LBB15_21: ## %cond.store19 3000; AVX1OR2-NEXT: vpextrb $10, %xmm1, 10(%rdi) 3001; AVX1OR2-NEXT: testl $2048, %eax ## imm = 0x800 3002; AVX1OR2-NEXT: je LBB15_24 3003; AVX1OR2-NEXT: LBB15_23: ## %cond.store21 3004; AVX1OR2-NEXT: vpextrb $11, %xmm1, 11(%rdi) 3005; AVX1OR2-NEXT: testl $4096, %eax ## imm = 0x1000 3006; AVX1OR2-NEXT: je LBB15_26 3007; AVX1OR2-NEXT: LBB15_25: ## %cond.store23 3008; AVX1OR2-NEXT: vpextrb $12, %xmm1, 12(%rdi) 3009; AVX1OR2-NEXT: testl $8192, %eax ## imm = 0x2000 3010; AVX1OR2-NEXT: je LBB15_28 3011; AVX1OR2-NEXT: LBB15_27: ## %cond.store25 3012; AVX1OR2-NEXT: vpextrb $13, %xmm1, 13(%rdi) 3013; AVX1OR2-NEXT: testl $16384, %eax ## imm = 0x4000 3014; AVX1OR2-NEXT: je LBB15_30 3015; AVX1OR2-NEXT: LBB15_29: ## %cond.store27 3016; AVX1OR2-NEXT: vpextrb $14, %xmm1, 14(%rdi) 3017; AVX1OR2-NEXT: testl $32768, %eax ## imm = 0x8000 3018; AVX1OR2-NEXT: je LBB15_32 3019; AVX1OR2-NEXT: LBB15_31: ## %cond.store29 3020; AVX1OR2-NEXT: vpextrb $15, %xmm1, 15(%rdi) 3021; AVX1OR2-NEXT: retq 3022; 3023; AVX512F-LABEL: store_v16i8_v16i8: 3024; AVX512F: ## %bb.0: 3025; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 3026; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 3027; AVX512F-NEXT: vpmovmskb %xmm0, %eax 3028; AVX512F-NEXT: testb $1, %al 3029; AVX512F-NEXT: jne LBB15_1 3030; AVX512F-NEXT: ## %bb.2: ## %else 3031; AVX512F-NEXT: testb $2, %al 3032; AVX512F-NEXT: jne LBB15_3 3033; AVX512F-NEXT: LBB15_4: ## %else2 3034; AVX512F-NEXT: testb $4, %al 3035; AVX512F-NEXT: jne LBB15_5 3036; AVX512F-NEXT: LBB15_6: ## %else4 3037; AVX512F-NEXT: testb $8, %al 3038; AVX512F-NEXT: jne LBB15_7 3039; AVX512F-NEXT: LBB15_8: ## %else6 3040; AVX512F-NEXT: testb $16, %al 3041; AVX512F-NEXT: jne LBB15_9 3042; AVX512F-NEXT: LBB15_10: ## %else8 3043; AVX512F-NEXT: testb $32, %al 3044; AVX512F-NEXT: jne LBB15_11 3045; AVX512F-NEXT: LBB15_12: ## %else10 3046; AVX512F-NEXT: testb $64, %al 3047; AVX512F-NEXT: jne LBB15_13 3048; AVX512F-NEXT: LBB15_14: ## %else12 3049; AVX512F-NEXT: testb %al, %al 3050; AVX512F-NEXT: js LBB15_15 3051; AVX512F-NEXT: LBB15_16: ## %else14 3052; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 3053; AVX512F-NEXT: jne LBB15_17 3054; AVX512F-NEXT: LBB15_18: ## %else16 3055; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 3056; AVX512F-NEXT: jne LBB15_19 3057; AVX512F-NEXT: LBB15_20: ## %else18 3058; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 3059; AVX512F-NEXT: jne LBB15_21 3060; AVX512F-NEXT: LBB15_22: ## %else20 3061; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800 3062; AVX512F-NEXT: jne LBB15_23 3063; AVX512F-NEXT: LBB15_24: ## %else22 3064; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 3065; AVX512F-NEXT: jne LBB15_25 3066; AVX512F-NEXT: LBB15_26: ## %else24 3067; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 3068; AVX512F-NEXT: jne LBB15_27 3069; AVX512F-NEXT: LBB15_28: ## %else26 3070; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 3071; AVX512F-NEXT: jne LBB15_29 3072; AVX512F-NEXT: LBB15_30: ## %else28 3073; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000 3074; AVX512F-NEXT: jne LBB15_31 3075; AVX512F-NEXT: LBB15_32: ## %else30 3076; AVX512F-NEXT: retq 3077; AVX512F-NEXT: LBB15_1: ## %cond.store 3078; AVX512F-NEXT: vpextrb $0, %xmm1, (%rdi) 3079; AVX512F-NEXT: testb $2, %al 3080; AVX512F-NEXT: je LBB15_4 3081; AVX512F-NEXT: LBB15_3: ## %cond.store1 3082; AVX512F-NEXT: vpextrb $1, %xmm1, 1(%rdi) 3083; AVX512F-NEXT: testb $4, %al 3084; AVX512F-NEXT: je LBB15_6 3085; AVX512F-NEXT: LBB15_5: ## %cond.store3 3086; AVX512F-NEXT: vpextrb $2, %xmm1, 2(%rdi) 3087; AVX512F-NEXT: testb $8, %al 3088; AVX512F-NEXT: je LBB15_8 3089; AVX512F-NEXT: LBB15_7: ## %cond.store5 3090; AVX512F-NEXT: vpextrb $3, %xmm1, 3(%rdi) 3091; AVX512F-NEXT: testb $16, %al 3092; AVX512F-NEXT: je LBB15_10 3093; AVX512F-NEXT: LBB15_9: ## %cond.store7 3094; AVX512F-NEXT: vpextrb $4, %xmm1, 4(%rdi) 3095; AVX512F-NEXT: testb $32, %al 3096; AVX512F-NEXT: je LBB15_12 3097; AVX512F-NEXT: LBB15_11: ## %cond.store9 3098; AVX512F-NEXT: vpextrb $5, %xmm1, 5(%rdi) 3099; AVX512F-NEXT: testb $64, %al 3100; AVX512F-NEXT: je LBB15_14 3101; AVX512F-NEXT: LBB15_13: ## %cond.store11 3102; AVX512F-NEXT: vpextrb $6, %xmm1, 6(%rdi) 3103; AVX512F-NEXT: testb %al, %al 3104; AVX512F-NEXT: jns LBB15_16 3105; AVX512F-NEXT: LBB15_15: ## %cond.store13 3106; AVX512F-NEXT: vpextrb $7, %xmm1, 7(%rdi) 3107; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 3108; AVX512F-NEXT: je LBB15_18 3109; AVX512F-NEXT: LBB15_17: ## %cond.store15 3110; AVX512F-NEXT: vpextrb $8, %xmm1, 8(%rdi) 3111; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 3112; AVX512F-NEXT: je LBB15_20 3113; AVX512F-NEXT: LBB15_19: ## %cond.store17 3114; AVX512F-NEXT: vpextrb $9, %xmm1, 9(%rdi) 3115; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 3116; AVX512F-NEXT: je LBB15_22 3117; AVX512F-NEXT: LBB15_21: ## %cond.store19 3118; AVX512F-NEXT: vpextrb $10, %xmm1, 10(%rdi) 3119; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800 3120; AVX512F-NEXT: je LBB15_24 3121; AVX512F-NEXT: LBB15_23: ## %cond.store21 3122; AVX512F-NEXT: vpextrb $11, %xmm1, 11(%rdi) 3123; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 3124; AVX512F-NEXT: je LBB15_26 3125; AVX512F-NEXT: LBB15_25: ## %cond.store23 3126; AVX512F-NEXT: vpextrb $12, %xmm1, 12(%rdi) 3127; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 3128; AVX512F-NEXT: je LBB15_28 3129; AVX512F-NEXT: LBB15_27: ## %cond.store25 3130; AVX512F-NEXT: vpextrb $13, %xmm1, 13(%rdi) 3131; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 3132; AVX512F-NEXT: je LBB15_30 3133; AVX512F-NEXT: LBB15_29: ## %cond.store27 3134; AVX512F-NEXT: vpextrb $14, %xmm1, 14(%rdi) 3135; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000 3136; AVX512F-NEXT: je LBB15_32 3137; AVX512F-NEXT: LBB15_31: ## %cond.store29 3138; AVX512F-NEXT: vpextrb $15, %xmm1, 15(%rdi) 3139; AVX512F-NEXT: retq 3140; 3141; AVX512VLDQ-LABEL: store_v16i8_v16i8: 3142; AVX512VLDQ: ## %bb.0: 3143; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 3144; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 3145; AVX512VLDQ-NEXT: vpmovmskb %xmm0, %eax 3146; AVX512VLDQ-NEXT: testb $1, %al 3147; AVX512VLDQ-NEXT: jne LBB15_1 3148; AVX512VLDQ-NEXT: ## %bb.2: ## %else 3149; AVX512VLDQ-NEXT: testb $2, %al 3150; AVX512VLDQ-NEXT: jne LBB15_3 3151; AVX512VLDQ-NEXT: LBB15_4: ## %else2 3152; AVX512VLDQ-NEXT: testb $4, %al 3153; AVX512VLDQ-NEXT: jne LBB15_5 3154; AVX512VLDQ-NEXT: LBB15_6: ## %else4 3155; AVX512VLDQ-NEXT: testb $8, %al 3156; AVX512VLDQ-NEXT: jne LBB15_7 3157; AVX512VLDQ-NEXT: LBB15_8: ## %else6 3158; AVX512VLDQ-NEXT: testb $16, %al 3159; AVX512VLDQ-NEXT: jne LBB15_9 3160; AVX512VLDQ-NEXT: LBB15_10: ## %else8 3161; AVX512VLDQ-NEXT: testb $32, %al 3162; AVX512VLDQ-NEXT: jne LBB15_11 3163; AVX512VLDQ-NEXT: LBB15_12: ## %else10 3164; AVX512VLDQ-NEXT: testb $64, %al 3165; AVX512VLDQ-NEXT: jne LBB15_13 3166; AVX512VLDQ-NEXT: LBB15_14: ## %else12 3167; AVX512VLDQ-NEXT: testb %al, %al 3168; AVX512VLDQ-NEXT: js LBB15_15 3169; AVX512VLDQ-NEXT: LBB15_16: ## %else14 3170; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100 3171; AVX512VLDQ-NEXT: jne LBB15_17 3172; AVX512VLDQ-NEXT: LBB15_18: ## %else16 3173; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200 3174; AVX512VLDQ-NEXT: jne LBB15_19 3175; AVX512VLDQ-NEXT: LBB15_20: ## %else18 3176; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400 3177; AVX512VLDQ-NEXT: jne LBB15_21 3178; AVX512VLDQ-NEXT: LBB15_22: ## %else20 3179; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800 3180; AVX512VLDQ-NEXT: jne LBB15_23 3181; AVX512VLDQ-NEXT: LBB15_24: ## %else22 3182; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000 3183; AVX512VLDQ-NEXT: jne LBB15_25 3184; AVX512VLDQ-NEXT: LBB15_26: ## %else24 3185; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000 3186; AVX512VLDQ-NEXT: jne LBB15_27 3187; AVX512VLDQ-NEXT: LBB15_28: ## %else26 3188; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000 3189; AVX512VLDQ-NEXT: jne LBB15_29 3190; AVX512VLDQ-NEXT: LBB15_30: ## %else28 3191; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000 3192; AVX512VLDQ-NEXT: jne LBB15_31 3193; AVX512VLDQ-NEXT: LBB15_32: ## %else30 3194; AVX512VLDQ-NEXT: retq 3195; AVX512VLDQ-NEXT: LBB15_1: ## %cond.store 3196; AVX512VLDQ-NEXT: vpextrb $0, %xmm1, (%rdi) 3197; AVX512VLDQ-NEXT: testb $2, %al 3198; AVX512VLDQ-NEXT: je LBB15_4 3199; AVX512VLDQ-NEXT: LBB15_3: ## %cond.store1 3200; AVX512VLDQ-NEXT: vpextrb $1, %xmm1, 1(%rdi) 3201; AVX512VLDQ-NEXT: testb $4, %al 3202; AVX512VLDQ-NEXT: je LBB15_6 3203; AVX512VLDQ-NEXT: LBB15_5: ## %cond.store3 3204; AVX512VLDQ-NEXT: vpextrb $2, %xmm1, 2(%rdi) 3205; AVX512VLDQ-NEXT: testb $8, %al 3206; AVX512VLDQ-NEXT: je LBB15_8 3207; AVX512VLDQ-NEXT: LBB15_7: ## %cond.store5 3208; AVX512VLDQ-NEXT: vpextrb $3, %xmm1, 3(%rdi) 3209; AVX512VLDQ-NEXT: testb $16, %al 3210; AVX512VLDQ-NEXT: je LBB15_10 3211; AVX512VLDQ-NEXT: LBB15_9: ## %cond.store7 3212; AVX512VLDQ-NEXT: vpextrb $4, %xmm1, 4(%rdi) 3213; AVX512VLDQ-NEXT: testb $32, %al 3214; AVX512VLDQ-NEXT: je LBB15_12 3215; AVX512VLDQ-NEXT: LBB15_11: ## %cond.store9 3216; AVX512VLDQ-NEXT: vpextrb $5, %xmm1, 5(%rdi) 3217; AVX512VLDQ-NEXT: testb $64, %al 3218; AVX512VLDQ-NEXT: je LBB15_14 3219; AVX512VLDQ-NEXT: LBB15_13: ## %cond.store11 3220; AVX512VLDQ-NEXT: vpextrb $6, %xmm1, 6(%rdi) 3221; AVX512VLDQ-NEXT: testb %al, %al 3222; AVX512VLDQ-NEXT: jns LBB15_16 3223; AVX512VLDQ-NEXT: LBB15_15: ## %cond.store13 3224; AVX512VLDQ-NEXT: vpextrb $7, %xmm1, 7(%rdi) 3225; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100 3226; AVX512VLDQ-NEXT: je LBB15_18 3227; AVX512VLDQ-NEXT: LBB15_17: ## %cond.store15 3228; AVX512VLDQ-NEXT: vpextrb $8, %xmm1, 8(%rdi) 3229; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200 3230; AVX512VLDQ-NEXT: je LBB15_20 3231; AVX512VLDQ-NEXT: LBB15_19: ## %cond.store17 3232; AVX512VLDQ-NEXT: vpextrb $9, %xmm1, 9(%rdi) 3233; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400 3234; AVX512VLDQ-NEXT: je LBB15_22 3235; AVX512VLDQ-NEXT: LBB15_21: ## %cond.store19 3236; AVX512VLDQ-NEXT: vpextrb $10, %xmm1, 10(%rdi) 3237; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800 3238; AVX512VLDQ-NEXT: je LBB15_24 3239; AVX512VLDQ-NEXT: LBB15_23: ## %cond.store21 3240; AVX512VLDQ-NEXT: vpextrb $11, %xmm1, 11(%rdi) 3241; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000 3242; AVX512VLDQ-NEXT: je LBB15_26 3243; AVX512VLDQ-NEXT: LBB15_25: ## %cond.store23 3244; AVX512VLDQ-NEXT: vpextrb $12, %xmm1, 12(%rdi) 3245; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000 3246; AVX512VLDQ-NEXT: je LBB15_28 3247; AVX512VLDQ-NEXT: LBB15_27: ## %cond.store25 3248; AVX512VLDQ-NEXT: vpextrb $13, %xmm1, 13(%rdi) 3249; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000 3250; AVX512VLDQ-NEXT: je LBB15_30 3251; AVX512VLDQ-NEXT: LBB15_29: ## %cond.store27 3252; AVX512VLDQ-NEXT: vpextrb $14, %xmm1, 14(%rdi) 3253; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000 3254; AVX512VLDQ-NEXT: je LBB15_32 3255; AVX512VLDQ-NEXT: LBB15_31: ## %cond.store29 3256; AVX512VLDQ-NEXT: vpextrb $15, %xmm1, 15(%rdi) 3257; AVX512VLDQ-NEXT: retq 3258; 3259; AVX512VLBW-LABEL: store_v16i8_v16i8: 3260; AVX512VLBW: ## %bb.0: 3261; AVX512VLBW-NEXT: vptestnmb %xmm0, %xmm0, %k1 3262; AVX512VLBW-NEXT: vmovdqu8 %xmm1, (%rdi) {%k1} 3263; AVX512VLBW-NEXT: retq 3264; 3265; X86-AVX512-LABEL: store_v16i8_v16i8: 3266; X86-AVX512: ## %bb.0: 3267; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 3268; X86-AVX512-NEXT: vptestnmb %xmm0, %xmm0, %k1 3269; X86-AVX512-NEXT: vmovdqu8 %xmm1, (%eax) {%k1} 3270; X86-AVX512-NEXT: retl 3271 %mask = icmp eq <16 x i8> %trigger, zeroinitializer 3272 call void @llvm.masked.store.v16i8.p0(<16 x i8> %val, ptr %addr, i32 4, <16 x i1> %mask) 3273 ret void 3274} 3275 3276define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) nounwind { 3277; SSE2-LABEL: store_v32i8_v32i8: 3278; SSE2: ## %bb.0: 3279; SSE2-NEXT: pxor %xmm4, %xmm4 3280; SSE2-NEXT: pcmpeqb %xmm4, %xmm0 3281; SSE2-NEXT: pmovmskb %xmm0, %ecx 3282; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 3283; SSE2-NEXT: pmovmskb %xmm1, %eax 3284; SSE2-NEXT: shll $16, %eax 3285; SSE2-NEXT: orl %ecx, %eax 3286; SSE2-NEXT: testb $1, %al 3287; SSE2-NEXT: movd %xmm2, %ecx 3288; SSE2-NEXT: jne LBB16_1 3289; SSE2-NEXT: ## %bb.2: ## %else 3290; SSE2-NEXT: testb $2, %al 3291; SSE2-NEXT: jne LBB16_3 3292; SSE2-NEXT: LBB16_4: ## %else2 3293; SSE2-NEXT: testb $4, %al 3294; SSE2-NEXT: jne LBB16_5 3295; SSE2-NEXT: LBB16_6: ## %else4 3296; SSE2-NEXT: testb $8, %al 3297; SSE2-NEXT: je LBB16_8 3298; SSE2-NEXT: LBB16_7: ## %cond.store5 3299; SSE2-NEXT: shrl $24, %ecx 3300; SSE2-NEXT: movb %cl, 3(%rdi) 3301; SSE2-NEXT: LBB16_8: ## %else6 3302; SSE2-NEXT: testb $16, %al 3303; SSE2-NEXT: pextrw $2, %xmm2, %ecx 3304; SSE2-NEXT: je LBB16_10 3305; SSE2-NEXT: ## %bb.9: ## %cond.store7 3306; SSE2-NEXT: movb %cl, 4(%rdi) 3307; SSE2-NEXT: LBB16_10: ## %else8 3308; SSE2-NEXT: testb $32, %al 3309; SSE2-NEXT: je LBB16_12 3310; SSE2-NEXT: ## %bb.11: ## %cond.store9 3311; SSE2-NEXT: movb %ch, 5(%rdi) 3312; SSE2-NEXT: LBB16_12: ## %else10 3313; SSE2-NEXT: testb $64, %al 3314; SSE2-NEXT: pextrw $3, %xmm2, %ecx 3315; SSE2-NEXT: je LBB16_14 3316; SSE2-NEXT: ## %bb.13: ## %cond.store11 3317; SSE2-NEXT: movb %cl, 6(%rdi) 3318; SSE2-NEXT: LBB16_14: ## %else12 3319; SSE2-NEXT: testb %al, %al 3320; SSE2-NEXT: jns LBB16_16 3321; SSE2-NEXT: ## %bb.15: ## %cond.store13 3322; SSE2-NEXT: movb %ch, 7(%rdi) 3323; SSE2-NEXT: LBB16_16: ## %else14 3324; SSE2-NEXT: testl $256, %eax ## imm = 0x100 3325; SSE2-NEXT: pextrw $4, %xmm2, %ecx 3326; SSE2-NEXT: je LBB16_18 3327; SSE2-NEXT: ## %bb.17: ## %cond.store15 3328; SSE2-NEXT: movb %cl, 8(%rdi) 3329; SSE2-NEXT: LBB16_18: ## %else16 3330; SSE2-NEXT: testl $512, %eax ## imm = 0x200 3331; SSE2-NEXT: je LBB16_20 3332; SSE2-NEXT: ## %bb.19: ## %cond.store17 3333; SSE2-NEXT: movb %ch, 9(%rdi) 3334; SSE2-NEXT: LBB16_20: ## %else18 3335; SSE2-NEXT: testl $1024, %eax ## imm = 0x400 3336; SSE2-NEXT: pextrw $5, %xmm2, %ecx 3337; SSE2-NEXT: je LBB16_22 3338; SSE2-NEXT: ## %bb.21: ## %cond.store19 3339; SSE2-NEXT: movb %cl, 10(%rdi) 3340; SSE2-NEXT: LBB16_22: ## %else20 3341; SSE2-NEXT: testl $2048, %eax ## imm = 0x800 3342; SSE2-NEXT: je LBB16_24 3343; SSE2-NEXT: ## %bb.23: ## %cond.store21 3344; SSE2-NEXT: movb %ch, 11(%rdi) 3345; SSE2-NEXT: LBB16_24: ## %else22 3346; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 3347; SSE2-NEXT: pextrw $6, %xmm2, %ecx 3348; SSE2-NEXT: je LBB16_26 3349; SSE2-NEXT: ## %bb.25: ## %cond.store23 3350; SSE2-NEXT: movb %cl, 12(%rdi) 3351; SSE2-NEXT: LBB16_26: ## %else24 3352; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000 3353; SSE2-NEXT: je LBB16_28 3354; SSE2-NEXT: ## %bb.27: ## %cond.store25 3355; SSE2-NEXT: movb %ch, 13(%rdi) 3356; SSE2-NEXT: LBB16_28: ## %else26 3357; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000 3358; SSE2-NEXT: pextrw $7, %xmm2, %ecx 3359; SSE2-NEXT: je LBB16_30 3360; SSE2-NEXT: ## %bb.29: ## %cond.store27 3361; SSE2-NEXT: movb %cl, 14(%rdi) 3362; SSE2-NEXT: LBB16_30: ## %else28 3363; SSE2-NEXT: testw %ax, %ax 3364; SSE2-NEXT: jns LBB16_32 3365; SSE2-NEXT: ## %bb.31: ## %cond.store29 3366; SSE2-NEXT: movb %ch, 15(%rdi) 3367; SSE2-NEXT: LBB16_32: ## %else30 3368; SSE2-NEXT: testl $65536, %eax ## imm = 0x10000 3369; SSE2-NEXT: movd %xmm3, %ecx 3370; SSE2-NEXT: jne LBB16_33 3371; SSE2-NEXT: ## %bb.34: ## %else32 3372; SSE2-NEXT: testl $131072, %eax ## imm = 0x20000 3373; SSE2-NEXT: jne LBB16_35 3374; SSE2-NEXT: LBB16_36: ## %else34 3375; SSE2-NEXT: testl $262144, %eax ## imm = 0x40000 3376; SSE2-NEXT: jne LBB16_37 3377; SSE2-NEXT: LBB16_38: ## %else36 3378; SSE2-NEXT: testl $524288, %eax ## imm = 0x80000 3379; SSE2-NEXT: je LBB16_40 3380; SSE2-NEXT: LBB16_39: ## %cond.store37 3381; SSE2-NEXT: shrl $24, %ecx 3382; SSE2-NEXT: movb %cl, 19(%rdi) 3383; SSE2-NEXT: LBB16_40: ## %else38 3384; SSE2-NEXT: testl $1048576, %eax ## imm = 0x100000 3385; SSE2-NEXT: pextrw $2, %xmm3, %ecx 3386; SSE2-NEXT: je LBB16_42 3387; SSE2-NEXT: ## %bb.41: ## %cond.store39 3388; SSE2-NEXT: movb %cl, 20(%rdi) 3389; SSE2-NEXT: LBB16_42: ## %else40 3390; SSE2-NEXT: testl $2097152, %eax ## imm = 0x200000 3391; SSE2-NEXT: je LBB16_44 3392; SSE2-NEXT: ## %bb.43: ## %cond.store41 3393; SSE2-NEXT: movb %ch, 21(%rdi) 3394; SSE2-NEXT: LBB16_44: ## %else42 3395; SSE2-NEXT: testl $4194304, %eax ## imm = 0x400000 3396; SSE2-NEXT: pextrw $3, %xmm3, %ecx 3397; SSE2-NEXT: je LBB16_46 3398; SSE2-NEXT: ## %bb.45: ## %cond.store43 3399; SSE2-NEXT: movb %cl, 22(%rdi) 3400; SSE2-NEXT: LBB16_46: ## %else44 3401; SSE2-NEXT: testl $8388608, %eax ## imm = 0x800000 3402; SSE2-NEXT: je LBB16_48 3403; SSE2-NEXT: ## %bb.47: ## %cond.store45 3404; SSE2-NEXT: movb %ch, 23(%rdi) 3405; SSE2-NEXT: LBB16_48: ## %else46 3406; SSE2-NEXT: testl $16777216, %eax ## imm = 0x1000000 3407; SSE2-NEXT: pextrw $4, %xmm3, %ecx 3408; SSE2-NEXT: je LBB16_50 3409; SSE2-NEXT: ## %bb.49: ## %cond.store47 3410; SSE2-NEXT: movb %cl, 24(%rdi) 3411; SSE2-NEXT: LBB16_50: ## %else48 3412; SSE2-NEXT: testl $33554432, %eax ## imm = 0x2000000 3413; SSE2-NEXT: je LBB16_52 3414; SSE2-NEXT: ## %bb.51: ## %cond.store49 3415; SSE2-NEXT: movb %ch, 25(%rdi) 3416; SSE2-NEXT: LBB16_52: ## %else50 3417; SSE2-NEXT: testl $67108864, %eax ## imm = 0x4000000 3418; SSE2-NEXT: pextrw $5, %xmm3, %ecx 3419; SSE2-NEXT: je LBB16_54 3420; SSE2-NEXT: ## %bb.53: ## %cond.store51 3421; SSE2-NEXT: movb %cl, 26(%rdi) 3422; SSE2-NEXT: LBB16_54: ## %else52 3423; SSE2-NEXT: testl $134217728, %eax ## imm = 0x8000000 3424; SSE2-NEXT: je LBB16_56 3425; SSE2-NEXT: ## %bb.55: ## %cond.store53 3426; SSE2-NEXT: movb %ch, 27(%rdi) 3427; SSE2-NEXT: LBB16_56: ## %else54 3428; SSE2-NEXT: testl $268435456, %eax ## imm = 0x10000000 3429; SSE2-NEXT: pextrw $6, %xmm3, %ecx 3430; SSE2-NEXT: je LBB16_58 3431; SSE2-NEXT: ## %bb.57: ## %cond.store55 3432; SSE2-NEXT: movb %cl, 28(%rdi) 3433; SSE2-NEXT: LBB16_58: ## %else56 3434; SSE2-NEXT: testl $536870912, %eax ## imm = 0x20000000 3435; SSE2-NEXT: je LBB16_60 3436; SSE2-NEXT: ## %bb.59: ## %cond.store57 3437; SSE2-NEXT: movb %ch, 29(%rdi) 3438; SSE2-NEXT: LBB16_60: ## %else58 3439; SSE2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 3440; SSE2-NEXT: pextrw $7, %xmm3, %ecx 3441; SSE2-NEXT: jne LBB16_61 3442; SSE2-NEXT: ## %bb.62: ## %else60 3443; SSE2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 3444; SSE2-NEXT: jne LBB16_63 3445; SSE2-NEXT: LBB16_64: ## %else62 3446; SSE2-NEXT: retq 3447; SSE2-NEXT: LBB16_1: ## %cond.store 3448; SSE2-NEXT: movb %cl, (%rdi) 3449; SSE2-NEXT: testb $2, %al 3450; SSE2-NEXT: je LBB16_4 3451; SSE2-NEXT: LBB16_3: ## %cond.store1 3452; SSE2-NEXT: movb %ch, 1(%rdi) 3453; SSE2-NEXT: testb $4, %al 3454; SSE2-NEXT: je LBB16_6 3455; SSE2-NEXT: LBB16_5: ## %cond.store3 3456; SSE2-NEXT: movl %ecx, %edx 3457; SSE2-NEXT: shrl $16, %edx 3458; SSE2-NEXT: movb %dl, 2(%rdi) 3459; SSE2-NEXT: testb $8, %al 3460; SSE2-NEXT: jne LBB16_7 3461; SSE2-NEXT: jmp LBB16_8 3462; SSE2-NEXT: LBB16_33: ## %cond.store31 3463; SSE2-NEXT: movb %cl, 16(%rdi) 3464; SSE2-NEXT: testl $131072, %eax ## imm = 0x20000 3465; SSE2-NEXT: je LBB16_36 3466; SSE2-NEXT: LBB16_35: ## %cond.store33 3467; SSE2-NEXT: movb %ch, 17(%rdi) 3468; SSE2-NEXT: testl $262144, %eax ## imm = 0x40000 3469; SSE2-NEXT: je LBB16_38 3470; SSE2-NEXT: LBB16_37: ## %cond.store35 3471; SSE2-NEXT: movl %ecx, %edx 3472; SSE2-NEXT: shrl $16, %edx 3473; SSE2-NEXT: movb %dl, 18(%rdi) 3474; SSE2-NEXT: testl $524288, %eax ## imm = 0x80000 3475; SSE2-NEXT: jne LBB16_39 3476; SSE2-NEXT: jmp LBB16_40 3477; SSE2-NEXT: LBB16_61: ## %cond.store59 3478; SSE2-NEXT: movb %cl, 30(%rdi) 3479; SSE2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 3480; SSE2-NEXT: je LBB16_64 3481; SSE2-NEXT: LBB16_63: ## %cond.store61 3482; SSE2-NEXT: movb %ch, 31(%rdi) 3483; SSE2-NEXT: retq 3484; 3485; SSE4-LABEL: store_v32i8_v32i8: 3486; SSE4: ## %bb.0: 3487; SSE4-NEXT: pxor %xmm4, %xmm4 3488; SSE4-NEXT: pcmpeqb %xmm4, %xmm0 3489; SSE4-NEXT: pmovmskb %xmm0, %ecx 3490; SSE4-NEXT: pcmpeqb %xmm4, %xmm1 3491; SSE4-NEXT: pmovmskb %xmm1, %eax 3492; SSE4-NEXT: shll $16, %eax 3493; SSE4-NEXT: orl %ecx, %eax 3494; SSE4-NEXT: testb $1, %al 3495; SSE4-NEXT: jne LBB16_1 3496; SSE4-NEXT: ## %bb.2: ## %else 3497; SSE4-NEXT: testb $2, %al 3498; SSE4-NEXT: jne LBB16_3 3499; SSE4-NEXT: LBB16_4: ## %else2 3500; SSE4-NEXT: testb $4, %al 3501; SSE4-NEXT: jne LBB16_5 3502; SSE4-NEXT: LBB16_6: ## %else4 3503; SSE4-NEXT: testb $8, %al 3504; SSE4-NEXT: jne LBB16_7 3505; SSE4-NEXT: LBB16_8: ## %else6 3506; SSE4-NEXT: testb $16, %al 3507; SSE4-NEXT: jne LBB16_9 3508; SSE4-NEXT: LBB16_10: ## %else8 3509; SSE4-NEXT: testb $32, %al 3510; SSE4-NEXT: jne LBB16_11 3511; SSE4-NEXT: LBB16_12: ## %else10 3512; SSE4-NEXT: testb $64, %al 3513; SSE4-NEXT: jne LBB16_13 3514; SSE4-NEXT: LBB16_14: ## %else12 3515; SSE4-NEXT: testb %al, %al 3516; SSE4-NEXT: js LBB16_15 3517; SSE4-NEXT: LBB16_16: ## %else14 3518; SSE4-NEXT: testl $256, %eax ## imm = 0x100 3519; SSE4-NEXT: jne LBB16_17 3520; SSE4-NEXT: LBB16_18: ## %else16 3521; SSE4-NEXT: testl $512, %eax ## imm = 0x200 3522; SSE4-NEXT: jne LBB16_19 3523; SSE4-NEXT: LBB16_20: ## %else18 3524; SSE4-NEXT: testl $1024, %eax ## imm = 0x400 3525; SSE4-NEXT: jne LBB16_21 3526; SSE4-NEXT: LBB16_22: ## %else20 3527; SSE4-NEXT: testl $2048, %eax ## imm = 0x800 3528; SSE4-NEXT: jne LBB16_23 3529; SSE4-NEXT: LBB16_24: ## %else22 3530; SSE4-NEXT: testl $4096, %eax ## imm = 0x1000 3531; SSE4-NEXT: jne LBB16_25 3532; SSE4-NEXT: LBB16_26: ## %else24 3533; SSE4-NEXT: testl $8192, %eax ## imm = 0x2000 3534; SSE4-NEXT: jne LBB16_27 3535; SSE4-NEXT: LBB16_28: ## %else26 3536; SSE4-NEXT: testl $16384, %eax ## imm = 0x4000 3537; SSE4-NEXT: jne LBB16_29 3538; SSE4-NEXT: LBB16_30: ## %else28 3539; SSE4-NEXT: testw %ax, %ax 3540; SSE4-NEXT: js LBB16_31 3541; SSE4-NEXT: LBB16_32: ## %else30 3542; SSE4-NEXT: testl $65536, %eax ## imm = 0x10000 3543; SSE4-NEXT: jne LBB16_33 3544; SSE4-NEXT: LBB16_34: ## %else32 3545; SSE4-NEXT: testl $131072, %eax ## imm = 0x20000 3546; SSE4-NEXT: jne LBB16_35 3547; SSE4-NEXT: LBB16_36: ## %else34 3548; SSE4-NEXT: testl $262144, %eax ## imm = 0x40000 3549; SSE4-NEXT: jne LBB16_37 3550; SSE4-NEXT: LBB16_38: ## %else36 3551; SSE4-NEXT: testl $524288, %eax ## imm = 0x80000 3552; SSE4-NEXT: jne LBB16_39 3553; SSE4-NEXT: LBB16_40: ## %else38 3554; SSE4-NEXT: testl $1048576, %eax ## imm = 0x100000 3555; SSE4-NEXT: jne LBB16_41 3556; SSE4-NEXT: LBB16_42: ## %else40 3557; SSE4-NEXT: testl $2097152, %eax ## imm = 0x200000 3558; SSE4-NEXT: jne LBB16_43 3559; SSE4-NEXT: LBB16_44: ## %else42 3560; SSE4-NEXT: testl $4194304, %eax ## imm = 0x400000 3561; SSE4-NEXT: jne LBB16_45 3562; SSE4-NEXT: LBB16_46: ## %else44 3563; SSE4-NEXT: testl $8388608, %eax ## imm = 0x800000 3564; SSE4-NEXT: jne LBB16_47 3565; SSE4-NEXT: LBB16_48: ## %else46 3566; SSE4-NEXT: testl $16777216, %eax ## imm = 0x1000000 3567; SSE4-NEXT: jne LBB16_49 3568; SSE4-NEXT: LBB16_50: ## %else48 3569; SSE4-NEXT: testl $33554432, %eax ## imm = 0x2000000 3570; SSE4-NEXT: jne LBB16_51 3571; SSE4-NEXT: LBB16_52: ## %else50 3572; SSE4-NEXT: testl $67108864, %eax ## imm = 0x4000000 3573; SSE4-NEXT: jne LBB16_53 3574; SSE4-NEXT: LBB16_54: ## %else52 3575; SSE4-NEXT: testl $134217728, %eax ## imm = 0x8000000 3576; SSE4-NEXT: jne LBB16_55 3577; SSE4-NEXT: LBB16_56: ## %else54 3578; SSE4-NEXT: testl $268435456, %eax ## imm = 0x10000000 3579; SSE4-NEXT: jne LBB16_57 3580; SSE4-NEXT: LBB16_58: ## %else56 3581; SSE4-NEXT: testl $536870912, %eax ## imm = 0x20000000 3582; SSE4-NEXT: jne LBB16_59 3583; SSE4-NEXT: LBB16_60: ## %else58 3584; SSE4-NEXT: testl $1073741824, %eax ## imm = 0x40000000 3585; SSE4-NEXT: jne LBB16_61 3586; SSE4-NEXT: LBB16_62: ## %else60 3587; SSE4-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 3588; SSE4-NEXT: jne LBB16_63 3589; SSE4-NEXT: LBB16_64: ## %else62 3590; SSE4-NEXT: retq 3591; SSE4-NEXT: LBB16_1: ## %cond.store 3592; SSE4-NEXT: pextrb $0, %xmm2, (%rdi) 3593; SSE4-NEXT: testb $2, %al 3594; SSE4-NEXT: je LBB16_4 3595; SSE4-NEXT: LBB16_3: ## %cond.store1 3596; SSE4-NEXT: pextrb $1, %xmm2, 1(%rdi) 3597; SSE4-NEXT: testb $4, %al 3598; SSE4-NEXT: je LBB16_6 3599; SSE4-NEXT: LBB16_5: ## %cond.store3 3600; SSE4-NEXT: pextrb $2, %xmm2, 2(%rdi) 3601; SSE4-NEXT: testb $8, %al 3602; SSE4-NEXT: je LBB16_8 3603; SSE4-NEXT: LBB16_7: ## %cond.store5 3604; SSE4-NEXT: pextrb $3, %xmm2, 3(%rdi) 3605; SSE4-NEXT: testb $16, %al 3606; SSE4-NEXT: je LBB16_10 3607; SSE4-NEXT: LBB16_9: ## %cond.store7 3608; SSE4-NEXT: pextrb $4, %xmm2, 4(%rdi) 3609; SSE4-NEXT: testb $32, %al 3610; SSE4-NEXT: je LBB16_12 3611; SSE4-NEXT: LBB16_11: ## %cond.store9 3612; SSE4-NEXT: pextrb $5, %xmm2, 5(%rdi) 3613; SSE4-NEXT: testb $64, %al 3614; SSE4-NEXT: je LBB16_14 3615; SSE4-NEXT: LBB16_13: ## %cond.store11 3616; SSE4-NEXT: pextrb $6, %xmm2, 6(%rdi) 3617; SSE4-NEXT: testb %al, %al 3618; SSE4-NEXT: jns LBB16_16 3619; SSE4-NEXT: LBB16_15: ## %cond.store13 3620; SSE4-NEXT: pextrb $7, %xmm2, 7(%rdi) 3621; SSE4-NEXT: testl $256, %eax ## imm = 0x100 3622; SSE4-NEXT: je LBB16_18 3623; SSE4-NEXT: LBB16_17: ## %cond.store15 3624; SSE4-NEXT: pextrb $8, %xmm2, 8(%rdi) 3625; SSE4-NEXT: testl $512, %eax ## imm = 0x200 3626; SSE4-NEXT: je LBB16_20 3627; SSE4-NEXT: LBB16_19: ## %cond.store17 3628; SSE4-NEXT: pextrb $9, %xmm2, 9(%rdi) 3629; SSE4-NEXT: testl $1024, %eax ## imm = 0x400 3630; SSE4-NEXT: je LBB16_22 3631; SSE4-NEXT: LBB16_21: ## %cond.store19 3632; SSE4-NEXT: pextrb $10, %xmm2, 10(%rdi) 3633; SSE4-NEXT: testl $2048, %eax ## imm = 0x800 3634; SSE4-NEXT: je LBB16_24 3635; SSE4-NEXT: LBB16_23: ## %cond.store21 3636; SSE4-NEXT: pextrb $11, %xmm2, 11(%rdi) 3637; SSE4-NEXT: testl $4096, %eax ## imm = 0x1000 3638; SSE4-NEXT: je LBB16_26 3639; SSE4-NEXT: LBB16_25: ## %cond.store23 3640; SSE4-NEXT: pextrb $12, %xmm2, 12(%rdi) 3641; SSE4-NEXT: testl $8192, %eax ## imm = 0x2000 3642; SSE4-NEXT: je LBB16_28 3643; SSE4-NEXT: LBB16_27: ## %cond.store25 3644; SSE4-NEXT: pextrb $13, %xmm2, 13(%rdi) 3645; SSE4-NEXT: testl $16384, %eax ## imm = 0x4000 3646; SSE4-NEXT: je LBB16_30 3647; SSE4-NEXT: LBB16_29: ## %cond.store27 3648; SSE4-NEXT: pextrb $14, %xmm2, 14(%rdi) 3649; SSE4-NEXT: testw %ax, %ax 3650; SSE4-NEXT: jns LBB16_32 3651; SSE4-NEXT: LBB16_31: ## %cond.store29 3652; SSE4-NEXT: pextrb $15, %xmm2, 15(%rdi) 3653; SSE4-NEXT: testl $65536, %eax ## imm = 0x10000 3654; SSE4-NEXT: je LBB16_34 3655; SSE4-NEXT: LBB16_33: ## %cond.store31 3656; SSE4-NEXT: pextrb $0, %xmm3, 16(%rdi) 3657; SSE4-NEXT: testl $131072, %eax ## imm = 0x20000 3658; SSE4-NEXT: je LBB16_36 3659; SSE4-NEXT: LBB16_35: ## %cond.store33 3660; SSE4-NEXT: pextrb $1, %xmm3, 17(%rdi) 3661; SSE4-NEXT: testl $262144, %eax ## imm = 0x40000 3662; SSE4-NEXT: je LBB16_38 3663; SSE4-NEXT: LBB16_37: ## %cond.store35 3664; SSE4-NEXT: pextrb $2, %xmm3, 18(%rdi) 3665; SSE4-NEXT: testl $524288, %eax ## imm = 0x80000 3666; SSE4-NEXT: je LBB16_40 3667; SSE4-NEXT: LBB16_39: ## %cond.store37 3668; SSE4-NEXT: pextrb $3, %xmm3, 19(%rdi) 3669; SSE4-NEXT: testl $1048576, %eax ## imm = 0x100000 3670; SSE4-NEXT: je LBB16_42 3671; SSE4-NEXT: LBB16_41: ## %cond.store39 3672; SSE4-NEXT: pextrb $4, %xmm3, 20(%rdi) 3673; SSE4-NEXT: testl $2097152, %eax ## imm = 0x200000 3674; SSE4-NEXT: je LBB16_44 3675; SSE4-NEXT: LBB16_43: ## %cond.store41 3676; SSE4-NEXT: pextrb $5, %xmm3, 21(%rdi) 3677; SSE4-NEXT: testl $4194304, %eax ## imm = 0x400000 3678; SSE4-NEXT: je LBB16_46 3679; SSE4-NEXT: LBB16_45: ## %cond.store43 3680; SSE4-NEXT: pextrb $6, %xmm3, 22(%rdi) 3681; SSE4-NEXT: testl $8388608, %eax ## imm = 0x800000 3682; SSE4-NEXT: je LBB16_48 3683; SSE4-NEXT: LBB16_47: ## %cond.store45 3684; SSE4-NEXT: pextrb $7, %xmm3, 23(%rdi) 3685; SSE4-NEXT: testl $16777216, %eax ## imm = 0x1000000 3686; SSE4-NEXT: je LBB16_50 3687; SSE4-NEXT: LBB16_49: ## %cond.store47 3688; SSE4-NEXT: pextrb $8, %xmm3, 24(%rdi) 3689; SSE4-NEXT: testl $33554432, %eax ## imm = 0x2000000 3690; SSE4-NEXT: je LBB16_52 3691; SSE4-NEXT: LBB16_51: ## %cond.store49 3692; SSE4-NEXT: pextrb $9, %xmm3, 25(%rdi) 3693; SSE4-NEXT: testl $67108864, %eax ## imm = 0x4000000 3694; SSE4-NEXT: je LBB16_54 3695; SSE4-NEXT: LBB16_53: ## %cond.store51 3696; SSE4-NEXT: pextrb $10, %xmm3, 26(%rdi) 3697; SSE4-NEXT: testl $134217728, %eax ## imm = 0x8000000 3698; SSE4-NEXT: je LBB16_56 3699; SSE4-NEXT: LBB16_55: ## %cond.store53 3700; SSE4-NEXT: pextrb $11, %xmm3, 27(%rdi) 3701; SSE4-NEXT: testl $268435456, %eax ## imm = 0x10000000 3702; SSE4-NEXT: je LBB16_58 3703; SSE4-NEXT: LBB16_57: ## %cond.store55 3704; SSE4-NEXT: pextrb $12, %xmm3, 28(%rdi) 3705; SSE4-NEXT: testl $536870912, %eax ## imm = 0x20000000 3706; SSE4-NEXT: je LBB16_60 3707; SSE4-NEXT: LBB16_59: ## %cond.store57 3708; SSE4-NEXT: pextrb $13, %xmm3, 29(%rdi) 3709; SSE4-NEXT: testl $1073741824, %eax ## imm = 0x40000000 3710; SSE4-NEXT: je LBB16_62 3711; SSE4-NEXT: LBB16_61: ## %cond.store59 3712; SSE4-NEXT: pextrb $14, %xmm3, 30(%rdi) 3713; SSE4-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 3714; SSE4-NEXT: je LBB16_64 3715; SSE4-NEXT: LBB16_63: ## %cond.store61 3716; SSE4-NEXT: pextrb $15, %xmm3, 31(%rdi) 3717; SSE4-NEXT: retq 3718; 3719; AVX1-LABEL: store_v32i8_v32i8: 3720; AVX1: ## %bb.0: 3721; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 3722; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 3723; AVX1-NEXT: vpmovmskb %xmm3, %ecx 3724; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3725; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 3726; AVX1-NEXT: vpmovmskb %xmm0, %eax 3727; AVX1-NEXT: shll $16, %eax 3728; AVX1-NEXT: orl %ecx, %eax 3729; AVX1-NEXT: testb $1, %al 3730; AVX1-NEXT: jne LBB16_1 3731; AVX1-NEXT: ## %bb.2: ## %else 3732; AVX1-NEXT: testb $2, %al 3733; AVX1-NEXT: jne LBB16_3 3734; AVX1-NEXT: LBB16_4: ## %else2 3735; AVX1-NEXT: testb $4, %al 3736; AVX1-NEXT: jne LBB16_5 3737; AVX1-NEXT: LBB16_6: ## %else4 3738; AVX1-NEXT: testb $8, %al 3739; AVX1-NEXT: jne LBB16_7 3740; AVX1-NEXT: LBB16_8: ## %else6 3741; AVX1-NEXT: testb $16, %al 3742; AVX1-NEXT: jne LBB16_9 3743; AVX1-NEXT: LBB16_10: ## %else8 3744; AVX1-NEXT: testb $32, %al 3745; AVX1-NEXT: jne LBB16_11 3746; AVX1-NEXT: LBB16_12: ## %else10 3747; AVX1-NEXT: testb $64, %al 3748; AVX1-NEXT: jne LBB16_13 3749; AVX1-NEXT: LBB16_14: ## %else12 3750; AVX1-NEXT: testb %al, %al 3751; AVX1-NEXT: js LBB16_15 3752; AVX1-NEXT: LBB16_16: ## %else14 3753; AVX1-NEXT: testl $256, %eax ## imm = 0x100 3754; AVX1-NEXT: jne LBB16_17 3755; AVX1-NEXT: LBB16_18: ## %else16 3756; AVX1-NEXT: testl $512, %eax ## imm = 0x200 3757; AVX1-NEXT: jne LBB16_19 3758; AVX1-NEXT: LBB16_20: ## %else18 3759; AVX1-NEXT: testl $1024, %eax ## imm = 0x400 3760; AVX1-NEXT: jne LBB16_21 3761; AVX1-NEXT: LBB16_22: ## %else20 3762; AVX1-NEXT: testl $2048, %eax ## imm = 0x800 3763; AVX1-NEXT: jne LBB16_23 3764; AVX1-NEXT: LBB16_24: ## %else22 3765; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 3766; AVX1-NEXT: jne LBB16_25 3767; AVX1-NEXT: LBB16_26: ## %else24 3768; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 3769; AVX1-NEXT: jne LBB16_27 3770; AVX1-NEXT: LBB16_28: ## %else26 3771; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 3772; AVX1-NEXT: jne LBB16_29 3773; AVX1-NEXT: LBB16_30: ## %else28 3774; AVX1-NEXT: testw %ax, %ax 3775; AVX1-NEXT: jns LBB16_32 3776; AVX1-NEXT: LBB16_31: ## %cond.store29 3777; AVX1-NEXT: vpextrb $15, %xmm1, 15(%rdi) 3778; AVX1-NEXT: LBB16_32: ## %else30 3779; AVX1-NEXT: testl $65536, %eax ## imm = 0x10000 3780; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 3781; AVX1-NEXT: jne LBB16_33 3782; AVX1-NEXT: ## %bb.34: ## %else32 3783; AVX1-NEXT: testl $131072, %eax ## imm = 0x20000 3784; AVX1-NEXT: jne LBB16_35 3785; AVX1-NEXT: LBB16_36: ## %else34 3786; AVX1-NEXT: testl $262144, %eax ## imm = 0x40000 3787; AVX1-NEXT: jne LBB16_37 3788; AVX1-NEXT: LBB16_38: ## %else36 3789; AVX1-NEXT: testl $524288, %eax ## imm = 0x80000 3790; AVX1-NEXT: jne LBB16_39 3791; AVX1-NEXT: LBB16_40: ## %else38 3792; AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000 3793; AVX1-NEXT: jne LBB16_41 3794; AVX1-NEXT: LBB16_42: ## %else40 3795; AVX1-NEXT: testl $2097152, %eax ## imm = 0x200000 3796; AVX1-NEXT: jne LBB16_43 3797; AVX1-NEXT: LBB16_44: ## %else42 3798; AVX1-NEXT: testl $4194304, %eax ## imm = 0x400000 3799; AVX1-NEXT: jne LBB16_45 3800; AVX1-NEXT: LBB16_46: ## %else44 3801; AVX1-NEXT: testl $8388608, %eax ## imm = 0x800000 3802; AVX1-NEXT: jne LBB16_47 3803; AVX1-NEXT: LBB16_48: ## %else46 3804; AVX1-NEXT: testl $16777216, %eax ## imm = 0x1000000 3805; AVX1-NEXT: jne LBB16_49 3806; AVX1-NEXT: LBB16_50: ## %else48 3807; AVX1-NEXT: testl $33554432, %eax ## imm = 0x2000000 3808; AVX1-NEXT: jne LBB16_51 3809; AVX1-NEXT: LBB16_52: ## %else50 3810; AVX1-NEXT: testl $67108864, %eax ## imm = 0x4000000 3811; AVX1-NEXT: jne LBB16_53 3812; AVX1-NEXT: LBB16_54: ## %else52 3813; AVX1-NEXT: testl $134217728, %eax ## imm = 0x8000000 3814; AVX1-NEXT: jne LBB16_55 3815; AVX1-NEXT: LBB16_56: ## %else54 3816; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000 3817; AVX1-NEXT: jne LBB16_57 3818; AVX1-NEXT: LBB16_58: ## %else56 3819; AVX1-NEXT: testl $536870912, %eax ## imm = 0x20000000 3820; AVX1-NEXT: jne LBB16_59 3821; AVX1-NEXT: LBB16_60: ## %else58 3822; AVX1-NEXT: testl $1073741824, %eax ## imm = 0x40000000 3823; AVX1-NEXT: jne LBB16_61 3824; AVX1-NEXT: LBB16_62: ## %else60 3825; AVX1-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 3826; AVX1-NEXT: jne LBB16_63 3827; AVX1-NEXT: LBB16_64: ## %else62 3828; AVX1-NEXT: vzeroupper 3829; AVX1-NEXT: retq 3830; AVX1-NEXT: LBB16_1: ## %cond.store 3831; AVX1-NEXT: vpextrb $0, %xmm1, (%rdi) 3832; AVX1-NEXT: testb $2, %al 3833; AVX1-NEXT: je LBB16_4 3834; AVX1-NEXT: LBB16_3: ## %cond.store1 3835; AVX1-NEXT: vpextrb $1, %xmm1, 1(%rdi) 3836; AVX1-NEXT: testb $4, %al 3837; AVX1-NEXT: je LBB16_6 3838; AVX1-NEXT: LBB16_5: ## %cond.store3 3839; AVX1-NEXT: vpextrb $2, %xmm1, 2(%rdi) 3840; AVX1-NEXT: testb $8, %al 3841; AVX1-NEXT: je LBB16_8 3842; AVX1-NEXT: LBB16_7: ## %cond.store5 3843; AVX1-NEXT: vpextrb $3, %xmm1, 3(%rdi) 3844; AVX1-NEXT: testb $16, %al 3845; AVX1-NEXT: je LBB16_10 3846; AVX1-NEXT: LBB16_9: ## %cond.store7 3847; AVX1-NEXT: vpextrb $4, %xmm1, 4(%rdi) 3848; AVX1-NEXT: testb $32, %al 3849; AVX1-NEXT: je LBB16_12 3850; AVX1-NEXT: LBB16_11: ## %cond.store9 3851; AVX1-NEXT: vpextrb $5, %xmm1, 5(%rdi) 3852; AVX1-NEXT: testb $64, %al 3853; AVX1-NEXT: je LBB16_14 3854; AVX1-NEXT: LBB16_13: ## %cond.store11 3855; AVX1-NEXT: vpextrb $6, %xmm1, 6(%rdi) 3856; AVX1-NEXT: testb %al, %al 3857; AVX1-NEXT: jns LBB16_16 3858; AVX1-NEXT: LBB16_15: ## %cond.store13 3859; AVX1-NEXT: vpextrb $7, %xmm1, 7(%rdi) 3860; AVX1-NEXT: testl $256, %eax ## imm = 0x100 3861; AVX1-NEXT: je LBB16_18 3862; AVX1-NEXT: LBB16_17: ## %cond.store15 3863; AVX1-NEXT: vpextrb $8, %xmm1, 8(%rdi) 3864; AVX1-NEXT: testl $512, %eax ## imm = 0x200 3865; AVX1-NEXT: je LBB16_20 3866; AVX1-NEXT: LBB16_19: ## %cond.store17 3867; AVX1-NEXT: vpextrb $9, %xmm1, 9(%rdi) 3868; AVX1-NEXT: testl $1024, %eax ## imm = 0x400 3869; AVX1-NEXT: je LBB16_22 3870; AVX1-NEXT: LBB16_21: ## %cond.store19 3871; AVX1-NEXT: vpextrb $10, %xmm1, 10(%rdi) 3872; AVX1-NEXT: testl $2048, %eax ## imm = 0x800 3873; AVX1-NEXT: je LBB16_24 3874; AVX1-NEXT: LBB16_23: ## %cond.store21 3875; AVX1-NEXT: vpextrb $11, %xmm1, 11(%rdi) 3876; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 3877; AVX1-NEXT: je LBB16_26 3878; AVX1-NEXT: LBB16_25: ## %cond.store23 3879; AVX1-NEXT: vpextrb $12, %xmm1, 12(%rdi) 3880; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 3881; AVX1-NEXT: je LBB16_28 3882; AVX1-NEXT: LBB16_27: ## %cond.store25 3883; AVX1-NEXT: vpextrb $13, %xmm1, 13(%rdi) 3884; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 3885; AVX1-NEXT: je LBB16_30 3886; AVX1-NEXT: LBB16_29: ## %cond.store27 3887; AVX1-NEXT: vpextrb $14, %xmm1, 14(%rdi) 3888; AVX1-NEXT: testw %ax, %ax 3889; AVX1-NEXT: js LBB16_31 3890; AVX1-NEXT: jmp LBB16_32 3891; AVX1-NEXT: LBB16_33: ## %cond.store31 3892; AVX1-NEXT: vpextrb $0, %xmm0, 16(%rdi) 3893; AVX1-NEXT: testl $131072, %eax ## imm = 0x20000 3894; AVX1-NEXT: je LBB16_36 3895; AVX1-NEXT: LBB16_35: ## %cond.store33 3896; AVX1-NEXT: vpextrb $1, %xmm0, 17(%rdi) 3897; AVX1-NEXT: testl $262144, %eax ## imm = 0x40000 3898; AVX1-NEXT: je LBB16_38 3899; AVX1-NEXT: LBB16_37: ## %cond.store35 3900; AVX1-NEXT: vpextrb $2, %xmm0, 18(%rdi) 3901; AVX1-NEXT: testl $524288, %eax ## imm = 0x80000 3902; AVX1-NEXT: je LBB16_40 3903; AVX1-NEXT: LBB16_39: ## %cond.store37 3904; AVX1-NEXT: vpextrb $3, %xmm0, 19(%rdi) 3905; AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000 3906; AVX1-NEXT: je LBB16_42 3907; AVX1-NEXT: LBB16_41: ## %cond.store39 3908; AVX1-NEXT: vpextrb $4, %xmm0, 20(%rdi) 3909; AVX1-NEXT: testl $2097152, %eax ## imm = 0x200000 3910; AVX1-NEXT: je LBB16_44 3911; AVX1-NEXT: LBB16_43: ## %cond.store41 3912; AVX1-NEXT: vpextrb $5, %xmm0, 21(%rdi) 3913; AVX1-NEXT: testl $4194304, %eax ## imm = 0x400000 3914; AVX1-NEXT: je LBB16_46 3915; AVX1-NEXT: LBB16_45: ## %cond.store43 3916; AVX1-NEXT: vpextrb $6, %xmm0, 22(%rdi) 3917; AVX1-NEXT: testl $8388608, %eax ## imm = 0x800000 3918; AVX1-NEXT: je LBB16_48 3919; AVX1-NEXT: LBB16_47: ## %cond.store45 3920; AVX1-NEXT: vpextrb $7, %xmm0, 23(%rdi) 3921; AVX1-NEXT: testl $16777216, %eax ## imm = 0x1000000 3922; AVX1-NEXT: je LBB16_50 3923; AVX1-NEXT: LBB16_49: ## %cond.store47 3924; AVX1-NEXT: vpextrb $8, %xmm0, 24(%rdi) 3925; AVX1-NEXT: testl $33554432, %eax ## imm = 0x2000000 3926; AVX1-NEXT: je LBB16_52 3927; AVX1-NEXT: LBB16_51: ## %cond.store49 3928; AVX1-NEXT: vpextrb $9, %xmm0, 25(%rdi) 3929; AVX1-NEXT: testl $67108864, %eax ## imm = 0x4000000 3930; AVX1-NEXT: je LBB16_54 3931; AVX1-NEXT: LBB16_53: ## %cond.store51 3932; AVX1-NEXT: vpextrb $10, %xmm0, 26(%rdi) 3933; AVX1-NEXT: testl $134217728, %eax ## imm = 0x8000000 3934; AVX1-NEXT: je LBB16_56 3935; AVX1-NEXT: LBB16_55: ## %cond.store53 3936; AVX1-NEXT: vpextrb $11, %xmm0, 27(%rdi) 3937; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000 3938; AVX1-NEXT: je LBB16_58 3939; AVX1-NEXT: LBB16_57: ## %cond.store55 3940; AVX1-NEXT: vpextrb $12, %xmm0, 28(%rdi) 3941; AVX1-NEXT: testl $536870912, %eax ## imm = 0x20000000 3942; AVX1-NEXT: je LBB16_60 3943; AVX1-NEXT: LBB16_59: ## %cond.store57 3944; AVX1-NEXT: vpextrb $13, %xmm0, 29(%rdi) 3945; AVX1-NEXT: testl $1073741824, %eax ## imm = 0x40000000 3946; AVX1-NEXT: je LBB16_62 3947; AVX1-NEXT: LBB16_61: ## %cond.store59 3948; AVX1-NEXT: vpextrb $14, %xmm0, 30(%rdi) 3949; AVX1-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 3950; AVX1-NEXT: je LBB16_64 3951; AVX1-NEXT: LBB16_63: ## %cond.store61 3952; AVX1-NEXT: vpextrb $15, %xmm0, 31(%rdi) 3953; AVX1-NEXT: vzeroupper 3954; AVX1-NEXT: retq 3955; 3956; AVX2-LABEL: store_v32i8_v32i8: 3957; AVX2: ## %bb.0: 3958; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 3959; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 3960; AVX2-NEXT: vpmovmskb %ymm0, %eax 3961; AVX2-NEXT: testb $1, %al 3962; AVX2-NEXT: jne LBB16_1 3963; AVX2-NEXT: ## %bb.2: ## %else 3964; AVX2-NEXT: testb $2, %al 3965; AVX2-NEXT: jne LBB16_3 3966; AVX2-NEXT: LBB16_4: ## %else2 3967; AVX2-NEXT: testb $4, %al 3968; AVX2-NEXT: jne LBB16_5 3969; AVX2-NEXT: LBB16_6: ## %else4 3970; AVX2-NEXT: testb $8, %al 3971; AVX2-NEXT: jne LBB16_7 3972; AVX2-NEXT: LBB16_8: ## %else6 3973; AVX2-NEXT: testb $16, %al 3974; AVX2-NEXT: jne LBB16_9 3975; AVX2-NEXT: LBB16_10: ## %else8 3976; AVX2-NEXT: testb $32, %al 3977; AVX2-NEXT: jne LBB16_11 3978; AVX2-NEXT: LBB16_12: ## %else10 3979; AVX2-NEXT: testb $64, %al 3980; AVX2-NEXT: jne LBB16_13 3981; AVX2-NEXT: LBB16_14: ## %else12 3982; AVX2-NEXT: testb %al, %al 3983; AVX2-NEXT: js LBB16_15 3984; AVX2-NEXT: LBB16_16: ## %else14 3985; AVX2-NEXT: testl $256, %eax ## imm = 0x100 3986; AVX2-NEXT: jne LBB16_17 3987; AVX2-NEXT: LBB16_18: ## %else16 3988; AVX2-NEXT: testl $512, %eax ## imm = 0x200 3989; AVX2-NEXT: jne LBB16_19 3990; AVX2-NEXT: LBB16_20: ## %else18 3991; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 3992; AVX2-NEXT: jne LBB16_21 3993; AVX2-NEXT: LBB16_22: ## %else20 3994; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 3995; AVX2-NEXT: jne LBB16_23 3996; AVX2-NEXT: LBB16_24: ## %else22 3997; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 3998; AVX2-NEXT: jne LBB16_25 3999; AVX2-NEXT: LBB16_26: ## %else24 4000; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 4001; AVX2-NEXT: jne LBB16_27 4002; AVX2-NEXT: LBB16_28: ## %else26 4003; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 4004; AVX2-NEXT: jne LBB16_29 4005; AVX2-NEXT: LBB16_30: ## %else28 4006; AVX2-NEXT: testw %ax, %ax 4007; AVX2-NEXT: jns LBB16_32 4008; AVX2-NEXT: LBB16_31: ## %cond.store29 4009; AVX2-NEXT: vpextrb $15, %xmm1, 15(%rdi) 4010; AVX2-NEXT: LBB16_32: ## %else30 4011; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000 4012; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 4013; AVX2-NEXT: jne LBB16_33 4014; AVX2-NEXT: ## %bb.34: ## %else32 4015; AVX2-NEXT: testl $131072, %eax ## imm = 0x20000 4016; AVX2-NEXT: jne LBB16_35 4017; AVX2-NEXT: LBB16_36: ## %else34 4018; AVX2-NEXT: testl $262144, %eax ## imm = 0x40000 4019; AVX2-NEXT: jne LBB16_37 4020; AVX2-NEXT: LBB16_38: ## %else36 4021; AVX2-NEXT: testl $524288, %eax ## imm = 0x80000 4022; AVX2-NEXT: jne LBB16_39 4023; AVX2-NEXT: LBB16_40: ## %else38 4024; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000 4025; AVX2-NEXT: jne LBB16_41 4026; AVX2-NEXT: LBB16_42: ## %else40 4027; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000 4028; AVX2-NEXT: jne LBB16_43 4029; AVX2-NEXT: LBB16_44: ## %else42 4030; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000 4031; AVX2-NEXT: jne LBB16_45 4032; AVX2-NEXT: LBB16_46: ## %else44 4033; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000 4034; AVX2-NEXT: jne LBB16_47 4035; AVX2-NEXT: LBB16_48: ## %else46 4036; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000 4037; AVX2-NEXT: jne LBB16_49 4038; AVX2-NEXT: LBB16_50: ## %else48 4039; AVX2-NEXT: testl $33554432, %eax ## imm = 0x2000000 4040; AVX2-NEXT: jne LBB16_51 4041; AVX2-NEXT: LBB16_52: ## %else50 4042; AVX2-NEXT: testl $67108864, %eax ## imm = 0x4000000 4043; AVX2-NEXT: jne LBB16_53 4044; AVX2-NEXT: LBB16_54: ## %else52 4045; AVX2-NEXT: testl $134217728, %eax ## imm = 0x8000000 4046; AVX2-NEXT: jne LBB16_55 4047; AVX2-NEXT: LBB16_56: ## %else54 4048; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000 4049; AVX2-NEXT: jne LBB16_57 4050; AVX2-NEXT: LBB16_58: ## %else56 4051; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000 4052; AVX2-NEXT: jne LBB16_59 4053; AVX2-NEXT: LBB16_60: ## %else58 4054; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 4055; AVX2-NEXT: jne LBB16_61 4056; AVX2-NEXT: LBB16_62: ## %else60 4057; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 4058; AVX2-NEXT: jne LBB16_63 4059; AVX2-NEXT: LBB16_64: ## %else62 4060; AVX2-NEXT: vzeroupper 4061; AVX2-NEXT: retq 4062; AVX2-NEXT: LBB16_1: ## %cond.store 4063; AVX2-NEXT: vpextrb $0, %xmm1, (%rdi) 4064; AVX2-NEXT: testb $2, %al 4065; AVX2-NEXT: je LBB16_4 4066; AVX2-NEXT: LBB16_3: ## %cond.store1 4067; AVX2-NEXT: vpextrb $1, %xmm1, 1(%rdi) 4068; AVX2-NEXT: testb $4, %al 4069; AVX2-NEXT: je LBB16_6 4070; AVX2-NEXT: LBB16_5: ## %cond.store3 4071; AVX2-NEXT: vpextrb $2, %xmm1, 2(%rdi) 4072; AVX2-NEXT: testb $8, %al 4073; AVX2-NEXT: je LBB16_8 4074; AVX2-NEXT: LBB16_7: ## %cond.store5 4075; AVX2-NEXT: vpextrb $3, %xmm1, 3(%rdi) 4076; AVX2-NEXT: testb $16, %al 4077; AVX2-NEXT: je LBB16_10 4078; AVX2-NEXT: LBB16_9: ## %cond.store7 4079; AVX2-NEXT: vpextrb $4, %xmm1, 4(%rdi) 4080; AVX2-NEXT: testb $32, %al 4081; AVX2-NEXT: je LBB16_12 4082; AVX2-NEXT: LBB16_11: ## %cond.store9 4083; AVX2-NEXT: vpextrb $5, %xmm1, 5(%rdi) 4084; AVX2-NEXT: testb $64, %al 4085; AVX2-NEXT: je LBB16_14 4086; AVX2-NEXT: LBB16_13: ## %cond.store11 4087; AVX2-NEXT: vpextrb $6, %xmm1, 6(%rdi) 4088; AVX2-NEXT: testb %al, %al 4089; AVX2-NEXT: jns LBB16_16 4090; AVX2-NEXT: LBB16_15: ## %cond.store13 4091; AVX2-NEXT: vpextrb $7, %xmm1, 7(%rdi) 4092; AVX2-NEXT: testl $256, %eax ## imm = 0x100 4093; AVX2-NEXT: je LBB16_18 4094; AVX2-NEXT: LBB16_17: ## %cond.store15 4095; AVX2-NEXT: vpextrb $8, %xmm1, 8(%rdi) 4096; AVX2-NEXT: testl $512, %eax ## imm = 0x200 4097; AVX2-NEXT: je LBB16_20 4098; AVX2-NEXT: LBB16_19: ## %cond.store17 4099; AVX2-NEXT: vpextrb $9, %xmm1, 9(%rdi) 4100; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 4101; AVX2-NEXT: je LBB16_22 4102; AVX2-NEXT: LBB16_21: ## %cond.store19 4103; AVX2-NEXT: vpextrb $10, %xmm1, 10(%rdi) 4104; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 4105; AVX2-NEXT: je LBB16_24 4106; AVX2-NEXT: LBB16_23: ## %cond.store21 4107; AVX2-NEXT: vpextrb $11, %xmm1, 11(%rdi) 4108; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 4109; AVX2-NEXT: je LBB16_26 4110; AVX2-NEXT: LBB16_25: ## %cond.store23 4111; AVX2-NEXT: vpextrb $12, %xmm1, 12(%rdi) 4112; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 4113; AVX2-NEXT: je LBB16_28 4114; AVX2-NEXT: LBB16_27: ## %cond.store25 4115; AVX2-NEXT: vpextrb $13, %xmm1, 13(%rdi) 4116; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 4117; AVX2-NEXT: je LBB16_30 4118; AVX2-NEXT: LBB16_29: ## %cond.store27 4119; AVX2-NEXT: vpextrb $14, %xmm1, 14(%rdi) 4120; AVX2-NEXT: testw %ax, %ax 4121; AVX2-NEXT: js LBB16_31 4122; AVX2-NEXT: jmp LBB16_32 4123; AVX2-NEXT: LBB16_33: ## %cond.store31 4124; AVX2-NEXT: vpextrb $0, %xmm0, 16(%rdi) 4125; AVX2-NEXT: testl $131072, %eax ## imm = 0x20000 4126; AVX2-NEXT: je LBB16_36 4127; AVX2-NEXT: LBB16_35: ## %cond.store33 4128; AVX2-NEXT: vpextrb $1, %xmm0, 17(%rdi) 4129; AVX2-NEXT: testl $262144, %eax ## imm = 0x40000 4130; AVX2-NEXT: je LBB16_38 4131; AVX2-NEXT: LBB16_37: ## %cond.store35 4132; AVX2-NEXT: vpextrb $2, %xmm0, 18(%rdi) 4133; AVX2-NEXT: testl $524288, %eax ## imm = 0x80000 4134; AVX2-NEXT: je LBB16_40 4135; AVX2-NEXT: LBB16_39: ## %cond.store37 4136; AVX2-NEXT: vpextrb $3, %xmm0, 19(%rdi) 4137; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000 4138; AVX2-NEXT: je LBB16_42 4139; AVX2-NEXT: LBB16_41: ## %cond.store39 4140; AVX2-NEXT: vpextrb $4, %xmm0, 20(%rdi) 4141; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000 4142; AVX2-NEXT: je LBB16_44 4143; AVX2-NEXT: LBB16_43: ## %cond.store41 4144; AVX2-NEXT: vpextrb $5, %xmm0, 21(%rdi) 4145; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000 4146; AVX2-NEXT: je LBB16_46 4147; AVX2-NEXT: LBB16_45: ## %cond.store43 4148; AVX2-NEXT: vpextrb $6, %xmm0, 22(%rdi) 4149; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000 4150; AVX2-NEXT: je LBB16_48 4151; AVX2-NEXT: LBB16_47: ## %cond.store45 4152; AVX2-NEXT: vpextrb $7, %xmm0, 23(%rdi) 4153; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000 4154; AVX2-NEXT: je LBB16_50 4155; AVX2-NEXT: LBB16_49: ## %cond.store47 4156; AVX2-NEXT: vpextrb $8, %xmm0, 24(%rdi) 4157; AVX2-NEXT: testl $33554432, %eax ## imm = 0x2000000 4158; AVX2-NEXT: je LBB16_52 4159; AVX2-NEXT: LBB16_51: ## %cond.store49 4160; AVX2-NEXT: vpextrb $9, %xmm0, 25(%rdi) 4161; AVX2-NEXT: testl $67108864, %eax ## imm = 0x4000000 4162; AVX2-NEXT: je LBB16_54 4163; AVX2-NEXT: LBB16_53: ## %cond.store51 4164; AVX2-NEXT: vpextrb $10, %xmm0, 26(%rdi) 4165; AVX2-NEXT: testl $134217728, %eax ## imm = 0x8000000 4166; AVX2-NEXT: je LBB16_56 4167; AVX2-NEXT: LBB16_55: ## %cond.store53 4168; AVX2-NEXT: vpextrb $11, %xmm0, 27(%rdi) 4169; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000 4170; AVX2-NEXT: je LBB16_58 4171; AVX2-NEXT: LBB16_57: ## %cond.store55 4172; AVX2-NEXT: vpextrb $12, %xmm0, 28(%rdi) 4173; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000 4174; AVX2-NEXT: je LBB16_60 4175; AVX2-NEXT: LBB16_59: ## %cond.store57 4176; AVX2-NEXT: vpextrb $13, %xmm0, 29(%rdi) 4177; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 4178; AVX2-NEXT: je LBB16_62 4179; AVX2-NEXT: LBB16_61: ## %cond.store59 4180; AVX2-NEXT: vpextrb $14, %xmm0, 30(%rdi) 4181; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 4182; AVX2-NEXT: je LBB16_64 4183; AVX2-NEXT: LBB16_63: ## %cond.store61 4184; AVX2-NEXT: vpextrb $15, %xmm0, 31(%rdi) 4185; AVX2-NEXT: vzeroupper 4186; AVX2-NEXT: retq 4187; 4188; AVX512F-LABEL: store_v32i8_v32i8: 4189; AVX512F: ## %bb.0: 4190; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 4191; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 4192; AVX512F-NEXT: vpmovmskb %ymm0, %eax 4193; AVX512F-NEXT: testb $1, %al 4194; AVX512F-NEXT: jne LBB16_1 4195; AVX512F-NEXT: ## %bb.2: ## %else 4196; AVX512F-NEXT: testb $2, %al 4197; AVX512F-NEXT: jne LBB16_3 4198; AVX512F-NEXT: LBB16_4: ## %else2 4199; AVX512F-NEXT: testb $4, %al 4200; AVX512F-NEXT: jne LBB16_5 4201; AVX512F-NEXT: LBB16_6: ## %else4 4202; AVX512F-NEXT: testb $8, %al 4203; AVX512F-NEXT: jne LBB16_7 4204; AVX512F-NEXT: LBB16_8: ## %else6 4205; AVX512F-NEXT: testb $16, %al 4206; AVX512F-NEXT: jne LBB16_9 4207; AVX512F-NEXT: LBB16_10: ## %else8 4208; AVX512F-NEXT: testb $32, %al 4209; AVX512F-NEXT: jne LBB16_11 4210; AVX512F-NEXT: LBB16_12: ## %else10 4211; AVX512F-NEXT: testb $64, %al 4212; AVX512F-NEXT: jne LBB16_13 4213; AVX512F-NEXT: LBB16_14: ## %else12 4214; AVX512F-NEXT: testb %al, %al 4215; AVX512F-NEXT: js LBB16_15 4216; AVX512F-NEXT: LBB16_16: ## %else14 4217; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 4218; AVX512F-NEXT: jne LBB16_17 4219; AVX512F-NEXT: LBB16_18: ## %else16 4220; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 4221; AVX512F-NEXT: jne LBB16_19 4222; AVX512F-NEXT: LBB16_20: ## %else18 4223; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 4224; AVX512F-NEXT: jne LBB16_21 4225; AVX512F-NEXT: LBB16_22: ## %else20 4226; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800 4227; AVX512F-NEXT: jne LBB16_23 4228; AVX512F-NEXT: LBB16_24: ## %else22 4229; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 4230; AVX512F-NEXT: jne LBB16_25 4231; AVX512F-NEXT: LBB16_26: ## %else24 4232; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 4233; AVX512F-NEXT: jne LBB16_27 4234; AVX512F-NEXT: LBB16_28: ## %else26 4235; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 4236; AVX512F-NEXT: jne LBB16_29 4237; AVX512F-NEXT: LBB16_30: ## %else28 4238; AVX512F-NEXT: testw %ax, %ax 4239; AVX512F-NEXT: jns LBB16_32 4240; AVX512F-NEXT: LBB16_31: ## %cond.store29 4241; AVX512F-NEXT: vpextrb $15, %xmm1, 15(%rdi) 4242; AVX512F-NEXT: LBB16_32: ## %else30 4243; AVX512F-NEXT: testl $65536, %eax ## imm = 0x10000 4244; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 4245; AVX512F-NEXT: jne LBB16_33 4246; AVX512F-NEXT: ## %bb.34: ## %else32 4247; AVX512F-NEXT: testl $131072, %eax ## imm = 0x20000 4248; AVX512F-NEXT: jne LBB16_35 4249; AVX512F-NEXT: LBB16_36: ## %else34 4250; AVX512F-NEXT: testl $262144, %eax ## imm = 0x40000 4251; AVX512F-NEXT: jne LBB16_37 4252; AVX512F-NEXT: LBB16_38: ## %else36 4253; AVX512F-NEXT: testl $524288, %eax ## imm = 0x80000 4254; AVX512F-NEXT: jne LBB16_39 4255; AVX512F-NEXT: LBB16_40: ## %else38 4256; AVX512F-NEXT: testl $1048576, %eax ## imm = 0x100000 4257; AVX512F-NEXT: jne LBB16_41 4258; AVX512F-NEXT: LBB16_42: ## %else40 4259; AVX512F-NEXT: testl $2097152, %eax ## imm = 0x200000 4260; AVX512F-NEXT: jne LBB16_43 4261; AVX512F-NEXT: LBB16_44: ## %else42 4262; AVX512F-NEXT: testl $4194304, %eax ## imm = 0x400000 4263; AVX512F-NEXT: jne LBB16_45 4264; AVX512F-NEXT: LBB16_46: ## %else44 4265; AVX512F-NEXT: testl $8388608, %eax ## imm = 0x800000 4266; AVX512F-NEXT: jne LBB16_47 4267; AVX512F-NEXT: LBB16_48: ## %else46 4268; AVX512F-NEXT: testl $16777216, %eax ## imm = 0x1000000 4269; AVX512F-NEXT: jne LBB16_49 4270; AVX512F-NEXT: LBB16_50: ## %else48 4271; AVX512F-NEXT: testl $33554432, %eax ## imm = 0x2000000 4272; AVX512F-NEXT: jne LBB16_51 4273; AVX512F-NEXT: LBB16_52: ## %else50 4274; AVX512F-NEXT: testl $67108864, %eax ## imm = 0x4000000 4275; AVX512F-NEXT: jne LBB16_53 4276; AVX512F-NEXT: LBB16_54: ## %else52 4277; AVX512F-NEXT: testl $134217728, %eax ## imm = 0x8000000 4278; AVX512F-NEXT: jne LBB16_55 4279; AVX512F-NEXT: LBB16_56: ## %else54 4280; AVX512F-NEXT: testl $268435456, %eax ## imm = 0x10000000 4281; AVX512F-NEXT: jne LBB16_57 4282; AVX512F-NEXT: LBB16_58: ## %else56 4283; AVX512F-NEXT: testl $536870912, %eax ## imm = 0x20000000 4284; AVX512F-NEXT: jne LBB16_59 4285; AVX512F-NEXT: LBB16_60: ## %else58 4286; AVX512F-NEXT: testl $1073741824, %eax ## imm = 0x40000000 4287; AVX512F-NEXT: jne LBB16_61 4288; AVX512F-NEXT: LBB16_62: ## %else60 4289; AVX512F-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 4290; AVX512F-NEXT: jne LBB16_63 4291; AVX512F-NEXT: LBB16_64: ## %else62 4292; AVX512F-NEXT: vzeroupper 4293; AVX512F-NEXT: retq 4294; AVX512F-NEXT: LBB16_1: ## %cond.store 4295; AVX512F-NEXT: vpextrb $0, %xmm1, (%rdi) 4296; AVX512F-NEXT: testb $2, %al 4297; AVX512F-NEXT: je LBB16_4 4298; AVX512F-NEXT: LBB16_3: ## %cond.store1 4299; AVX512F-NEXT: vpextrb $1, %xmm1, 1(%rdi) 4300; AVX512F-NEXT: testb $4, %al 4301; AVX512F-NEXT: je LBB16_6 4302; AVX512F-NEXT: LBB16_5: ## %cond.store3 4303; AVX512F-NEXT: vpextrb $2, %xmm1, 2(%rdi) 4304; AVX512F-NEXT: testb $8, %al 4305; AVX512F-NEXT: je LBB16_8 4306; AVX512F-NEXT: LBB16_7: ## %cond.store5 4307; AVX512F-NEXT: vpextrb $3, %xmm1, 3(%rdi) 4308; AVX512F-NEXT: testb $16, %al 4309; AVX512F-NEXT: je LBB16_10 4310; AVX512F-NEXT: LBB16_9: ## %cond.store7 4311; AVX512F-NEXT: vpextrb $4, %xmm1, 4(%rdi) 4312; AVX512F-NEXT: testb $32, %al 4313; AVX512F-NEXT: je LBB16_12 4314; AVX512F-NEXT: LBB16_11: ## %cond.store9 4315; AVX512F-NEXT: vpextrb $5, %xmm1, 5(%rdi) 4316; AVX512F-NEXT: testb $64, %al 4317; AVX512F-NEXT: je LBB16_14 4318; AVX512F-NEXT: LBB16_13: ## %cond.store11 4319; AVX512F-NEXT: vpextrb $6, %xmm1, 6(%rdi) 4320; AVX512F-NEXT: testb %al, %al 4321; AVX512F-NEXT: jns LBB16_16 4322; AVX512F-NEXT: LBB16_15: ## %cond.store13 4323; AVX512F-NEXT: vpextrb $7, %xmm1, 7(%rdi) 4324; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 4325; AVX512F-NEXT: je LBB16_18 4326; AVX512F-NEXT: LBB16_17: ## %cond.store15 4327; AVX512F-NEXT: vpextrb $8, %xmm1, 8(%rdi) 4328; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 4329; AVX512F-NEXT: je LBB16_20 4330; AVX512F-NEXT: LBB16_19: ## %cond.store17 4331; AVX512F-NEXT: vpextrb $9, %xmm1, 9(%rdi) 4332; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 4333; AVX512F-NEXT: je LBB16_22 4334; AVX512F-NEXT: LBB16_21: ## %cond.store19 4335; AVX512F-NEXT: vpextrb $10, %xmm1, 10(%rdi) 4336; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800 4337; AVX512F-NEXT: je LBB16_24 4338; AVX512F-NEXT: LBB16_23: ## %cond.store21 4339; AVX512F-NEXT: vpextrb $11, %xmm1, 11(%rdi) 4340; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 4341; AVX512F-NEXT: je LBB16_26 4342; AVX512F-NEXT: LBB16_25: ## %cond.store23 4343; AVX512F-NEXT: vpextrb $12, %xmm1, 12(%rdi) 4344; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 4345; AVX512F-NEXT: je LBB16_28 4346; AVX512F-NEXT: LBB16_27: ## %cond.store25 4347; AVX512F-NEXT: vpextrb $13, %xmm1, 13(%rdi) 4348; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 4349; AVX512F-NEXT: je LBB16_30 4350; AVX512F-NEXT: LBB16_29: ## %cond.store27 4351; AVX512F-NEXT: vpextrb $14, %xmm1, 14(%rdi) 4352; AVX512F-NEXT: testw %ax, %ax 4353; AVX512F-NEXT: js LBB16_31 4354; AVX512F-NEXT: jmp LBB16_32 4355; AVX512F-NEXT: LBB16_33: ## %cond.store31 4356; AVX512F-NEXT: vpextrb $0, %xmm0, 16(%rdi) 4357; AVX512F-NEXT: testl $131072, %eax ## imm = 0x20000 4358; AVX512F-NEXT: je LBB16_36 4359; AVX512F-NEXT: LBB16_35: ## %cond.store33 4360; AVX512F-NEXT: vpextrb $1, %xmm0, 17(%rdi) 4361; AVX512F-NEXT: testl $262144, %eax ## imm = 0x40000 4362; AVX512F-NEXT: je LBB16_38 4363; AVX512F-NEXT: LBB16_37: ## %cond.store35 4364; AVX512F-NEXT: vpextrb $2, %xmm0, 18(%rdi) 4365; AVX512F-NEXT: testl $524288, %eax ## imm = 0x80000 4366; AVX512F-NEXT: je LBB16_40 4367; AVX512F-NEXT: LBB16_39: ## %cond.store37 4368; AVX512F-NEXT: vpextrb $3, %xmm0, 19(%rdi) 4369; AVX512F-NEXT: testl $1048576, %eax ## imm = 0x100000 4370; AVX512F-NEXT: je LBB16_42 4371; AVX512F-NEXT: LBB16_41: ## %cond.store39 4372; AVX512F-NEXT: vpextrb $4, %xmm0, 20(%rdi) 4373; AVX512F-NEXT: testl $2097152, %eax ## imm = 0x200000 4374; AVX512F-NEXT: je LBB16_44 4375; AVX512F-NEXT: LBB16_43: ## %cond.store41 4376; AVX512F-NEXT: vpextrb $5, %xmm0, 21(%rdi) 4377; AVX512F-NEXT: testl $4194304, %eax ## imm = 0x400000 4378; AVX512F-NEXT: je LBB16_46 4379; AVX512F-NEXT: LBB16_45: ## %cond.store43 4380; AVX512F-NEXT: vpextrb $6, %xmm0, 22(%rdi) 4381; AVX512F-NEXT: testl $8388608, %eax ## imm = 0x800000 4382; AVX512F-NEXT: je LBB16_48 4383; AVX512F-NEXT: LBB16_47: ## %cond.store45 4384; AVX512F-NEXT: vpextrb $7, %xmm0, 23(%rdi) 4385; AVX512F-NEXT: testl $16777216, %eax ## imm = 0x1000000 4386; AVX512F-NEXT: je LBB16_50 4387; AVX512F-NEXT: LBB16_49: ## %cond.store47 4388; AVX512F-NEXT: vpextrb $8, %xmm0, 24(%rdi) 4389; AVX512F-NEXT: testl $33554432, %eax ## imm = 0x2000000 4390; AVX512F-NEXT: je LBB16_52 4391; AVX512F-NEXT: LBB16_51: ## %cond.store49 4392; AVX512F-NEXT: vpextrb $9, %xmm0, 25(%rdi) 4393; AVX512F-NEXT: testl $67108864, %eax ## imm = 0x4000000 4394; AVX512F-NEXT: je LBB16_54 4395; AVX512F-NEXT: LBB16_53: ## %cond.store51 4396; AVX512F-NEXT: vpextrb $10, %xmm0, 26(%rdi) 4397; AVX512F-NEXT: testl $134217728, %eax ## imm = 0x8000000 4398; AVX512F-NEXT: je LBB16_56 4399; AVX512F-NEXT: LBB16_55: ## %cond.store53 4400; AVX512F-NEXT: vpextrb $11, %xmm0, 27(%rdi) 4401; AVX512F-NEXT: testl $268435456, %eax ## imm = 0x10000000 4402; AVX512F-NEXT: je LBB16_58 4403; AVX512F-NEXT: LBB16_57: ## %cond.store55 4404; AVX512F-NEXT: vpextrb $12, %xmm0, 28(%rdi) 4405; AVX512F-NEXT: testl $536870912, %eax ## imm = 0x20000000 4406; AVX512F-NEXT: je LBB16_60 4407; AVX512F-NEXT: LBB16_59: ## %cond.store57 4408; AVX512F-NEXT: vpextrb $13, %xmm0, 29(%rdi) 4409; AVX512F-NEXT: testl $1073741824, %eax ## imm = 0x40000000 4410; AVX512F-NEXT: je LBB16_62 4411; AVX512F-NEXT: LBB16_61: ## %cond.store59 4412; AVX512F-NEXT: vpextrb $14, %xmm0, 30(%rdi) 4413; AVX512F-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 4414; AVX512F-NEXT: je LBB16_64 4415; AVX512F-NEXT: LBB16_63: ## %cond.store61 4416; AVX512F-NEXT: vpextrb $15, %xmm0, 31(%rdi) 4417; AVX512F-NEXT: vzeroupper 4418; AVX512F-NEXT: retq 4419; 4420; AVX512VLDQ-LABEL: store_v32i8_v32i8: 4421; AVX512VLDQ: ## %bb.0: 4422; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 4423; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 4424; AVX512VLDQ-NEXT: vpmovmskb %ymm0, %eax 4425; AVX512VLDQ-NEXT: testb $1, %al 4426; AVX512VLDQ-NEXT: jne LBB16_1 4427; AVX512VLDQ-NEXT: ## %bb.2: ## %else 4428; AVX512VLDQ-NEXT: testb $2, %al 4429; AVX512VLDQ-NEXT: jne LBB16_3 4430; AVX512VLDQ-NEXT: LBB16_4: ## %else2 4431; AVX512VLDQ-NEXT: testb $4, %al 4432; AVX512VLDQ-NEXT: jne LBB16_5 4433; AVX512VLDQ-NEXT: LBB16_6: ## %else4 4434; AVX512VLDQ-NEXT: testb $8, %al 4435; AVX512VLDQ-NEXT: jne LBB16_7 4436; AVX512VLDQ-NEXT: LBB16_8: ## %else6 4437; AVX512VLDQ-NEXT: testb $16, %al 4438; AVX512VLDQ-NEXT: jne LBB16_9 4439; AVX512VLDQ-NEXT: LBB16_10: ## %else8 4440; AVX512VLDQ-NEXT: testb $32, %al 4441; AVX512VLDQ-NEXT: jne LBB16_11 4442; AVX512VLDQ-NEXT: LBB16_12: ## %else10 4443; AVX512VLDQ-NEXT: testb $64, %al 4444; AVX512VLDQ-NEXT: jne LBB16_13 4445; AVX512VLDQ-NEXT: LBB16_14: ## %else12 4446; AVX512VLDQ-NEXT: testb %al, %al 4447; AVX512VLDQ-NEXT: js LBB16_15 4448; AVX512VLDQ-NEXT: LBB16_16: ## %else14 4449; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100 4450; AVX512VLDQ-NEXT: jne LBB16_17 4451; AVX512VLDQ-NEXT: LBB16_18: ## %else16 4452; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200 4453; AVX512VLDQ-NEXT: jne LBB16_19 4454; AVX512VLDQ-NEXT: LBB16_20: ## %else18 4455; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400 4456; AVX512VLDQ-NEXT: jne LBB16_21 4457; AVX512VLDQ-NEXT: LBB16_22: ## %else20 4458; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800 4459; AVX512VLDQ-NEXT: jne LBB16_23 4460; AVX512VLDQ-NEXT: LBB16_24: ## %else22 4461; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000 4462; AVX512VLDQ-NEXT: jne LBB16_25 4463; AVX512VLDQ-NEXT: LBB16_26: ## %else24 4464; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000 4465; AVX512VLDQ-NEXT: jne LBB16_27 4466; AVX512VLDQ-NEXT: LBB16_28: ## %else26 4467; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000 4468; AVX512VLDQ-NEXT: jne LBB16_29 4469; AVX512VLDQ-NEXT: LBB16_30: ## %else28 4470; AVX512VLDQ-NEXT: testw %ax, %ax 4471; AVX512VLDQ-NEXT: jns LBB16_32 4472; AVX512VLDQ-NEXT: LBB16_31: ## %cond.store29 4473; AVX512VLDQ-NEXT: vpextrb $15, %xmm1, 15(%rdi) 4474; AVX512VLDQ-NEXT: LBB16_32: ## %else30 4475; AVX512VLDQ-NEXT: testl $65536, %eax ## imm = 0x10000 4476; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 4477; AVX512VLDQ-NEXT: jne LBB16_33 4478; AVX512VLDQ-NEXT: ## %bb.34: ## %else32 4479; AVX512VLDQ-NEXT: testl $131072, %eax ## imm = 0x20000 4480; AVX512VLDQ-NEXT: jne LBB16_35 4481; AVX512VLDQ-NEXT: LBB16_36: ## %else34 4482; AVX512VLDQ-NEXT: testl $262144, %eax ## imm = 0x40000 4483; AVX512VLDQ-NEXT: jne LBB16_37 4484; AVX512VLDQ-NEXT: LBB16_38: ## %else36 4485; AVX512VLDQ-NEXT: testl $524288, %eax ## imm = 0x80000 4486; AVX512VLDQ-NEXT: jne LBB16_39 4487; AVX512VLDQ-NEXT: LBB16_40: ## %else38 4488; AVX512VLDQ-NEXT: testl $1048576, %eax ## imm = 0x100000 4489; AVX512VLDQ-NEXT: jne LBB16_41 4490; AVX512VLDQ-NEXT: LBB16_42: ## %else40 4491; AVX512VLDQ-NEXT: testl $2097152, %eax ## imm = 0x200000 4492; AVX512VLDQ-NEXT: jne LBB16_43 4493; AVX512VLDQ-NEXT: LBB16_44: ## %else42 4494; AVX512VLDQ-NEXT: testl $4194304, %eax ## imm = 0x400000 4495; AVX512VLDQ-NEXT: jne LBB16_45 4496; AVX512VLDQ-NEXT: LBB16_46: ## %else44 4497; AVX512VLDQ-NEXT: testl $8388608, %eax ## imm = 0x800000 4498; AVX512VLDQ-NEXT: jne LBB16_47 4499; AVX512VLDQ-NEXT: LBB16_48: ## %else46 4500; AVX512VLDQ-NEXT: testl $16777216, %eax ## imm = 0x1000000 4501; AVX512VLDQ-NEXT: jne LBB16_49 4502; AVX512VLDQ-NEXT: LBB16_50: ## %else48 4503; AVX512VLDQ-NEXT: testl $33554432, %eax ## imm = 0x2000000 4504; AVX512VLDQ-NEXT: jne LBB16_51 4505; AVX512VLDQ-NEXT: LBB16_52: ## %else50 4506; AVX512VLDQ-NEXT: testl $67108864, %eax ## imm = 0x4000000 4507; AVX512VLDQ-NEXT: jne LBB16_53 4508; AVX512VLDQ-NEXT: LBB16_54: ## %else52 4509; AVX512VLDQ-NEXT: testl $134217728, %eax ## imm = 0x8000000 4510; AVX512VLDQ-NEXT: jne LBB16_55 4511; AVX512VLDQ-NEXT: LBB16_56: ## %else54 4512; AVX512VLDQ-NEXT: testl $268435456, %eax ## imm = 0x10000000 4513; AVX512VLDQ-NEXT: jne LBB16_57 4514; AVX512VLDQ-NEXT: LBB16_58: ## %else56 4515; AVX512VLDQ-NEXT: testl $536870912, %eax ## imm = 0x20000000 4516; AVX512VLDQ-NEXT: jne LBB16_59 4517; AVX512VLDQ-NEXT: LBB16_60: ## %else58 4518; AVX512VLDQ-NEXT: testl $1073741824, %eax ## imm = 0x40000000 4519; AVX512VLDQ-NEXT: jne LBB16_61 4520; AVX512VLDQ-NEXT: LBB16_62: ## %else60 4521; AVX512VLDQ-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 4522; AVX512VLDQ-NEXT: jne LBB16_63 4523; AVX512VLDQ-NEXT: LBB16_64: ## %else62 4524; AVX512VLDQ-NEXT: vzeroupper 4525; AVX512VLDQ-NEXT: retq 4526; AVX512VLDQ-NEXT: LBB16_1: ## %cond.store 4527; AVX512VLDQ-NEXT: vpextrb $0, %xmm1, (%rdi) 4528; AVX512VLDQ-NEXT: testb $2, %al 4529; AVX512VLDQ-NEXT: je LBB16_4 4530; AVX512VLDQ-NEXT: LBB16_3: ## %cond.store1 4531; AVX512VLDQ-NEXT: vpextrb $1, %xmm1, 1(%rdi) 4532; AVX512VLDQ-NEXT: testb $4, %al 4533; AVX512VLDQ-NEXT: je LBB16_6 4534; AVX512VLDQ-NEXT: LBB16_5: ## %cond.store3 4535; AVX512VLDQ-NEXT: vpextrb $2, %xmm1, 2(%rdi) 4536; AVX512VLDQ-NEXT: testb $8, %al 4537; AVX512VLDQ-NEXT: je LBB16_8 4538; AVX512VLDQ-NEXT: LBB16_7: ## %cond.store5 4539; AVX512VLDQ-NEXT: vpextrb $3, %xmm1, 3(%rdi) 4540; AVX512VLDQ-NEXT: testb $16, %al 4541; AVX512VLDQ-NEXT: je LBB16_10 4542; AVX512VLDQ-NEXT: LBB16_9: ## %cond.store7 4543; AVX512VLDQ-NEXT: vpextrb $4, %xmm1, 4(%rdi) 4544; AVX512VLDQ-NEXT: testb $32, %al 4545; AVX512VLDQ-NEXT: je LBB16_12 4546; AVX512VLDQ-NEXT: LBB16_11: ## %cond.store9 4547; AVX512VLDQ-NEXT: vpextrb $5, %xmm1, 5(%rdi) 4548; AVX512VLDQ-NEXT: testb $64, %al 4549; AVX512VLDQ-NEXT: je LBB16_14 4550; AVX512VLDQ-NEXT: LBB16_13: ## %cond.store11 4551; AVX512VLDQ-NEXT: vpextrb $6, %xmm1, 6(%rdi) 4552; AVX512VLDQ-NEXT: testb %al, %al 4553; AVX512VLDQ-NEXT: jns LBB16_16 4554; AVX512VLDQ-NEXT: LBB16_15: ## %cond.store13 4555; AVX512VLDQ-NEXT: vpextrb $7, %xmm1, 7(%rdi) 4556; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100 4557; AVX512VLDQ-NEXT: je LBB16_18 4558; AVX512VLDQ-NEXT: LBB16_17: ## %cond.store15 4559; AVX512VLDQ-NEXT: vpextrb $8, %xmm1, 8(%rdi) 4560; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200 4561; AVX512VLDQ-NEXT: je LBB16_20 4562; AVX512VLDQ-NEXT: LBB16_19: ## %cond.store17 4563; AVX512VLDQ-NEXT: vpextrb $9, %xmm1, 9(%rdi) 4564; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400 4565; AVX512VLDQ-NEXT: je LBB16_22 4566; AVX512VLDQ-NEXT: LBB16_21: ## %cond.store19 4567; AVX512VLDQ-NEXT: vpextrb $10, %xmm1, 10(%rdi) 4568; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800 4569; AVX512VLDQ-NEXT: je LBB16_24 4570; AVX512VLDQ-NEXT: LBB16_23: ## %cond.store21 4571; AVX512VLDQ-NEXT: vpextrb $11, %xmm1, 11(%rdi) 4572; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000 4573; AVX512VLDQ-NEXT: je LBB16_26 4574; AVX512VLDQ-NEXT: LBB16_25: ## %cond.store23 4575; AVX512VLDQ-NEXT: vpextrb $12, %xmm1, 12(%rdi) 4576; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000 4577; AVX512VLDQ-NEXT: je LBB16_28 4578; AVX512VLDQ-NEXT: LBB16_27: ## %cond.store25 4579; AVX512VLDQ-NEXT: vpextrb $13, %xmm1, 13(%rdi) 4580; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000 4581; AVX512VLDQ-NEXT: je LBB16_30 4582; AVX512VLDQ-NEXT: LBB16_29: ## %cond.store27 4583; AVX512VLDQ-NEXT: vpextrb $14, %xmm1, 14(%rdi) 4584; AVX512VLDQ-NEXT: testw %ax, %ax 4585; AVX512VLDQ-NEXT: js LBB16_31 4586; AVX512VLDQ-NEXT: jmp LBB16_32 4587; AVX512VLDQ-NEXT: LBB16_33: ## %cond.store31 4588; AVX512VLDQ-NEXT: vpextrb $0, %xmm0, 16(%rdi) 4589; AVX512VLDQ-NEXT: testl $131072, %eax ## imm = 0x20000 4590; AVX512VLDQ-NEXT: je LBB16_36 4591; AVX512VLDQ-NEXT: LBB16_35: ## %cond.store33 4592; AVX512VLDQ-NEXT: vpextrb $1, %xmm0, 17(%rdi) 4593; AVX512VLDQ-NEXT: testl $262144, %eax ## imm = 0x40000 4594; AVX512VLDQ-NEXT: je LBB16_38 4595; AVX512VLDQ-NEXT: LBB16_37: ## %cond.store35 4596; AVX512VLDQ-NEXT: vpextrb $2, %xmm0, 18(%rdi) 4597; AVX512VLDQ-NEXT: testl $524288, %eax ## imm = 0x80000 4598; AVX512VLDQ-NEXT: je LBB16_40 4599; AVX512VLDQ-NEXT: LBB16_39: ## %cond.store37 4600; AVX512VLDQ-NEXT: vpextrb $3, %xmm0, 19(%rdi) 4601; AVX512VLDQ-NEXT: testl $1048576, %eax ## imm = 0x100000 4602; AVX512VLDQ-NEXT: je LBB16_42 4603; AVX512VLDQ-NEXT: LBB16_41: ## %cond.store39 4604; AVX512VLDQ-NEXT: vpextrb $4, %xmm0, 20(%rdi) 4605; AVX512VLDQ-NEXT: testl $2097152, %eax ## imm = 0x200000 4606; AVX512VLDQ-NEXT: je LBB16_44 4607; AVX512VLDQ-NEXT: LBB16_43: ## %cond.store41 4608; AVX512VLDQ-NEXT: vpextrb $5, %xmm0, 21(%rdi) 4609; AVX512VLDQ-NEXT: testl $4194304, %eax ## imm = 0x400000 4610; AVX512VLDQ-NEXT: je LBB16_46 4611; AVX512VLDQ-NEXT: LBB16_45: ## %cond.store43 4612; AVX512VLDQ-NEXT: vpextrb $6, %xmm0, 22(%rdi) 4613; AVX512VLDQ-NEXT: testl $8388608, %eax ## imm = 0x800000 4614; AVX512VLDQ-NEXT: je LBB16_48 4615; AVX512VLDQ-NEXT: LBB16_47: ## %cond.store45 4616; AVX512VLDQ-NEXT: vpextrb $7, %xmm0, 23(%rdi) 4617; AVX512VLDQ-NEXT: testl $16777216, %eax ## imm = 0x1000000 4618; AVX512VLDQ-NEXT: je LBB16_50 4619; AVX512VLDQ-NEXT: LBB16_49: ## %cond.store47 4620; AVX512VLDQ-NEXT: vpextrb $8, %xmm0, 24(%rdi) 4621; AVX512VLDQ-NEXT: testl $33554432, %eax ## imm = 0x2000000 4622; AVX512VLDQ-NEXT: je LBB16_52 4623; AVX512VLDQ-NEXT: LBB16_51: ## %cond.store49 4624; AVX512VLDQ-NEXT: vpextrb $9, %xmm0, 25(%rdi) 4625; AVX512VLDQ-NEXT: testl $67108864, %eax ## imm = 0x4000000 4626; AVX512VLDQ-NEXT: je LBB16_54 4627; AVX512VLDQ-NEXT: LBB16_53: ## %cond.store51 4628; AVX512VLDQ-NEXT: vpextrb $10, %xmm0, 26(%rdi) 4629; AVX512VLDQ-NEXT: testl $134217728, %eax ## imm = 0x8000000 4630; AVX512VLDQ-NEXT: je LBB16_56 4631; AVX512VLDQ-NEXT: LBB16_55: ## %cond.store53 4632; AVX512VLDQ-NEXT: vpextrb $11, %xmm0, 27(%rdi) 4633; AVX512VLDQ-NEXT: testl $268435456, %eax ## imm = 0x10000000 4634; AVX512VLDQ-NEXT: je LBB16_58 4635; AVX512VLDQ-NEXT: LBB16_57: ## %cond.store55 4636; AVX512VLDQ-NEXT: vpextrb $12, %xmm0, 28(%rdi) 4637; AVX512VLDQ-NEXT: testl $536870912, %eax ## imm = 0x20000000 4638; AVX512VLDQ-NEXT: je LBB16_60 4639; AVX512VLDQ-NEXT: LBB16_59: ## %cond.store57 4640; AVX512VLDQ-NEXT: vpextrb $13, %xmm0, 29(%rdi) 4641; AVX512VLDQ-NEXT: testl $1073741824, %eax ## imm = 0x40000000 4642; AVX512VLDQ-NEXT: je LBB16_62 4643; AVX512VLDQ-NEXT: LBB16_61: ## %cond.store59 4644; AVX512VLDQ-NEXT: vpextrb $14, %xmm0, 30(%rdi) 4645; AVX512VLDQ-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 4646; AVX512VLDQ-NEXT: je LBB16_64 4647; AVX512VLDQ-NEXT: LBB16_63: ## %cond.store61 4648; AVX512VLDQ-NEXT: vpextrb $15, %xmm0, 31(%rdi) 4649; AVX512VLDQ-NEXT: vzeroupper 4650; AVX512VLDQ-NEXT: retq 4651; 4652; AVX512VLBW-LABEL: store_v32i8_v32i8: 4653; AVX512VLBW: ## %bb.0: 4654; AVX512VLBW-NEXT: vptestnmb %ymm0, %ymm0, %k1 4655; AVX512VLBW-NEXT: vmovdqu8 %ymm1, (%rdi) {%k1} 4656; AVX512VLBW-NEXT: vzeroupper 4657; AVX512VLBW-NEXT: retq 4658; 4659; X86-AVX512-LABEL: store_v32i8_v32i8: 4660; X86-AVX512: ## %bb.0: 4661; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 4662; X86-AVX512-NEXT: vptestnmb %ymm0, %ymm0, %k1 4663; X86-AVX512-NEXT: vmovdqu8 %ymm1, (%eax) {%k1} 4664; X86-AVX512-NEXT: vzeroupper 4665; X86-AVX512-NEXT: retl 4666 %mask = icmp eq <32 x i8> %trigger, zeroinitializer 4667 call void @llvm.masked.store.v32i8.p0(<32 x i8> %val, ptr %addr, i32 4, <32 x i1> %mask) 4668 ret void 4669} 4670 4671;;; Stores with Constant Masks 4672 4673define void @mstore_constmask_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) nounwind { 4674; SSE-LABEL: mstore_constmask_v4i32_v4i32: 4675; SSE: ## %bb.0: 4676; SSE-NEXT: movups %xmm1, (%rdi) 4677; SSE-NEXT: retq 4678; 4679; AVX-LABEL: mstore_constmask_v4i32_v4i32: 4680; AVX: ## %bb.0: 4681; AVX-NEXT: vmovups %xmm1, (%rdi) 4682; AVX-NEXT: retq 4683; 4684; X86-AVX512-LABEL: mstore_constmask_v4i32_v4i32: 4685; X86-AVX512: ## %bb.0: 4686; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 4687; X86-AVX512-NEXT: vmovups %xmm1, (%eax) 4688; X86-AVX512-NEXT: retl 4689 %mask = icmp eq <4 x i32> %trigger, zeroinitializer 4690 call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>) 4691 ret void 4692} 4693 4694; Make sure we are able to detect all ones constant mask after type legalization 4695; to avoid masked stores. 4696define void @mstore_constmask_allones_split(<16 x i64> %trigger, ptr %addr, <16 x i64> %val) nounwind { 4697; SSE2-LABEL: mstore_constmask_allones_split: 4698; SSE2: ## %bb.0: 4699; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 4700; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 4701; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 4702; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 4703; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 4704; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 4705; SSE2-NEXT: movq %xmm5, (%rdi) 4706; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] 4707; SSE2-NEXT: movq %xmm5, 8(%rdi) 4708; SSE2-NEXT: pshufd {{.*#+}} xmm5 = mem[2,3,2,3] 4709; SSE2-NEXT: movq %xmm5, 24(%rdi) 4710; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax 4711; SSE2-NEXT: movq %rax, 32(%rdi) 4712; SSE2-NEXT: movq %xmm4, 48(%rdi) 4713; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] 4714; SSE2-NEXT: movq %xmm4, 56(%rdi) 4715; SSE2-NEXT: movq %xmm3, 64(%rdi) 4716; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] 4717; SSE2-NEXT: movq %xmm3, 72(%rdi) 4718; SSE2-NEXT: movq %xmm2, 80(%rdi) 4719; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 4720; SSE2-NEXT: movq %xmm2, 88(%rdi) 4721; SSE2-NEXT: movq %xmm1, 96(%rdi) 4722; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 4723; SSE2-NEXT: movq %xmm1, 104(%rdi) 4724; SSE2-NEXT: movq %xmm0, 112(%rdi) 4725; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 4726; SSE2-NEXT: movq %xmm0, 120(%rdi) 4727; SSE2-NEXT: retq 4728; 4729; SSE4-LABEL: mstore_constmask_allones_split: 4730; SSE4: ## %bb.0: 4731; SSE4-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 4732; SSE4-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 4733; SSE4-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 4734; SSE4-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 4735; SSE4-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 4736; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 4737; SSE4-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 4738; SSE4-NEXT: movups %xmm6, (%rdi) 4739; SSE4-NEXT: palignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] 4740; SSE4-NEXT: movdqu %xmm5, 24(%rdi) 4741; SSE4-NEXT: movups %xmm4, 48(%rdi) 4742; SSE4-NEXT: movups %xmm3, 64(%rdi) 4743; SSE4-NEXT: movups %xmm2, 80(%rdi) 4744; SSE4-NEXT: movups %xmm1, 96(%rdi) 4745; SSE4-NEXT: movups %xmm0, 112(%rdi) 4746; SSE4-NEXT: retq 4747; 4748; AVX1-LABEL: mstore_constmask_allones_split: 4749; AVX1: ## %bb.0: 4750; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,18446744073709551615] 4751; AVX1-NEXT: vmaskmovpd %ymm5, %ymm0, 32(%rdi) 4752; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,0,18446744073709551615] 4753; AVX1-NEXT: vmaskmovpd %ymm4, %ymm0, (%rdi) 4754; AVX1-NEXT: vmovups %ymm7, 96(%rdi) 4755; AVX1-NEXT: vmovups %ymm6, 64(%rdi) 4756; AVX1-NEXT: vzeroupper 4757; AVX1-NEXT: retq 4758; 4759; AVX2-LABEL: mstore_constmask_allones_split: 4760; AVX2: ## %bb.0: 4761; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,18446744073709551615] 4762; AVX2-NEXT: vpmaskmovq %ymm5, %ymm0, 32(%rdi) 4763; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,0,18446744073709551615] 4764; AVX2-NEXT: vpmaskmovq %ymm4, %ymm0, (%rdi) 4765; AVX2-NEXT: vmovups %ymm7, 96(%rdi) 4766; AVX2-NEXT: vmovups %ymm6, 64(%rdi) 4767; AVX2-NEXT: vzeroupper 4768; AVX2-NEXT: retq 4769; 4770; AVX512F-LABEL: mstore_constmask_allones_split: 4771; AVX512F: ## %bb.0: 4772; AVX512F-NEXT: movb $-37, %al 4773; AVX512F-NEXT: kmovw %eax, %k1 4774; AVX512F-NEXT: vmovdqu64 %zmm2, (%rdi) {%k1} 4775; AVX512F-NEXT: vmovups %zmm3, 64(%rdi) 4776; AVX512F-NEXT: vzeroupper 4777; AVX512F-NEXT: retq 4778; 4779; AVX512VLDQ-LABEL: mstore_constmask_allones_split: 4780; AVX512VLDQ: ## %bb.0: 4781; AVX512VLDQ-NEXT: movb $-37, %al 4782; AVX512VLDQ-NEXT: kmovw %eax, %k1 4783; AVX512VLDQ-NEXT: vmovdqu64 %zmm2, (%rdi) {%k1} 4784; AVX512VLDQ-NEXT: vmovups %zmm3, 64(%rdi) 4785; AVX512VLDQ-NEXT: vzeroupper 4786; AVX512VLDQ-NEXT: retq 4787; 4788; AVX512VLBW-LABEL: mstore_constmask_allones_split: 4789; AVX512VLBW: ## %bb.0: 4790; AVX512VLBW-NEXT: movb $-37, %al 4791; AVX512VLBW-NEXT: kmovd %eax, %k1 4792; AVX512VLBW-NEXT: vmovdqu64 %zmm2, (%rdi) {%k1} 4793; AVX512VLBW-NEXT: vmovups %zmm3, 64(%rdi) 4794; AVX512VLBW-NEXT: vzeroupper 4795; AVX512VLBW-NEXT: retq 4796; 4797; X86-AVX512-LABEL: mstore_constmask_allones_split: 4798; X86-AVX512: ## %bb.0: 4799; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 4800; X86-AVX512-NEXT: movb $-37, %cl 4801; X86-AVX512-NEXT: kmovd %ecx, %k1 4802; X86-AVX512-NEXT: vmovdqu64 %zmm2, (%eax) {%k1} 4803; X86-AVX512-NEXT: vmovups %zmm3, 64(%eax) 4804; X86-AVX512-NEXT: vzeroupper 4805; X86-AVX512-NEXT: retl 4806 %mask = icmp eq <16 x i64> %trigger, zeroinitializer 4807 call void @llvm.masked.store.v16i64.p0(<16 x i64> %val, ptr %addr, i32 4, <16 x i1><i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) 4808 ret void 4809} 4810 4811; When only one element of the mask is set, reduce to a scalar store. 4812 4813define void @one_mask_bit_set1(ptr %addr, <4 x i32> %val) nounwind { 4814; SSE-LABEL: one_mask_bit_set1: 4815; SSE: ## %bb.0: 4816; SSE-NEXT: movss %xmm0, (%rdi) 4817; SSE-NEXT: retq 4818; 4819; AVX-LABEL: one_mask_bit_set1: 4820; AVX: ## %bb.0: 4821; AVX-NEXT: vmovss %xmm0, (%rdi) 4822; AVX-NEXT: retq 4823; 4824; X86-AVX512-LABEL: one_mask_bit_set1: 4825; X86-AVX512: ## %bb.0: 4826; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 4827; X86-AVX512-NEXT: vmovss %xmm0, (%eax) 4828; X86-AVX512-NEXT: retl 4829 call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>) 4830 ret void 4831} 4832 4833; Choose a different element to show that the correct address offset is produced. 4834 4835define void @one_mask_bit_set2(ptr %addr, <4 x float> %val) nounwind { 4836; SSE2-LABEL: one_mask_bit_set2: 4837; SSE2: ## %bb.0: 4838; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 4839; SSE2-NEXT: movss %xmm0, 8(%rdi) 4840; SSE2-NEXT: retq 4841; 4842; SSE4-LABEL: one_mask_bit_set2: 4843; SSE4: ## %bb.0: 4844; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi) 4845; SSE4-NEXT: retq 4846; 4847; AVX-LABEL: one_mask_bit_set2: 4848; AVX: ## %bb.0: 4849; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi) 4850; AVX-NEXT: retq 4851; 4852; X86-AVX512-LABEL: one_mask_bit_set2: 4853; X86-AVX512: ## %bb.0: 4854; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 4855; X86-AVX512-NEXT: vextractps $2, %xmm0, 8(%eax) 4856; X86-AVX512-NEXT: retl 4857 call void @llvm.masked.store.v4f32.p0(<4 x float> %val, ptr %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>) 4858 ret void 4859} 4860 4861; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly. 4862 4863define void @one_mask_bit_set3(ptr %addr, <4 x i64> %val) nounwind { 4864; SSE-LABEL: one_mask_bit_set3: 4865; SSE: ## %bb.0: 4866; SSE-NEXT: movlps %xmm1, 16(%rdi) 4867; SSE-NEXT: retq 4868; 4869; AVX-LABEL: one_mask_bit_set3: 4870; AVX: ## %bb.0: 4871; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 4872; AVX-NEXT: vmovlps %xmm0, 16(%rdi) 4873; AVX-NEXT: vzeroupper 4874; AVX-NEXT: retq 4875; 4876; X86-AVX512-LABEL: one_mask_bit_set3: 4877; X86-AVX512: ## %bb.0: 4878; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 4879; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 4880; X86-AVX512-NEXT: vmovlps %xmm0, 16(%eax) 4881; X86-AVX512-NEXT: vzeroupper 4882; X86-AVX512-NEXT: retl 4883 call void @llvm.masked.store.v4i64.p0(<4 x i64> %val, ptr %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>) 4884 ret void 4885} 4886 4887; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly. 4888 4889define void @one_mask_bit_set4(ptr %addr, <4 x double> %val) nounwind { 4890; SSE-LABEL: one_mask_bit_set4: 4891; SSE: ## %bb.0: 4892; SSE-NEXT: movhps %xmm1, 24(%rdi) 4893; SSE-NEXT: retq 4894; 4895; AVX-LABEL: one_mask_bit_set4: 4896; AVX: ## %bb.0: 4897; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 4898; AVX-NEXT: vmovhps %xmm0, 24(%rdi) 4899; AVX-NEXT: vzeroupper 4900; AVX-NEXT: retq 4901; 4902; X86-AVX512-LABEL: one_mask_bit_set4: 4903; X86-AVX512: ## %bb.0: 4904; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 4905; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 4906; X86-AVX512-NEXT: vmovhps %xmm0, 24(%eax) 4907; X86-AVX512-NEXT: vzeroupper 4908; X86-AVX512-NEXT: retl 4909 call void @llvm.masked.store.v4f64.p0(<4 x double> %val, ptr %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>) 4910 ret void 4911} 4912 4913; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected. 4914 4915define void @one_mask_bit_set5(ptr %addr, <8 x double> %val) nounwind { 4916; SSE-LABEL: one_mask_bit_set5: 4917; SSE: ## %bb.0: 4918; SSE-NEXT: movlps %xmm3, 48(%rdi) 4919; SSE-NEXT: retq 4920; 4921; AVX1OR2-LABEL: one_mask_bit_set5: 4922; AVX1OR2: ## %bb.0: 4923; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm0 4924; AVX1OR2-NEXT: vmovlps %xmm0, 48(%rdi) 4925; AVX1OR2-NEXT: vzeroupper 4926; AVX1OR2-NEXT: retq 4927; 4928; AVX512-LABEL: one_mask_bit_set5: 4929; AVX512: ## %bb.0: 4930; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 4931; AVX512-NEXT: vmovlps %xmm0, 48(%rdi) 4932; AVX512-NEXT: vzeroupper 4933; AVX512-NEXT: retq 4934; 4935; X86-AVX512-LABEL: one_mask_bit_set5: 4936; X86-AVX512: ## %bb.0: 4937; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 4938; X86-AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 4939; X86-AVX512-NEXT: vmovlps %xmm0, 48(%eax) 4940; X86-AVX512-NEXT: vzeroupper 4941; X86-AVX512-NEXT: retl 4942 call void @llvm.masked.store.v8f64.p0(<8 x double> %val, ptr %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>) 4943 ret void 4944} 4945 4946; Try one elt in each half of a vector that needs to split 4947define void @one_mask_bit_set6(ptr %addr, <16 x i64> %val) nounwind { 4948; SSE2-LABEL: one_mask_bit_set6: 4949; SSE2: ## %bb.0: 4950; SSE2-NEXT: movlps %xmm3, 48(%rdi) 4951; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] 4952; SSE2-NEXT: movq %xmm0, 88(%rdi) 4953; SSE2-NEXT: retq 4954; 4955; SSE4-LABEL: one_mask_bit_set6: 4956; SSE4: ## %bb.0: 4957; SSE4-NEXT: movlps %xmm3, 48(%rdi) 4958; SSE4-NEXT: pextrq $1, %xmm5, 88(%rdi) 4959; SSE4-NEXT: retq 4960; 4961; AVX1-LABEL: one_mask_bit_set6: 4962; AVX1: ## %bb.0: 4963; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [0,0,0,18446744073709551615] 4964; AVX1-NEXT: vmaskmovpd %ymm2, %ymm0, 64(%rdi) 4965; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [0,0,18446744073709551615,0] 4966; AVX1-NEXT: vmaskmovpd %ymm1, %ymm0, 32(%rdi) 4967; AVX1-NEXT: vzeroupper 4968; AVX1-NEXT: retq 4969; 4970; AVX2-LABEL: one_mask_bit_set6: 4971; AVX2: ## %bb.0: 4972; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,0,18446744073709551615] 4973; AVX2-NEXT: vpmaskmovq %ymm2, %ymm0, 64(%rdi) 4974; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,18446744073709551615,0] 4975; AVX2-NEXT: vpmaskmovq %ymm1, %ymm0, 32(%rdi) 4976; AVX2-NEXT: vzeroupper 4977; AVX2-NEXT: retq 4978; 4979; AVX512-LABEL: one_mask_bit_set6: 4980; AVX512: ## %bb.0: 4981; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 4982; AVX512-NEXT: vmovlps %xmm0, 48(%rdi) 4983; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0 4984; AVX512-NEXT: vpextrq $1, %xmm0, 88(%rdi) 4985; AVX512-NEXT: vzeroupper 4986; AVX512-NEXT: retq 4987; 4988; X86-AVX512-LABEL: one_mask_bit_set6: 4989; X86-AVX512: ## %bb.0: 4990; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 4991; X86-AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 4992; X86-AVX512-NEXT: vmovlps %xmm0, 48(%eax) 4993; X86-AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0 4994; X86-AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] 4995; X86-AVX512-NEXT: vmovlps %xmm0, 88(%eax) 4996; X86-AVX512-NEXT: vzeroupper 4997; X86-AVX512-NEXT: retl 4998 call void @llvm.masked.store.v16i64.p0(<16 x i64> %val, ptr %addr, i32 4, <16 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>) 4999 ret void 5000} 5001 5002define void @top_bits_unset_stack() nounwind { 5003; SSE-LABEL: top_bits_unset_stack: 5004; SSE: ## %bb.0: ## %entry 5005; SSE-NEXT: xorps %xmm0, %xmm0 5006; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) 5007; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) 5008; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) 5009; SSE-NEXT: retq 5010; 5011; AVX1OR2-LABEL: top_bits_unset_stack: 5012; AVX1OR2: ## %bb.0: ## %entry 5013; AVX1OR2-NEXT: vxorpd %xmm0, %xmm0, %xmm0 5014; AVX1OR2-NEXT: vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,0,0] 5015; AVX1OR2-NEXT: vmaskmovpd %ymm0, %ymm1, -{{[0-9]+}}(%rsp) 5016; AVX1OR2-NEXT: vmovupd %ymm0, -{{[0-9]+}}(%rsp) 5017; AVX1OR2-NEXT: vzeroupper 5018; AVX1OR2-NEXT: retq 5019; 5020; AVX512F-LABEL: top_bits_unset_stack: 5021; AVX512F: ## %bb.0: ## %entry 5022; AVX512F-NEXT: vxorpd %xmm0, %xmm0, %xmm0 5023; AVX512F-NEXT: movb $63, %al 5024; AVX512F-NEXT: kmovw %eax, %k1 5025; AVX512F-NEXT: vmovupd %zmm0, -{{[0-9]+}}(%rsp) {%k1} 5026; AVX512F-NEXT: vzeroupper 5027; AVX512F-NEXT: retq 5028; 5029; AVX512VLDQ-LABEL: top_bits_unset_stack: 5030; AVX512VLDQ: ## %bb.0: ## %entry 5031; AVX512VLDQ-NEXT: vxorpd %xmm0, %xmm0, %xmm0 5032; AVX512VLDQ-NEXT: movb $63, %al 5033; AVX512VLDQ-NEXT: kmovw %eax, %k1 5034; AVX512VLDQ-NEXT: vmovupd %zmm0, -{{[0-9]+}}(%rsp) {%k1} 5035; AVX512VLDQ-NEXT: vzeroupper 5036; AVX512VLDQ-NEXT: retq 5037; 5038; AVX512VLBW-LABEL: top_bits_unset_stack: 5039; AVX512VLBW: ## %bb.0: ## %entry 5040; AVX512VLBW-NEXT: vxorpd %xmm0, %xmm0, %xmm0 5041; AVX512VLBW-NEXT: movb $63, %al 5042; AVX512VLBW-NEXT: kmovd %eax, %k1 5043; AVX512VLBW-NEXT: vmovupd %zmm0, -{{[0-9]+}}(%rsp) {%k1} 5044; AVX512VLBW-NEXT: vzeroupper 5045; AVX512VLBW-NEXT: retq 5046; 5047; X86-AVX512-LABEL: top_bits_unset_stack: 5048; X86-AVX512: ## %bb.0: ## %entry 5049; X86-AVX512-NEXT: subl $76, %esp 5050; X86-AVX512-NEXT: vxorpd %xmm0, %xmm0, %xmm0 5051; X86-AVX512-NEXT: movb $63, %al 5052; X86-AVX512-NEXT: kmovd %eax, %k1 5053; X86-AVX512-NEXT: vmovupd %zmm0, (%esp) {%k1} 5054; X86-AVX512-NEXT: addl $76, %esp 5055; X86-AVX512-NEXT: vzeroupper 5056; X86-AVX512-NEXT: retl 5057entry: 5058 %P.i150.i.i = alloca [3 x [3 x double]], align 16 5059 call void @llvm.masked.store.v8f64.p0(<8 x double> zeroinitializer, ptr %P.i150.i.i, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>) 5060 ret void 5061} 5062 5063 5064; SimplifyDemandedBits eliminates an ashr here. 5065 5066define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, ptr %p, <4 x i32> %masksrc) nounwind { 5067; SSE-LABEL: masked_store_bool_mask_demand_trunc_sext: 5068; SSE: ## %bb.0: 5069; SSE-NEXT: pslld $31, %xmm2 5070; SSE-NEXT: movmskps %xmm2, %eax 5071; SSE-NEXT: testb $1, %al 5072; SSE-NEXT: jne LBB26_1 5073; SSE-NEXT: ## %bb.2: ## %else 5074; SSE-NEXT: testb $2, %al 5075; SSE-NEXT: jne LBB26_3 5076; SSE-NEXT: LBB26_4: ## %else2 5077; SSE-NEXT: testb $4, %al 5078; SSE-NEXT: jne LBB26_5 5079; SSE-NEXT: LBB26_6: ## %else4 5080; SSE-NEXT: testb $8, %al 5081; SSE-NEXT: jne LBB26_7 5082; SSE-NEXT: LBB26_8: ## %else6 5083; SSE-NEXT: retq 5084; SSE-NEXT: LBB26_1: ## %cond.store 5085; SSE-NEXT: movlps %xmm0, (%rdi) 5086; SSE-NEXT: testb $2, %al 5087; SSE-NEXT: je LBB26_4 5088; SSE-NEXT: LBB26_3: ## %cond.store1 5089; SSE-NEXT: movhps %xmm0, 8(%rdi) 5090; SSE-NEXT: testb $4, %al 5091; SSE-NEXT: je LBB26_6 5092; SSE-NEXT: LBB26_5: ## %cond.store3 5093; SSE-NEXT: movlps %xmm1, 16(%rdi) 5094; SSE-NEXT: testb $8, %al 5095; SSE-NEXT: je LBB26_8 5096; SSE-NEXT: LBB26_7: ## %cond.store5 5097; SSE-NEXT: movhps %xmm1, 24(%rdi) 5098; SSE-NEXT: retq 5099; 5100; AVX1-LABEL: masked_store_bool_mask_demand_trunc_sext: 5101; AVX1: ## %bb.0: 5102; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 5103; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2 5104; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 5105; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 5106; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 5107; AVX1-NEXT: vmaskmovpd %ymm0, %ymm1, (%rdi) 5108; AVX1-NEXT: vzeroupper 5109; AVX1-NEXT: retq 5110; 5111; AVX2-LABEL: masked_store_bool_mask_demand_trunc_sext: 5112; AVX2: ## %bb.0: 5113; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 5114; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 5115; AVX2-NEXT: vmaskmovpd %ymm0, %ymm1, (%rdi) 5116; AVX2-NEXT: vzeroupper 5117; AVX2-NEXT: retq 5118; 5119; AVX512F-LABEL: masked_store_bool_mask_demand_trunc_sext: 5120; AVX512F: ## %bb.0: 5121; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 5122; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1 5123; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 5124; AVX512F-NEXT: kshiftlw $12, %k0, %k0 5125; AVX512F-NEXT: kshiftrw $12, %k0, %k1 5126; AVX512F-NEXT: vmovupd %zmm0, (%rdi) {%k1} 5127; AVX512F-NEXT: vzeroupper 5128; AVX512F-NEXT: retq 5129; 5130; AVX512VLDQ-LABEL: masked_store_bool_mask_demand_trunc_sext: 5131; AVX512VLDQ: ## %bb.0: 5132; AVX512VLDQ-NEXT: vpslld $31, %xmm1, %xmm1 5133; AVX512VLDQ-NEXT: vpmovd2m %xmm1, %k1 5134; AVX512VLDQ-NEXT: vmovupd %ymm0, (%rdi) {%k1} 5135; AVX512VLDQ-NEXT: vzeroupper 5136; AVX512VLDQ-NEXT: retq 5137; 5138; AVX512VLBW-LABEL: masked_store_bool_mask_demand_trunc_sext: 5139; AVX512VLBW: ## %bb.0: 5140; AVX512VLBW-NEXT: vpslld $31, %xmm1, %xmm1 5141; AVX512VLBW-NEXT: vptestmd %xmm1, %xmm1, %k1 5142; AVX512VLBW-NEXT: vmovupd %ymm0, (%rdi) {%k1} 5143; AVX512VLBW-NEXT: vzeroupper 5144; AVX512VLBW-NEXT: retq 5145; 5146; X86-AVX512-LABEL: masked_store_bool_mask_demand_trunc_sext: 5147; X86-AVX512: ## %bb.0: 5148; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 5149; X86-AVX512-NEXT: vpslld $31, %xmm1, %xmm1 5150; X86-AVX512-NEXT: vpmovd2m %xmm1, %k1 5151; X86-AVX512-NEXT: vmovupd %ymm0, (%eax) {%k1} 5152; X86-AVX512-NEXT: vzeroupper 5153; X86-AVX512-NEXT: retl 5154 %sext = sext <4 x i32> %masksrc to <4 x i64> 5155 %boolmask = trunc <4 x i64> %sext to <4 x i1> 5156 call void @llvm.masked.store.v4f64.p0(<4 x double> %x, ptr %p, i32 4, <4 x i1> %boolmask) 5157 ret void 5158} 5159 5160; PR26697 5161 5162define void @one_mask_bit_set1_variable(ptr %addr, <4 x float> %val, <4 x i32> %mask) nounwind { 5163; SSE2-LABEL: one_mask_bit_set1_variable: 5164; SSE2: ## %bb.0: 5165; SSE2-NEXT: movmskps %xmm1, %eax 5166; SSE2-NEXT: testb $1, %al 5167; SSE2-NEXT: jne LBB27_1 5168; SSE2-NEXT: ## %bb.2: ## %else 5169; SSE2-NEXT: testb $2, %al 5170; SSE2-NEXT: jne LBB27_3 5171; SSE2-NEXT: LBB27_4: ## %else2 5172; SSE2-NEXT: testb $4, %al 5173; SSE2-NEXT: jne LBB27_5 5174; SSE2-NEXT: LBB27_6: ## %else4 5175; SSE2-NEXT: testb $8, %al 5176; SSE2-NEXT: jne LBB27_7 5177; SSE2-NEXT: LBB27_8: ## %else6 5178; SSE2-NEXT: retq 5179; SSE2-NEXT: LBB27_1: ## %cond.store 5180; SSE2-NEXT: movss %xmm0, (%rdi) 5181; SSE2-NEXT: testb $2, %al 5182; SSE2-NEXT: je LBB27_4 5183; SSE2-NEXT: LBB27_3: ## %cond.store1 5184; SSE2-NEXT: movaps %xmm0, %xmm1 5185; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] 5186; SSE2-NEXT: movss %xmm1, 4(%rdi) 5187; SSE2-NEXT: testb $4, %al 5188; SSE2-NEXT: je LBB27_6 5189; SSE2-NEXT: LBB27_5: ## %cond.store3 5190; SSE2-NEXT: movaps %xmm0, %xmm1 5191; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 5192; SSE2-NEXT: movss %xmm1, 8(%rdi) 5193; SSE2-NEXT: testb $8, %al 5194; SSE2-NEXT: je LBB27_8 5195; SSE2-NEXT: LBB27_7: ## %cond.store5 5196; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 5197; SSE2-NEXT: movss %xmm0, 12(%rdi) 5198; SSE2-NEXT: retq 5199; 5200; SSE4-LABEL: one_mask_bit_set1_variable: 5201; SSE4: ## %bb.0: 5202; SSE4-NEXT: movmskps %xmm1, %eax 5203; SSE4-NEXT: testb $1, %al 5204; SSE4-NEXT: jne LBB27_1 5205; SSE4-NEXT: ## %bb.2: ## %else 5206; SSE4-NEXT: testb $2, %al 5207; SSE4-NEXT: jne LBB27_3 5208; SSE4-NEXT: LBB27_4: ## %else2 5209; SSE4-NEXT: testb $4, %al 5210; SSE4-NEXT: jne LBB27_5 5211; SSE4-NEXT: LBB27_6: ## %else4 5212; SSE4-NEXT: testb $8, %al 5213; SSE4-NEXT: jne LBB27_7 5214; SSE4-NEXT: LBB27_8: ## %else6 5215; SSE4-NEXT: retq 5216; SSE4-NEXT: LBB27_1: ## %cond.store 5217; SSE4-NEXT: movss %xmm0, (%rdi) 5218; SSE4-NEXT: testb $2, %al 5219; SSE4-NEXT: je LBB27_4 5220; SSE4-NEXT: LBB27_3: ## %cond.store1 5221; SSE4-NEXT: extractps $1, %xmm0, 4(%rdi) 5222; SSE4-NEXT: testb $4, %al 5223; SSE4-NEXT: je LBB27_6 5224; SSE4-NEXT: LBB27_5: ## %cond.store3 5225; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi) 5226; SSE4-NEXT: testb $8, %al 5227; SSE4-NEXT: je LBB27_8 5228; SSE4-NEXT: LBB27_7: ## %cond.store5 5229; SSE4-NEXT: extractps $3, %xmm0, 12(%rdi) 5230; SSE4-NEXT: retq 5231; 5232; AVX1OR2-LABEL: one_mask_bit_set1_variable: 5233; AVX1OR2: ## %bb.0: 5234; AVX1OR2-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) 5235; AVX1OR2-NEXT: retq 5236; 5237; AVX512F-LABEL: one_mask_bit_set1_variable: 5238; AVX512F: ## %bb.0: 5239; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 5240; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 5241; AVX512F-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k0 5242; AVX512F-NEXT: kshiftlw $12, %k0, %k0 5243; AVX512F-NEXT: kshiftrw $12, %k0, %k1 5244; AVX512F-NEXT: vmovups %zmm0, (%rdi) {%k1} 5245; AVX512F-NEXT: vzeroupper 5246; AVX512F-NEXT: retq 5247; 5248; AVX512VL-LABEL: one_mask_bit_set1_variable: 5249; AVX512VL: ## %bb.0: 5250; AVX512VL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1 5251; AVX512VL-NEXT: vmovups %xmm0, (%rdi) {%k1} 5252; AVX512VL-NEXT: retq 5253; 5254; X86-AVX512-LABEL: one_mask_bit_set1_variable: 5255; X86-AVX512: ## %bb.0: 5256; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 5257; X86-AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm1, %k1 5258; X86-AVX512-NEXT: vmovups %xmm0, (%eax) {%k1} 5259; X86-AVX512-NEXT: retl 5260 %mask_signbit = and <4 x i32> %mask, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648> 5261 %mask_bool = icmp ne <4 x i32> %mask_signbit, zeroinitializer 5262 call void @llvm.masked.store.v4f32.p0(<4 x float> %val, ptr %addr, i32 1, <4 x i1> %mask_bool) 5263 ret void 5264} 5265 5266; This needs to be widened to v4i32. 5267; This used to assert in type legalization. PR38436 5268; FIXME: The codegen for AVX512 should use KSHIFT to zero the upper bits of the mask. 5269define void @widen_masked_store(<3 x i32> %v, ptr %p, <3 x i1> %mask) nounwind { 5270; SSE2-LABEL: widen_masked_store: 5271; SSE2: ## %bb.0: 5272; SSE2-NEXT: andb $1, %sil 5273; SSE2-NEXT: andb $1, %dl 5274; SSE2-NEXT: addb %dl, %dl 5275; SSE2-NEXT: orb %sil, %dl 5276; SSE2-NEXT: andb $1, %cl 5277; SSE2-NEXT: shlb $2, %cl 5278; SSE2-NEXT: orb %dl, %cl 5279; SSE2-NEXT: testb $1, %cl 5280; SSE2-NEXT: jne LBB28_1 5281; SSE2-NEXT: ## %bb.2: ## %else 5282; SSE2-NEXT: testb $2, %cl 5283; SSE2-NEXT: jne LBB28_3 5284; SSE2-NEXT: LBB28_4: ## %else2 5285; SSE2-NEXT: testb $4, %cl 5286; SSE2-NEXT: jne LBB28_5 5287; SSE2-NEXT: LBB28_6: ## %else4 5288; SSE2-NEXT: retq 5289; SSE2-NEXT: LBB28_1: ## %cond.store 5290; SSE2-NEXT: movd %xmm0, (%rdi) 5291; SSE2-NEXT: testb $2, %cl 5292; SSE2-NEXT: je LBB28_4 5293; SSE2-NEXT: LBB28_3: ## %cond.store1 5294; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 5295; SSE2-NEXT: movd %xmm1, 4(%rdi) 5296; SSE2-NEXT: testb $4, %cl 5297; SSE2-NEXT: je LBB28_6 5298; SSE2-NEXT: LBB28_5: ## %cond.store3 5299; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 5300; SSE2-NEXT: movd %xmm0, 8(%rdi) 5301; SSE2-NEXT: retq 5302; 5303; SSE4-LABEL: widen_masked_store: 5304; SSE4: ## %bb.0: 5305; SSE4-NEXT: andb $1, %sil 5306; SSE4-NEXT: andb $1, %dl 5307; SSE4-NEXT: addb %dl, %dl 5308; SSE4-NEXT: orb %sil, %dl 5309; SSE4-NEXT: andb $1, %cl 5310; SSE4-NEXT: shlb $2, %cl 5311; SSE4-NEXT: orb %dl, %cl 5312; SSE4-NEXT: testb $1, %cl 5313; SSE4-NEXT: jne LBB28_1 5314; SSE4-NEXT: ## %bb.2: ## %else 5315; SSE4-NEXT: testb $2, %cl 5316; SSE4-NEXT: jne LBB28_3 5317; SSE4-NEXT: LBB28_4: ## %else2 5318; SSE4-NEXT: testb $4, %cl 5319; SSE4-NEXT: jne LBB28_5 5320; SSE4-NEXT: LBB28_6: ## %else4 5321; SSE4-NEXT: retq 5322; SSE4-NEXT: LBB28_1: ## %cond.store 5323; SSE4-NEXT: movss %xmm0, (%rdi) 5324; SSE4-NEXT: testb $2, %cl 5325; SSE4-NEXT: je LBB28_4 5326; SSE4-NEXT: LBB28_3: ## %cond.store1 5327; SSE4-NEXT: extractps $1, %xmm0, 4(%rdi) 5328; SSE4-NEXT: testb $4, %cl 5329; SSE4-NEXT: je LBB28_6 5330; SSE4-NEXT: LBB28_5: ## %cond.store3 5331; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi) 5332; SSE4-NEXT: retq 5333; 5334; AVX1-LABEL: widen_masked_store: 5335; AVX1: ## %bb.0: 5336; AVX1-NEXT: vmovd %edx, %xmm1 5337; AVX1-NEXT: vmovd %esi, %xmm2 5338; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 5339; AVX1-NEXT: vmovd %ecx, %xmm2 5340; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 5341; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 5342; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) 5343; AVX1-NEXT: retq 5344; 5345; AVX2-LABEL: widen_masked_store: 5346; AVX2: ## %bb.0: 5347; AVX2-NEXT: vmovd %edx, %xmm1 5348; AVX2-NEXT: vmovd %esi, %xmm2 5349; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 5350; AVX2-NEXT: vmovd %ecx, %xmm2 5351; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 5352; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 5353; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi) 5354; AVX2-NEXT: retq 5355; 5356; AVX512F-LABEL: widen_masked_store: 5357; AVX512F: ## %bb.0: 5358; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 5359; AVX512F-NEXT: andl $1, %esi 5360; AVX512F-NEXT: kmovw %esi, %k0 5361; AVX512F-NEXT: kmovw %edx, %k1 5362; AVX512F-NEXT: kshiftlw $15, %k1, %k1 5363; AVX512F-NEXT: kshiftrw $14, %k1, %k1 5364; AVX512F-NEXT: korw %k1, %k0, %k0 5365; AVX512F-NEXT: movw $-5, %ax 5366; AVX512F-NEXT: kmovw %eax, %k1 5367; AVX512F-NEXT: kandw %k1, %k0, %k0 5368; AVX512F-NEXT: kmovw %ecx, %k1 5369; AVX512F-NEXT: kshiftlw $15, %k1, %k1 5370; AVX512F-NEXT: kshiftrw $13, %k1, %k1 5371; AVX512F-NEXT: korw %k1, %k0, %k0 5372; AVX512F-NEXT: movb $7, %al 5373; AVX512F-NEXT: kmovw %eax, %k1 5374; AVX512F-NEXT: kandw %k1, %k0, %k0 5375; AVX512F-NEXT: kshiftlw $12, %k0, %k0 5376; AVX512F-NEXT: kshiftrw $12, %k0, %k1 5377; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} 5378; AVX512F-NEXT: vzeroupper 5379; AVX512F-NEXT: retq 5380; 5381; AVX512VLDQ-LABEL: widen_masked_store: 5382; AVX512VLDQ: ## %bb.0: 5383; AVX512VLDQ-NEXT: kmovw %edx, %k0 5384; AVX512VLDQ-NEXT: kshiftlb $7, %k0, %k0 5385; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k0 5386; AVX512VLDQ-NEXT: kmovw %esi, %k1 5387; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1 5388; AVX512VLDQ-NEXT: kshiftrb $7, %k1, %k1 5389; AVX512VLDQ-NEXT: korw %k0, %k1, %k0 5390; AVX512VLDQ-NEXT: movb $-5, %al 5391; AVX512VLDQ-NEXT: kmovw %eax, %k1 5392; AVX512VLDQ-NEXT: kandw %k1, %k0, %k0 5393; AVX512VLDQ-NEXT: kmovw %ecx, %k1 5394; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1 5395; AVX512VLDQ-NEXT: kshiftrb $5, %k1, %k1 5396; AVX512VLDQ-NEXT: korw %k1, %k0, %k0 5397; AVX512VLDQ-NEXT: movb $7, %al 5398; AVX512VLDQ-NEXT: kmovw %eax, %k1 5399; AVX512VLDQ-NEXT: kandw %k1, %k0, %k1 5400; AVX512VLDQ-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} 5401; AVX512VLDQ-NEXT: retq 5402; 5403; AVX512VLBW-LABEL: widen_masked_store: 5404; AVX512VLBW: ## %bb.0: 5405; AVX512VLBW-NEXT: andl $1, %esi 5406; AVX512VLBW-NEXT: kmovw %esi, %k0 5407; AVX512VLBW-NEXT: kmovd %edx, %k1 5408; AVX512VLBW-NEXT: kshiftlw $15, %k1, %k1 5409; AVX512VLBW-NEXT: kshiftrw $14, %k1, %k1 5410; AVX512VLBW-NEXT: korw %k1, %k0, %k0 5411; AVX512VLBW-NEXT: movw $-5, %ax 5412; AVX512VLBW-NEXT: kmovd %eax, %k1 5413; AVX512VLBW-NEXT: kandw %k1, %k0, %k0 5414; AVX512VLBW-NEXT: kmovd %ecx, %k1 5415; AVX512VLBW-NEXT: kshiftlw $15, %k1, %k1 5416; AVX512VLBW-NEXT: kshiftrw $13, %k1, %k1 5417; AVX512VLBW-NEXT: korw %k1, %k0, %k0 5418; AVX512VLBW-NEXT: movb $7, %al 5419; AVX512VLBW-NEXT: kmovd %eax, %k1 5420; AVX512VLBW-NEXT: kandw %k1, %k0, %k1 5421; AVX512VLBW-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} 5422; AVX512VLBW-NEXT: retq 5423; 5424; X86-AVX512-LABEL: widen_masked_store: 5425; X86-AVX512: ## %bb.0: 5426; X86-AVX512-NEXT: kmovb {{[0-9]+}}(%esp), %k0 5427; X86-AVX512-NEXT: kshiftlb $7, %k0, %k0 5428; X86-AVX512-NEXT: kshiftrb $6, %k0, %k0 5429; X86-AVX512-NEXT: kmovb {{[0-9]+}}(%esp), %k1 5430; X86-AVX512-NEXT: kshiftlb $7, %k1, %k1 5431; X86-AVX512-NEXT: kshiftrb $7, %k1, %k1 5432; X86-AVX512-NEXT: korw %k0, %k1, %k0 5433; X86-AVX512-NEXT: movb $-5, %al 5434; X86-AVX512-NEXT: kmovd %eax, %k1 5435; X86-AVX512-NEXT: kandw %k1, %k0, %k0 5436; X86-AVX512-NEXT: kmovb {{[0-9]+}}(%esp), %k1 5437; X86-AVX512-NEXT: kshiftlb $7, %k1, %k1 5438; X86-AVX512-NEXT: kshiftrb $5, %k1, %k1 5439; X86-AVX512-NEXT: korw %k1, %k0, %k0 5440; X86-AVX512-NEXT: movb $7, %al 5441; X86-AVX512-NEXT: kmovd %eax, %k1 5442; X86-AVX512-NEXT: kandw %k1, %k0, %k1 5443; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 5444; X86-AVX512-NEXT: vmovdqa32 %xmm0, (%eax) {%k1} 5445; X86-AVX512-NEXT: retl 5446 call void @llvm.masked.store.v3i32.p0(<3 x i32> %v, ptr %p, i32 16, <3 x i1> %mask) 5447 ret void 5448} 5449 5450define void @zero_mask(ptr %addr, <2 x double> %val) nounwind { 5451; SSE-LABEL: zero_mask: 5452; SSE: ## %bb.0: 5453; SSE-NEXT: retq 5454; 5455; AVX-LABEL: zero_mask: 5456; AVX: ## %bb.0: 5457; AVX-NEXT: retq 5458; 5459; X86-AVX512-LABEL: zero_mask: 5460; X86-AVX512: ## %bb.0: 5461; X86-AVX512-NEXT: retl 5462 call void @llvm.masked.store.v2f64.p0(<2 x double> %val, ptr %addr, i32 4, <2 x i1> zeroinitializer) 5463 ret void 5464} 5465 5466define void @PR11210(<4 x float> %x, ptr %ptr, <4 x float> %y, <2 x i64> %mask) nounwind { 5467; SSE2-LABEL: PR11210: 5468; SSE2: ## %bb.0: 5469; SSE2-NEXT: movmskps %xmm2, %eax 5470; SSE2-NEXT: testb $1, %al 5471; SSE2-NEXT: jne LBB30_1 5472; SSE2-NEXT: ## %bb.2: ## %else 5473; SSE2-NEXT: testb $2, %al 5474; SSE2-NEXT: jne LBB30_3 5475; SSE2-NEXT: LBB30_4: ## %else2 5476; SSE2-NEXT: testb $4, %al 5477; SSE2-NEXT: jne LBB30_5 5478; SSE2-NEXT: LBB30_6: ## %else4 5479; SSE2-NEXT: testb $8, %al 5480; SSE2-NEXT: jne LBB30_7 5481; SSE2-NEXT: LBB30_8: ## %else6 5482; SSE2-NEXT: testb $1, %al 5483; SSE2-NEXT: jne LBB30_9 5484; SSE2-NEXT: LBB30_10: ## %else9 5485; SSE2-NEXT: testb $2, %al 5486; SSE2-NEXT: jne LBB30_11 5487; SSE2-NEXT: LBB30_12: ## %else11 5488; SSE2-NEXT: testb $4, %al 5489; SSE2-NEXT: jne LBB30_13 5490; SSE2-NEXT: LBB30_14: ## %else13 5491; SSE2-NEXT: testb $8, %al 5492; SSE2-NEXT: jne LBB30_15 5493; SSE2-NEXT: LBB30_16: ## %else15 5494; SSE2-NEXT: retq 5495; SSE2-NEXT: LBB30_1: ## %cond.store 5496; SSE2-NEXT: movss %xmm0, (%rdi) 5497; SSE2-NEXT: testb $2, %al 5498; SSE2-NEXT: je LBB30_4 5499; SSE2-NEXT: LBB30_3: ## %cond.store1 5500; SSE2-NEXT: movaps %xmm0, %xmm2 5501; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] 5502; SSE2-NEXT: movss %xmm2, 4(%rdi) 5503; SSE2-NEXT: testb $4, %al 5504; SSE2-NEXT: je LBB30_6 5505; SSE2-NEXT: LBB30_5: ## %cond.store3 5506; SSE2-NEXT: movaps %xmm0, %xmm2 5507; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 5508; SSE2-NEXT: movss %xmm2, 8(%rdi) 5509; SSE2-NEXT: testb $8, %al 5510; SSE2-NEXT: je LBB30_8 5511; SSE2-NEXT: LBB30_7: ## %cond.store5 5512; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 5513; SSE2-NEXT: movss %xmm0, 12(%rdi) 5514; SSE2-NEXT: testb $1, %al 5515; SSE2-NEXT: je LBB30_10 5516; SSE2-NEXT: LBB30_9: ## %cond.store8 5517; SSE2-NEXT: movss %xmm1, (%rdi) 5518; SSE2-NEXT: testb $2, %al 5519; SSE2-NEXT: je LBB30_12 5520; SSE2-NEXT: LBB30_11: ## %cond.store10 5521; SSE2-NEXT: movaps %xmm1, %xmm0 5522; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] 5523; SSE2-NEXT: movss %xmm0, 4(%rdi) 5524; SSE2-NEXT: testb $4, %al 5525; SSE2-NEXT: je LBB30_14 5526; SSE2-NEXT: LBB30_13: ## %cond.store12 5527; SSE2-NEXT: movaps %xmm1, %xmm0 5528; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 5529; SSE2-NEXT: movss %xmm0, 8(%rdi) 5530; SSE2-NEXT: testb $8, %al 5531; SSE2-NEXT: je LBB30_16 5532; SSE2-NEXT: LBB30_15: ## %cond.store14 5533; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 5534; SSE2-NEXT: movss %xmm1, 12(%rdi) 5535; SSE2-NEXT: retq 5536; 5537; SSE4-LABEL: PR11210: 5538; SSE4: ## %bb.0: 5539; SSE4-NEXT: movmskps %xmm2, %eax 5540; SSE4-NEXT: testb $1, %al 5541; SSE4-NEXT: jne LBB30_1 5542; SSE4-NEXT: ## %bb.2: ## %else 5543; SSE4-NEXT: testb $2, %al 5544; SSE4-NEXT: jne LBB30_3 5545; SSE4-NEXT: LBB30_4: ## %else2 5546; SSE4-NEXT: testb $4, %al 5547; SSE4-NEXT: jne LBB30_5 5548; SSE4-NEXT: LBB30_6: ## %else4 5549; SSE4-NEXT: testb $8, %al 5550; SSE4-NEXT: jne LBB30_7 5551; SSE4-NEXT: LBB30_8: ## %else6 5552; SSE4-NEXT: testb $1, %al 5553; SSE4-NEXT: jne LBB30_9 5554; SSE4-NEXT: LBB30_10: ## %else9 5555; SSE4-NEXT: testb $2, %al 5556; SSE4-NEXT: jne LBB30_11 5557; SSE4-NEXT: LBB30_12: ## %else11 5558; SSE4-NEXT: testb $4, %al 5559; SSE4-NEXT: jne LBB30_13 5560; SSE4-NEXT: LBB30_14: ## %else13 5561; SSE4-NEXT: testb $8, %al 5562; SSE4-NEXT: jne LBB30_15 5563; SSE4-NEXT: LBB30_16: ## %else15 5564; SSE4-NEXT: retq 5565; SSE4-NEXT: LBB30_1: ## %cond.store 5566; SSE4-NEXT: movss %xmm0, (%rdi) 5567; SSE4-NEXT: testb $2, %al 5568; SSE4-NEXT: je LBB30_4 5569; SSE4-NEXT: LBB30_3: ## %cond.store1 5570; SSE4-NEXT: extractps $1, %xmm0, 4(%rdi) 5571; SSE4-NEXT: testb $4, %al 5572; SSE4-NEXT: je LBB30_6 5573; SSE4-NEXT: LBB30_5: ## %cond.store3 5574; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi) 5575; SSE4-NEXT: testb $8, %al 5576; SSE4-NEXT: je LBB30_8 5577; SSE4-NEXT: LBB30_7: ## %cond.store5 5578; SSE4-NEXT: extractps $3, %xmm0, 12(%rdi) 5579; SSE4-NEXT: testb $1, %al 5580; SSE4-NEXT: je LBB30_10 5581; SSE4-NEXT: LBB30_9: ## %cond.store8 5582; SSE4-NEXT: movss %xmm1, (%rdi) 5583; SSE4-NEXT: testb $2, %al 5584; SSE4-NEXT: je LBB30_12 5585; SSE4-NEXT: LBB30_11: ## %cond.store10 5586; SSE4-NEXT: extractps $1, %xmm1, 4(%rdi) 5587; SSE4-NEXT: testb $4, %al 5588; SSE4-NEXT: je LBB30_14 5589; SSE4-NEXT: LBB30_13: ## %cond.store12 5590; SSE4-NEXT: extractps $2, %xmm1, 8(%rdi) 5591; SSE4-NEXT: testb $8, %al 5592; SSE4-NEXT: je LBB30_16 5593; SSE4-NEXT: LBB30_15: ## %cond.store14 5594; SSE4-NEXT: extractps $3, %xmm1, 12(%rdi) 5595; SSE4-NEXT: retq 5596; 5597; AVX1OR2-LABEL: PR11210: 5598; AVX1OR2: ## %bb.0: 5599; AVX1OR2-NEXT: vmaskmovps %xmm1, %xmm2, (%rdi) 5600; AVX1OR2-NEXT: retq 5601; 5602; AVX512F-LABEL: PR11210: 5603; AVX512F: ## %bb.0: 5604; AVX512F-NEXT: ## kill: def $xmm2 killed $xmm2 def $zmm2 5605; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 5606; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 5607; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm0, %k0 5608; AVX512F-NEXT: kshiftlw $12, %k0, %k0 5609; AVX512F-NEXT: kshiftrw $12, %k0, %k1 5610; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1} 5611; AVX512F-NEXT: vzeroupper 5612; AVX512F-NEXT: retq 5613; 5614; AVX512VLDQ-LABEL: PR11210: 5615; AVX512VLDQ: ## %bb.0: 5616; AVX512VLDQ-NEXT: vpmovd2m %xmm2, %k1 5617; AVX512VLDQ-NEXT: vmovups %xmm1, (%rdi) {%k1} 5618; AVX512VLDQ-NEXT: retq 5619; 5620; AVX512VLBW-LABEL: PR11210: 5621; AVX512VLBW: ## %bb.0: 5622; AVX512VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0 5623; AVX512VLBW-NEXT: vpcmpgtd %xmm2, %xmm0, %k1 5624; AVX512VLBW-NEXT: vmovups %xmm1, (%rdi) {%k1} 5625; AVX512VLBW-NEXT: retq 5626; 5627; X86-AVX512-LABEL: PR11210: 5628; X86-AVX512: ## %bb.0: 5629; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 5630; X86-AVX512-NEXT: vpmovd2m %xmm2, %k1 5631; X86-AVX512-NEXT: vmovups %xmm1, (%eax) {%k1} 5632; X86-AVX512-NEXT: retl 5633 %bc = bitcast <2 x i64> %mask to <4 x i32> 5634 %trunc = icmp slt <4 x i32> %bc, zeroinitializer 5635 call void @llvm.masked.store.v4f32.p0(<4 x float> %x, ptr %ptr, i32 1, <4 x i1> %trunc) 5636 call void @llvm.masked.store.v4f32.p0(<4 x float> %y, ptr %ptr, i32 1, <4 x i1> %trunc) 5637 ret void 5638} 5639 5640define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigger.ptr, ptr %val.ptr, ptr %dst) nounwind { 5641; SSE-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts: 5642; SSE: ## %bb.0: 5643; SSE-NEXT: pushq %rbp 5644; SSE-NEXT: pushq %r15 5645; SSE-NEXT: pushq %r14 5646; SSE-NEXT: pushq %r13 5647; SSE-NEXT: pushq %r12 5648; SSE-NEXT: pushq %rbx 5649; SSE-NEXT: movdqa (%rdi), %xmm1 5650; SSE-NEXT: movdqa 32(%rdi), %xmm2 5651; SSE-NEXT: movdqa 64(%rdi), %xmm0 5652; SSE-NEXT: movl 92(%rsi), %eax 5653; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill 5654; SSE-NEXT: movl 88(%rsi), %eax 5655; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill 5656; SSE-NEXT: movl 84(%rsi), %eax 5657; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill 5658; SSE-NEXT: movl 80(%rsi), %eax 5659; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill 5660; SSE-NEXT: movl 76(%rsi), %eax 5661; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill 5662; SSE-NEXT: movl 72(%rsi), %eax 5663; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill 5664; SSE-NEXT: movl 68(%rsi), %eax 5665; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill 5666; SSE-NEXT: movl 64(%rsi), %eax 5667; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill 5668; SSE-NEXT: movl 60(%rsi), %eax 5669; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill 5670; SSE-NEXT: movl 56(%rsi), %eax 5671; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill 5672; SSE-NEXT: movl 52(%rsi), %eax 5673; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill 5674; SSE-NEXT: packssdw 48(%rdi), %xmm2 5675; SSE-NEXT: packssdw 16(%rdi), %xmm1 5676; SSE-NEXT: packsswb %xmm2, %xmm1 5677; SSE-NEXT: packssdw 80(%rdi), %xmm0 5678; SSE-NEXT: packsswb %xmm0, %xmm0 5679; SSE-NEXT: pmovmskb %xmm1, %eax 5680; SSE-NEXT: andl $21845, %eax ## imm = 0x5555 5681; SSE-NEXT: pmovmskb %xmm0, %edi 5682; SSE-NEXT: andl $85, %edi 5683; SSE-NEXT: shll $16, %edi 5684; SSE-NEXT: orl %eax, %edi 5685; SSE-NEXT: movl 48(%rsi), %r13d 5686; SSE-NEXT: testb $1, %dil 5687; SSE-NEXT: movl 44(%rsi), %eax 5688; SSE-NEXT: movl 40(%rsi), %ecx 5689; SSE-NEXT: movl 36(%rsi), %r8d 5690; SSE-NEXT: movl 32(%rsi), %r9d 5691; SSE-NEXT: movl 28(%rsi), %r10d 5692; SSE-NEXT: movl 24(%rsi), %r11d 5693; SSE-NEXT: movl 20(%rsi), %ebx 5694; SSE-NEXT: movl 16(%rsi), %ebp 5695; SSE-NEXT: movl 12(%rsi), %r14d 5696; SSE-NEXT: movl 8(%rsi), %r15d 5697; SSE-NEXT: movl 4(%rsi), %r12d 5698; SSE-NEXT: jne LBB31_1 5699; SSE-NEXT: ## %bb.2: ## %else 5700; SSE-NEXT: testb $2, %dil 5701; SSE-NEXT: jne LBB31_3 5702; SSE-NEXT: LBB31_4: ## %else2 5703; SSE-NEXT: testb $4, %dil 5704; SSE-NEXT: jne LBB31_5 5705; SSE-NEXT: LBB31_6: ## %else4 5706; SSE-NEXT: testb $8, %dil 5707; SSE-NEXT: jne LBB31_7 5708; SSE-NEXT: LBB31_8: ## %else6 5709; SSE-NEXT: testb $16, %dil 5710; SSE-NEXT: jne LBB31_9 5711; SSE-NEXT: LBB31_10: ## %else8 5712; SSE-NEXT: testb $32, %dil 5713; SSE-NEXT: jne LBB31_11 5714; SSE-NEXT: LBB31_12: ## %else10 5715; SSE-NEXT: testb $64, %dil 5716; SSE-NEXT: jne LBB31_13 5717; SSE-NEXT: LBB31_14: ## %else12 5718; SSE-NEXT: testb %dil, %dil 5719; SSE-NEXT: js LBB31_15 5720; SSE-NEXT: LBB31_16: ## %else14 5721; SSE-NEXT: testl $256, %edi ## imm = 0x100 5722; SSE-NEXT: jne LBB31_17 5723; SSE-NEXT: LBB31_18: ## %else16 5724; SSE-NEXT: testl $512, %edi ## imm = 0x200 5725; SSE-NEXT: jne LBB31_19 5726; SSE-NEXT: LBB31_20: ## %else18 5727; SSE-NEXT: testl $1024, %edi ## imm = 0x400 5728; SSE-NEXT: jne LBB31_21 5729; SSE-NEXT: LBB31_22: ## %else20 5730; SSE-NEXT: testl $2048, %edi ## imm = 0x800 5731; SSE-NEXT: jne LBB31_23 5732; SSE-NEXT: LBB31_24: ## %else22 5733; SSE-NEXT: testl $4096, %edi ## imm = 0x1000 5734; SSE-NEXT: jne LBB31_25 5735; SSE-NEXT: LBB31_26: ## %else24 5736; SSE-NEXT: testl $8192, %edi ## imm = 0x2000 5737; SSE-NEXT: jne LBB31_27 5738; SSE-NEXT: LBB31_28: ## %else26 5739; SSE-NEXT: testl $16384, %edi ## imm = 0x4000 5740; SSE-NEXT: jne LBB31_29 5741; SSE-NEXT: LBB31_30: ## %else28 5742; SSE-NEXT: testw %di, %di 5743; SSE-NEXT: js LBB31_31 5744; SSE-NEXT: LBB31_32: ## %else30 5745; SSE-NEXT: testl $65536, %edi ## imm = 0x10000 5746; SSE-NEXT: jne LBB31_33 5747; SSE-NEXT: LBB31_34: ## %else32 5748; SSE-NEXT: testl $131072, %edi ## imm = 0x20000 5749; SSE-NEXT: jne LBB31_35 5750; SSE-NEXT: LBB31_36: ## %else34 5751; SSE-NEXT: testl $262144, %edi ## imm = 0x40000 5752; SSE-NEXT: jne LBB31_37 5753; SSE-NEXT: LBB31_38: ## %else36 5754; SSE-NEXT: testl $524288, %edi ## imm = 0x80000 5755; SSE-NEXT: jne LBB31_39 5756; SSE-NEXT: LBB31_40: ## %else38 5757; SSE-NEXT: testl $1048576, %edi ## imm = 0x100000 5758; SSE-NEXT: jne LBB31_41 5759; SSE-NEXT: LBB31_42: ## %else40 5760; SSE-NEXT: testl $2097152, %edi ## imm = 0x200000 5761; SSE-NEXT: jne LBB31_43 5762; SSE-NEXT: LBB31_44: ## %else42 5763; SSE-NEXT: testl $4194304, %edi ## imm = 0x400000 5764; SSE-NEXT: je LBB31_46 5765; SSE-NEXT: LBB31_45: ## %cond.store43 5766; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload 5767; SSE-NEXT: movl %eax, 88(%rdx) 5768; SSE-NEXT: LBB31_46: ## %else44 5769; SSE-NEXT: movb $1, %al 5770; SSE-NEXT: testb %al, %al 5771; SSE-NEXT: jne LBB31_48 5772; SSE-NEXT: ## %bb.47: ## %cond.store45 5773; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload 5774; SSE-NEXT: movl %eax, 92(%rdx) 5775; SSE-NEXT: LBB31_48: ## %else46 5776; SSE-NEXT: popq %rbx 5777; SSE-NEXT: popq %r12 5778; SSE-NEXT: popq %r13 5779; SSE-NEXT: popq %r14 5780; SSE-NEXT: popq %r15 5781; SSE-NEXT: popq %rbp 5782; SSE-NEXT: retq 5783; SSE-NEXT: LBB31_1: ## %cond.store 5784; SSE-NEXT: movl (%rsi), %esi 5785; SSE-NEXT: movl %esi, (%rdx) 5786; SSE-NEXT: testb $2, %dil 5787; SSE-NEXT: je LBB31_4 5788; SSE-NEXT: LBB31_3: ## %cond.store1 5789; SSE-NEXT: movl %r12d, 4(%rdx) 5790; SSE-NEXT: testb $4, %dil 5791; SSE-NEXT: je LBB31_6 5792; SSE-NEXT: LBB31_5: ## %cond.store3 5793; SSE-NEXT: movl %r15d, 8(%rdx) 5794; SSE-NEXT: testb $8, %dil 5795; SSE-NEXT: je LBB31_8 5796; SSE-NEXT: LBB31_7: ## %cond.store5 5797; SSE-NEXT: movl %r14d, 12(%rdx) 5798; SSE-NEXT: testb $16, %dil 5799; SSE-NEXT: je LBB31_10 5800; SSE-NEXT: LBB31_9: ## %cond.store7 5801; SSE-NEXT: movl %ebp, 16(%rdx) 5802; SSE-NEXT: testb $32, %dil 5803; SSE-NEXT: je LBB31_12 5804; SSE-NEXT: LBB31_11: ## %cond.store9 5805; SSE-NEXT: movl %ebx, 20(%rdx) 5806; SSE-NEXT: testb $64, %dil 5807; SSE-NEXT: je LBB31_14 5808; SSE-NEXT: LBB31_13: ## %cond.store11 5809; SSE-NEXT: movl %r11d, 24(%rdx) 5810; SSE-NEXT: testb %dil, %dil 5811; SSE-NEXT: jns LBB31_16 5812; SSE-NEXT: LBB31_15: ## %cond.store13 5813; SSE-NEXT: movl %r10d, 28(%rdx) 5814; SSE-NEXT: testl $256, %edi ## imm = 0x100 5815; SSE-NEXT: je LBB31_18 5816; SSE-NEXT: LBB31_17: ## %cond.store15 5817; SSE-NEXT: movl %r9d, 32(%rdx) 5818; SSE-NEXT: testl $512, %edi ## imm = 0x200 5819; SSE-NEXT: je LBB31_20 5820; SSE-NEXT: LBB31_19: ## %cond.store17 5821; SSE-NEXT: movl %r8d, 36(%rdx) 5822; SSE-NEXT: testl $1024, %edi ## imm = 0x400 5823; SSE-NEXT: je LBB31_22 5824; SSE-NEXT: LBB31_21: ## %cond.store19 5825; SSE-NEXT: movl %ecx, 40(%rdx) 5826; SSE-NEXT: testl $2048, %edi ## imm = 0x800 5827; SSE-NEXT: je LBB31_24 5828; SSE-NEXT: LBB31_23: ## %cond.store21 5829; SSE-NEXT: movl %eax, 44(%rdx) 5830; SSE-NEXT: testl $4096, %edi ## imm = 0x1000 5831; SSE-NEXT: je LBB31_26 5832; SSE-NEXT: LBB31_25: ## %cond.store23 5833; SSE-NEXT: movl %r13d, 48(%rdx) 5834; SSE-NEXT: testl $8192, %edi ## imm = 0x2000 5835; SSE-NEXT: je LBB31_28 5836; SSE-NEXT: LBB31_27: ## %cond.store25 5837; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload 5838; SSE-NEXT: movl %eax, 52(%rdx) 5839; SSE-NEXT: testl $16384, %edi ## imm = 0x4000 5840; SSE-NEXT: je LBB31_30 5841; SSE-NEXT: LBB31_29: ## %cond.store27 5842; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload 5843; SSE-NEXT: movl %eax, 56(%rdx) 5844; SSE-NEXT: testw %di, %di 5845; SSE-NEXT: jns LBB31_32 5846; SSE-NEXT: LBB31_31: ## %cond.store29 5847; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload 5848; SSE-NEXT: movl %eax, 60(%rdx) 5849; SSE-NEXT: testl $65536, %edi ## imm = 0x10000 5850; SSE-NEXT: je LBB31_34 5851; SSE-NEXT: LBB31_33: ## %cond.store31 5852; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload 5853; SSE-NEXT: movl %eax, 64(%rdx) 5854; SSE-NEXT: testl $131072, %edi ## imm = 0x20000 5855; SSE-NEXT: je LBB31_36 5856; SSE-NEXT: LBB31_35: ## %cond.store33 5857; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload 5858; SSE-NEXT: movl %eax, 68(%rdx) 5859; SSE-NEXT: testl $262144, %edi ## imm = 0x40000 5860; SSE-NEXT: je LBB31_38 5861; SSE-NEXT: LBB31_37: ## %cond.store35 5862; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload 5863; SSE-NEXT: movl %eax, 72(%rdx) 5864; SSE-NEXT: testl $524288, %edi ## imm = 0x80000 5865; SSE-NEXT: je LBB31_40 5866; SSE-NEXT: LBB31_39: ## %cond.store37 5867; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload 5868; SSE-NEXT: movl %eax, 76(%rdx) 5869; SSE-NEXT: testl $1048576, %edi ## imm = 0x100000 5870; SSE-NEXT: je LBB31_42 5871; SSE-NEXT: LBB31_41: ## %cond.store39 5872; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload 5873; SSE-NEXT: movl %eax, 80(%rdx) 5874; SSE-NEXT: testl $2097152, %edi ## imm = 0x200000 5875; SSE-NEXT: je LBB31_44 5876; SSE-NEXT: LBB31_43: ## %cond.store41 5877; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload 5878; SSE-NEXT: movl %eax, 84(%rdx) 5879; SSE-NEXT: testl $4194304, %edi ## imm = 0x400000 5880; SSE-NEXT: jne LBB31_45 5881; SSE-NEXT: jmp LBB31_46 5882; 5883; AVX1-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts: 5884; AVX1: ## %bb.0: 5885; AVX1-NEXT: vmovdqa (%rsi), %ymm0 5886; AVX1-NEXT: vmovaps 32(%rsi), %ymm1 5887; AVX1-NEXT: vmovaps 64(%rsi), %ymm2 5888; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 5889; AVX1-NEXT: vpcmpgtd 48(%rdi), %xmm3, %xmm4 5890; AVX1-NEXT: vpcmpgtd 32(%rdi), %xmm3, %xmm5 5891; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 5892; AVX1-NEXT: vpacksswb %xmm4, %xmm4, %xmm4 5893; AVX1-NEXT: vpcmpgtd 80(%rdi), %xmm3, %xmm5 5894; AVX1-NEXT: vpcmpgtd 64(%rdi), %xmm3, %xmm6 5895; AVX1-NEXT: vpcmpgtd 16(%rdi), %xmm3, %xmm7 5896; AVX1-NEXT: vpcmpgtd (%rdi), %xmm3, %xmm8 5897; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm3[2,3],xmm8[4,5],xmm3[6,7] 5898; AVX1-NEXT: vpslld $31, %xmm8, %xmm8 5899; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,3],xmm7[4,5],xmm3[6,7] 5900; AVX1-NEXT: vpslld $31, %xmm7, %xmm7 5901; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 5902; AVX1-NEXT: vmaskmovps %ymm0, %ymm7, (%rdx) 5903; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7] 5904; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 5905; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7] 5906; AVX1-NEXT: vpslld $31, %xmm5, %xmm5 5907; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 5908; AVX1-NEXT: vmaskmovps %ymm2, %ymm0, 64(%rdx) 5909; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 5910; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 5911; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 5912; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 5913; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 5914; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 5915; AVX1-NEXT: vmaskmovps %ymm1, %ymm0, 32(%rdx) 5916; AVX1-NEXT: vzeroupper 5917; AVX1-NEXT: retq 5918; 5919; AVX2-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts: 5920; AVX2: ## %bb.0: 5921; AVX2-NEXT: vmovdqa (%rsi), %ymm0 5922; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 5923; AVX2-NEXT: vmovdqa 64(%rsi), %ymm2 5924; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 5925; AVX2-NEXT: vpcmpgtd 32(%rdi), %ymm3, %ymm4 5926; AVX2-NEXT: vpcmpgtd (%rdi), %ymm3, %ymm5 5927; AVX2-NEXT: vpackssdw %ymm4, %ymm5, %ymm4 5928; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] 5929; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = mem[0,2,2,3,4,6,6,7] 5930; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] 5931; AVX2-NEXT: vpcmpgtd %ymm5, %ymm3, %ymm3 5932; AVX2-NEXT: vpacksswb %ymm3, %ymm4, %ymm3 5933; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,1,3] 5934; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 5935; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 5936; AVX2-NEXT: vpslld $31, %ymm3, %ymm3 5937; AVX2-NEXT: vpmaskmovd %ymm0, %ymm3, (%rdx) 5938; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm0 5939; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 5940; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 5941; AVX2-NEXT: vpmaskmovd %ymm2, %ymm0, 64(%rdx) 5942; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5943; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 5944; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 5945; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, 32(%rdx) 5946; AVX2-NEXT: vzeroupper 5947; AVX2-NEXT: retq 5948; 5949; AVX512F-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts: 5950; AVX512F: ## %bb.0: 5951; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 5952; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm1 5953; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 5954; AVX512F-NEXT: movw $21845, %ax ## imm = 0x5555 5955; AVX512F-NEXT: kmovw %eax, %k1 5956; AVX512F-NEXT: vpcmpgtd (%rdi), %zmm2, %k1 {%k1} 5957; AVX512F-NEXT: movw $85, %ax 5958; AVX512F-NEXT: kmovw %eax, %k2 5959; AVX512F-NEXT: vpcmpgtd 64(%rdi), %zmm2, %k2 {%k2} 5960; AVX512F-NEXT: vmovdqu32 %zmm1, 64(%rdx) {%k2} 5961; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdx) {%k1} 5962; AVX512F-NEXT: vzeroupper 5963; AVX512F-NEXT: retq 5964; 5965; AVX512VLDQ-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts: 5966; AVX512VLDQ: ## %bb.0: 5967; AVX512VLDQ-NEXT: vmovdqa64 (%rsi), %zmm0 5968; AVX512VLDQ-NEXT: vmovdqa64 64(%rsi), %zmm1 5969; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 5970; AVX512VLDQ-NEXT: movw $21845, %ax ## imm = 0x5555 5971; AVX512VLDQ-NEXT: kmovw %eax, %k1 5972; AVX512VLDQ-NEXT: vpcmpgtd (%rdi), %zmm2, %k1 {%k1} 5973; AVX512VLDQ-NEXT: movw $85, %ax 5974; AVX512VLDQ-NEXT: kmovw %eax, %k2 5975; AVX512VLDQ-NEXT: vpcmpgtd 64(%rdi), %zmm2, %k2 {%k2} 5976; AVX512VLDQ-NEXT: vmovdqu32 %zmm1, 64(%rdx) {%k2} 5977; AVX512VLDQ-NEXT: vmovdqu32 %zmm0, (%rdx) {%k1} 5978; AVX512VLDQ-NEXT: vzeroupper 5979; AVX512VLDQ-NEXT: retq 5980; 5981; AVX512VLBW-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts: 5982; AVX512VLBW: ## %bb.0: 5983; AVX512VLBW-NEXT: vmovdqa64 (%rsi), %zmm0 5984; AVX512VLBW-NEXT: vmovdqa64 64(%rsi), %zmm1 5985; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 5986; AVX512VLBW-NEXT: movw $21845, %ax ## imm = 0x5555 5987; AVX512VLBW-NEXT: kmovd %eax, %k1 5988; AVX512VLBW-NEXT: vpcmpgtd (%rdi), %zmm2, %k1 {%k1} 5989; AVX512VLBW-NEXT: movw $85, %ax 5990; AVX512VLBW-NEXT: kmovd %eax, %k2 5991; AVX512VLBW-NEXT: vpcmpgtd 64(%rdi), %zmm2, %k2 {%k2} 5992; AVX512VLBW-NEXT: vmovdqu32 %zmm1, 64(%rdx) {%k2} 5993; AVX512VLBW-NEXT: vmovdqu32 %zmm0, (%rdx) {%k1} 5994; AVX512VLBW-NEXT: vzeroupper 5995; AVX512VLBW-NEXT: retq 5996; 5997; X86-AVX512-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts: 5998; X86-AVX512: ## %bb.0: 5999; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 6000; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx 6001; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx 6002; X86-AVX512-NEXT: vmovdqa64 (%edx), %zmm0 6003; X86-AVX512-NEXT: vmovdqa64 64(%edx), %zmm1 6004; X86-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 6005; X86-AVX512-NEXT: movw $21845, %dx ## imm = 0x5555 6006; X86-AVX512-NEXT: kmovd %edx, %k1 6007; X86-AVX512-NEXT: vpcmpgtd (%ecx), %zmm2, %k1 {%k1} 6008; X86-AVX512-NEXT: movw $85, %dx 6009; X86-AVX512-NEXT: kmovd %edx, %k2 6010; X86-AVX512-NEXT: vpcmpgtd 64(%ecx), %zmm2, %k2 {%k2} 6011; X86-AVX512-NEXT: vmovdqu32 %zmm1, 64(%eax) {%k2} 6012; X86-AVX512-NEXT: vmovdqu32 %zmm0, (%eax) {%k1} 6013; X86-AVX512-NEXT: vzeroupper 6014; X86-AVX512-NEXT: retl 6015 %trigger = load <24 x i32>, ptr %trigger.ptr 6016 %val = load <24 x i32>, ptr %val.ptr 6017 %mask.src = icmp slt <24 x i32> %trigger, zeroinitializer 6018 %mask = and <24 x i1> %mask.src, <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false> 6019 call void @llvm.masked.store.v24i32.p0(<24 x i32> %val, ptr %dst, i32 1, <24 x i1> %mask) 6020 ret void 6021} 6022 6023; From https://reviews.llvm.org/rGf8d9097168b7#1165311 6024define void @undefshuffle(<8 x i1> %i0, ptr %src, ptr %dst) nounwind { 6025; SSE2-LABEL: undefshuffle: 6026; SSE2: ## %bb.0: ## %else 6027; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) 6028; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 6029; SSE2-NEXT: movd %eax, %xmm0 6030; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 6031; SSE2-NEXT: pinsrw $1, %eax, %xmm0 6032; SSE2-NEXT: pinsrw $2, -{{[0-9]+}}(%rsp), %xmm0 6033; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 6034; SSE2-NEXT: pinsrw $3, %eax, %xmm0 6035; SSE2-NEXT: psllw $15, %xmm0 6036; SSE2-NEXT: packsswb %xmm0, %xmm0 6037; SSE2-NEXT: pmovmskb %xmm0, %eax 6038; SSE2-NEXT: testb $1, %al 6039; SSE2-NEXT: jne LBB32_1 6040; SSE2-NEXT: ## %bb.2: ## %else23 6041; SSE2-NEXT: testb $2, %al 6042; SSE2-NEXT: jne LBB32_3 6043; SSE2-NEXT: LBB32_4: ## %else25 6044; SSE2-NEXT: testb $4, %al 6045; SSE2-NEXT: jne LBB32_5 6046; SSE2-NEXT: LBB32_6: ## %else27 6047; SSE2-NEXT: testb $8, %al 6048; SSE2-NEXT: jne LBB32_7 6049; SSE2-NEXT: LBB32_8: ## %else29 6050; SSE2-NEXT: testb $16, %al 6051; SSE2-NEXT: jne LBB32_9 6052; SSE2-NEXT: LBB32_10: ## %else31 6053; SSE2-NEXT: testb $32, %al 6054; SSE2-NEXT: jne LBB32_11 6055; SSE2-NEXT: LBB32_12: ## %else33 6056; SSE2-NEXT: testb $64, %al 6057; SSE2-NEXT: jne LBB32_13 6058; SSE2-NEXT: LBB32_14: ## %else35 6059; SSE2-NEXT: testb $-128, %al 6060; SSE2-NEXT: jne LBB32_15 6061; SSE2-NEXT: LBB32_16: ## %else37 6062; SSE2-NEXT: retq 6063; SSE2-NEXT: LBB32_1: ## %cond.store 6064; SSE2-NEXT: movl $0, (%rsi) 6065; SSE2-NEXT: testb $2, %al 6066; SSE2-NEXT: je LBB32_4 6067; SSE2-NEXT: LBB32_3: ## %cond.store24 6068; SSE2-NEXT: movl $0, 4(%rsi) 6069; SSE2-NEXT: testb $4, %al 6070; SSE2-NEXT: je LBB32_6 6071; SSE2-NEXT: LBB32_5: ## %cond.store26 6072; SSE2-NEXT: movl $0, 8(%rsi) 6073; SSE2-NEXT: testb $8, %al 6074; SSE2-NEXT: je LBB32_8 6075; SSE2-NEXT: LBB32_7: ## %cond.store28 6076; SSE2-NEXT: movl $0, 12(%rsi) 6077; SSE2-NEXT: testb $16, %al 6078; SSE2-NEXT: je LBB32_10 6079; SSE2-NEXT: LBB32_9: ## %cond.store30 6080; SSE2-NEXT: movl $0, 16(%rsi) 6081; SSE2-NEXT: testb $32, %al 6082; SSE2-NEXT: je LBB32_12 6083; SSE2-NEXT: LBB32_11: ## %cond.store32 6084; SSE2-NEXT: movl $0, 20(%rsi) 6085; SSE2-NEXT: testb $64, %al 6086; SSE2-NEXT: je LBB32_14 6087; SSE2-NEXT: LBB32_13: ## %cond.store34 6088; SSE2-NEXT: movl $0, 24(%rsi) 6089; SSE2-NEXT: testb $-128, %al 6090; SSE2-NEXT: je LBB32_16 6091; SSE2-NEXT: LBB32_15: ## %cond.store36 6092; SSE2-NEXT: movl $0, 28(%rsi) 6093; SSE2-NEXT: retq 6094; 6095; SSE4-LABEL: undefshuffle: 6096; SSE4: ## %bb.0: ## %else 6097; SSE4-NEXT: psllw $15, %xmm0 6098; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 6099; SSE4-NEXT: packsswb %xmm0, %xmm0 6100; SSE4-NEXT: pmovmskb %xmm0, %eax 6101; SSE4-NEXT: testb $1, %al 6102; SSE4-NEXT: jne LBB32_1 6103; SSE4-NEXT: ## %bb.2: ## %else23 6104; SSE4-NEXT: testb $2, %al 6105; SSE4-NEXT: jne LBB32_3 6106; SSE4-NEXT: LBB32_4: ## %else25 6107; SSE4-NEXT: testb $4, %al 6108; SSE4-NEXT: jne LBB32_5 6109; SSE4-NEXT: LBB32_6: ## %else27 6110; SSE4-NEXT: testb $8, %al 6111; SSE4-NEXT: jne LBB32_7 6112; SSE4-NEXT: LBB32_8: ## %else29 6113; SSE4-NEXT: testb $16, %al 6114; SSE4-NEXT: jne LBB32_9 6115; SSE4-NEXT: LBB32_10: ## %else31 6116; SSE4-NEXT: testb $32, %al 6117; SSE4-NEXT: jne LBB32_11 6118; SSE4-NEXT: LBB32_12: ## %else33 6119; SSE4-NEXT: testb $64, %al 6120; SSE4-NEXT: jne LBB32_13 6121; SSE4-NEXT: LBB32_14: ## %else35 6122; SSE4-NEXT: testb $-128, %al 6123; SSE4-NEXT: jne LBB32_15 6124; SSE4-NEXT: LBB32_16: ## %else37 6125; SSE4-NEXT: retq 6126; SSE4-NEXT: LBB32_1: ## %cond.store 6127; SSE4-NEXT: movl $0, (%rsi) 6128; SSE4-NEXT: testb $2, %al 6129; SSE4-NEXT: je LBB32_4 6130; SSE4-NEXT: LBB32_3: ## %cond.store24 6131; SSE4-NEXT: movl $0, 4(%rsi) 6132; SSE4-NEXT: testb $4, %al 6133; SSE4-NEXT: je LBB32_6 6134; SSE4-NEXT: LBB32_5: ## %cond.store26 6135; SSE4-NEXT: movl $0, 8(%rsi) 6136; SSE4-NEXT: testb $8, %al 6137; SSE4-NEXT: je LBB32_8 6138; SSE4-NEXT: LBB32_7: ## %cond.store28 6139; SSE4-NEXT: movl $0, 12(%rsi) 6140; SSE4-NEXT: testb $16, %al 6141; SSE4-NEXT: je LBB32_10 6142; SSE4-NEXT: LBB32_9: ## %cond.store30 6143; SSE4-NEXT: movl $0, 16(%rsi) 6144; SSE4-NEXT: testb $32, %al 6145; SSE4-NEXT: je LBB32_12 6146; SSE4-NEXT: LBB32_11: ## %cond.store32 6147; SSE4-NEXT: movl $0, 20(%rsi) 6148; SSE4-NEXT: testb $64, %al 6149; SSE4-NEXT: je LBB32_14 6150; SSE4-NEXT: LBB32_13: ## %cond.store34 6151; SSE4-NEXT: movl $0, 24(%rsi) 6152; SSE4-NEXT: testb $-128, %al 6153; SSE4-NEXT: je LBB32_16 6154; SSE4-NEXT: LBB32_15: ## %cond.store36 6155; SSE4-NEXT: movl $0, 28(%rsi) 6156; SSE4-NEXT: retq 6157; 6158; AVX1-LABEL: undefshuffle: 6159; AVX1: ## %bb.0: 6160; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 6161; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 6162; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 6163; AVX1-NEXT: vmaskmovps %ymm1, %ymm0, (%rsi) 6164; AVX1-NEXT: vzeroupper 6165; AVX1-NEXT: retq 6166; 6167; AVX2-LABEL: undefshuffle: 6168; AVX2: ## %bb.0: 6169; AVX2-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 6170; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,u,u,2,u,u,u,4,u,u,u,6,u,u,u],zero,ymm0[u,u,u],zero,ymm0[u,u,u],zero,ymm0[u,u,u],zero,ymm0[u,u,u] 6171; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 6172; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 6173; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%rsi) 6174; AVX2-NEXT: vzeroupper 6175; AVX2-NEXT: retq 6176; 6177; AVX512F-LABEL: undefshuffle: 6178; AVX512F: ## %bb.0: 6179; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 6180; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 6181; AVX512F-NEXT: movb $15, %al 6182; AVX512F-NEXT: kmovw %eax, %k1 6183; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} 6184; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 6185; AVX512F-NEXT: vmovdqu32 %zmm0, (%rsi) {%k1} 6186; AVX512F-NEXT: vzeroupper 6187; AVX512F-NEXT: retq 6188; 6189; AVX512VLDQ-LABEL: undefshuffle: 6190; AVX512VLDQ: ## %bb.0: 6191; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0 6192; AVX512VLDQ-NEXT: vpslld $31, %ymm0, %ymm0 6193; AVX512VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 6194; AVX512VLDQ-NEXT: movb $15, %al 6195; AVX512VLDQ-NEXT: kmovw %eax, %k1 6196; AVX512VLDQ-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 {%k1} 6197; AVX512VLDQ-NEXT: vmovdqu32 %ymm1, (%rsi) {%k1} 6198; AVX512VLDQ-NEXT: vzeroupper 6199; AVX512VLDQ-NEXT: retq 6200; 6201; AVX512VLBW-LABEL: undefshuffle: 6202; AVX512VLBW: ## %bb.0: 6203; AVX512VLBW-NEXT: vpsllw $15, %xmm0, %xmm0 6204; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k0 6205; AVX512VLBW-NEXT: movl $15, %eax 6206; AVX512VLBW-NEXT: kmovd %eax, %k1 6207; AVX512VLBW-NEXT: kandd %k1, %k0, %k1 6208; AVX512VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0 6209; AVX512VLBW-NEXT: vmovdqu32 %ymm0, (%rsi) {%k1} 6210; AVX512VLBW-NEXT: vzeroupper 6211; AVX512VLBW-NEXT: retq 6212; 6213; X86-AVX512-LABEL: undefshuffle: 6214; X86-AVX512: ## %bb.0: 6215; X86-AVX512-NEXT: vpsllw $15, %xmm0, %xmm0 6216; X86-AVX512-NEXT: vpmovw2m %xmm0, %k0 6217; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 6218; X86-AVX512-NEXT: movl $15, %ecx 6219; X86-AVX512-NEXT: kmovd %ecx, %k1 6220; X86-AVX512-NEXT: kandd %k1, %k0, %k1 6221; X86-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 6222; X86-AVX512-NEXT: vmovdqu32 %ymm0, (%eax) {%k1} 6223; X86-AVX512-NEXT: vzeroupper 6224; X86-AVX512-NEXT: retl 6225 %i1 = shufflevector <8 x i1> %i0, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 6226 %i2 = shufflevector <16 x i1> %i1, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 6227 %it51 = and <32 x i1> %i2, <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false> 6228 %i3 = shufflevector <32 x i1> %it51, <32 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 6229 %i4 = shufflevector <32 x i1> %it51, <32 x i1> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 6230 %i5 = tail call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %src, i32 1, <8 x i1> %i4, <8 x i32> zeroinitializer) 6231 tail call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr %dst, i32 1, <8 x i1> %i3) 6232 ret void 6233} 6234declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr nocapture, i32 immarg, <8 x i1>, <8 x i32>) 6235 6236declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr, i32, <8 x i1>) 6237declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, i32, <4 x i1>) 6238declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>) 6239declare void @llvm.masked.store.v1f64.p0(<1 x double>, ptr, i32, <1 x i1>) 6240 6241declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32, <16 x i1>) 6242declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>) 6243declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>) 6244declare void @llvm.masked.store.v2f32.p0(<2 x float>, ptr, i32, <2 x i1>) 6245 6246declare void @llvm.masked.store.v16i64.p0(<16 x i64>, ptr, i32, <16 x i1>) 6247declare void @llvm.masked.store.v8i64.p0(<8 x i64>, ptr, i32, <8 x i1>) 6248declare void @llvm.masked.store.v4i64.p0(<4 x i64>, ptr, i32, <4 x i1>) 6249declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32, <2 x i1>) 6250declare void @llvm.masked.store.v1i64.p0(<1 x i64>, ptr, i32, <1 x i1>) 6251 6252declare void @llvm.masked.store.v24i32.p0(<24 x i32>, ptr, i32, <24 x i1>) 6253declare void @llvm.masked.store.v16i32.p0(<16 x i32>, ptr, i32, <16 x i1>) 6254declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32, <8 x i1>) 6255declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>) 6256declare void @llvm.masked.store.v3i32.p0(<3 x i32>, ptr, i32, <3 x i1>) 6257declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>) 6258declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr, i32, <1 x i1>) 6259 6260declare void @llvm.masked.store.v32i16.p0(<32 x i16>, ptr, i32, <32 x i1>) 6261declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr, i32, <16 x i1>) 6262declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>) 6263declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>) 6264 6265declare void @llvm.masked.store.v64i8.p0(<64 x i8>, ptr, i32, <64 x i1>) 6266declare void @llvm.masked.store.v32i8.p0(<32 x i8>, ptr, i32, <32 x i1>) 6267declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>) 6268declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>) 6269