; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512,AVX512-V4
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512-VBMI

define i4 @reverse_cmp_v4i1(<4 x i32> %a0, <4 x i32> %a1) {
; SSE2-LABEL: reverse_cmp_v4i1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE2-NEXT:    movmskps %xmm0, %eax
; SSE2-NEXT:    leal (%rax,%rax), %ecx
; SSE2-NEXT:    andb $4, %cl
; SSE2-NEXT:    leal (,%rax,8), %edx
; SSE2-NEXT:    andb $8, %dl
; SSE2-NEXT:    orb %cl, %dl
; SSE2-NEXT:    movl %eax, %ecx
; SSE2-NEXT:    shrb %cl
; SSE2-NEXT:    andb $2, %cl
; SSE2-NEXT:    orb %dl, %cl
; SSE2-NEXT:    shrb $3, %al
; SSE2-NEXT:    orb %cl, %al
; SSE2-NEXT:    # kill: def $al killed $al killed $rax
; SSE2-NEXT:    retq
;
; SSE42-LABEL: reverse_cmp_v4i1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; SSE42-NEXT:    movmskps %xmm0, %eax
; SSE42-NEXT:    # kill: def $al killed $al killed $eax
; SSE42-NEXT:    retq
;
; AVX2-LABEL: reverse_cmp_v4i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX2-NEXT:    vmovmskps %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: reverse_cmp_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    retq
  %cmp = icmp eq <4 x i32> %a0, %a1
  %mask = bitcast <4 x i1> %cmp to i4
  %rev = tail call i4 @llvm.bitreverse.i4(i4 %mask)
  ret i4 %rev
}
declare i4 @llvm.bitreverse.i4(i4)

define i8 @reverse_cmp_v8i1(<8 x i16> %a0, <8 x i16> %a1) {
; SSE2-LABEL: reverse_cmp_v8i1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqw %xmm1, %xmm0
; SSE2-NEXT:    packsswb %xmm0, %xmm0
; SSE2-NEXT:    pmovmskb %xmm0, %eax
; SSE2-NEXT:    rolb $4, %al
; SSE2-NEXT:    movl %eax, %ecx
; SSE2-NEXT:    andb $51, %cl
; SSE2-NEXT:    shlb $2, %cl
; SSE2-NEXT:    shrb $2, %al
; SSE2-NEXT:    andb $51, %al
; SSE2-NEXT:    orb %cl, %al
; SSE2-NEXT:    movl %eax, %ecx
; SSE2-NEXT:    andb $85, %cl
; SSE2-NEXT:    addb %cl, %cl
; SSE2-NEXT:    shrb %al
; SSE2-NEXT:    andb $85, %al
; SSE2-NEXT:    orb %cl, %al
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE42-LABEL: reverse_cmp_v8i1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pcmpeqw %xmm1, %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1]
; SSE42-NEXT:    packsswb %xmm0, %xmm0
; SSE42-NEXT:    pmovmskb %xmm0, %eax
; SSE42-NEXT:    # kill: def $al killed $al killed $eax
; SSE42-NEXT:    retq
;
; AVX2-LABEL: reverse_cmp_v8i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,12,10,8,6,4,2,0,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpmovmskb %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: reverse_cmp_v8i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %ymm0
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
; AVX512-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpmovd2m %ymm0, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %cmp = icmp eq <8 x i16> %a0, %a1
  %mask = bitcast <8 x i1> %cmp to i8
  %rev = tail call i8 @llvm.bitreverse.i8(i8 %mask)
  ret i8 %rev
}
declare i8 @llvm.bitreverse.i8(i8)

define i16 @reverse_cmp_v16i1(<16 x i8> %a0, <16 x i8> %a1) {
; SSE2-LABEL: reverse_cmp_v16i1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
; SSE2-NEXT:    pmovmskb %xmm0, %eax
; SSE2-NEXT:    rolw $8, %ax
; SSE2-NEXT:    movl %eax, %ecx
; SSE2-NEXT:    andl $3855, %ecx # imm = 0xF0F
; SSE2-NEXT:    shll $4, %ecx
; SSE2-NEXT:    shrl $4, %eax
; SSE2-NEXT:    andl $3855, %eax # imm = 0xF0F
; SSE2-NEXT:    orl %ecx, %eax
; SSE2-NEXT:    movl %eax, %ecx
; SSE2-NEXT:    andl $13107, %ecx # imm = 0x3333
; SSE2-NEXT:    shrl $2, %eax
; SSE2-NEXT:    andl $13107, %eax # imm = 0x3333
; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
; SSE2-NEXT:    movl %eax, %ecx
; SSE2-NEXT:    andl $21845, %ecx # imm = 0x5555
; SSE2-NEXT:    shrl %eax
; SSE2-NEXT:    andl $21845, %eax # imm = 0x5555
; SSE2-NEXT:    leal (%rax,%rcx,2), %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSE42-LABEL: reverse_cmp_v16i1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pcmpeqb %xmm1, %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; SSE42-NEXT:    pmovmskb %xmm0, %eax
; SSE42-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE42-NEXT:    retq
;
; AVX2-LABEL: reverse_cmp_v16i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX2-NEXT:    vpmovmskb %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
;
; AVX512-LABEL: reverse_cmp_v16i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0
; AVX512-NEXT:    vpmovm2w %k0, %ymm0
; AVX512-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpmovw2m %ymm0, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %cmp = icmp eq <16 x i8> %a0, %a1
  %mask = bitcast <16 x i1> %cmp to i16
  %rev = tail call i16 @llvm.bitreverse.i16(i16 %mask)
  ret i16 %rev
}
declare i16 @llvm.bitreverse.i16(i16)

define i32 @reverse_cmp_v32i1(<32 x i8> %a0, <32 x i8> %a1) {
; SSE2-LABEL: reverse_cmp_v32i1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqb %xmm2, %xmm0
; SSE2-NEXT:    pmovmskb %xmm0, %eax
; SSE2-NEXT:    pcmpeqb %xmm3, %xmm1
; SSE2-NEXT:    pmovmskb %xmm1, %ecx
; SSE2-NEXT:    shll $16, %ecx
; SSE2-NEXT:    orl %eax, %ecx
; SSE2-NEXT:    bswapl %ecx
; SSE2-NEXT:    movl %ecx, %eax
; SSE2-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; SSE2-NEXT:    shll $4, %eax
; SSE2-NEXT:    shrl $4, %ecx
; SSE2-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
; SSE2-NEXT:    orl %eax, %ecx
; SSE2-NEXT:    movl %ecx, %eax
; SSE2-NEXT:    andl $858993459, %eax # imm = 0x33333333
; SSE2-NEXT:    shrl $2, %ecx
; SSE2-NEXT:    andl $858993459, %ecx # imm = 0x33333333
; SSE2-NEXT:    leal (%rcx,%rax,4), %eax
; SSE2-NEXT:    movl %eax, %ecx
; SSE2-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; SSE2-NEXT:    shrl %eax
; SSE2-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; SSE2-NEXT:    leal (%rax,%rcx,2), %eax
; SSE2-NEXT:    retq
;
; SSE42-LABEL: reverse_cmp_v32i1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pcmpeqb %xmm2, %xmm0
; SSE42-NEXT:    pcmpeqb %xmm3, %xmm1
; SSE42-NEXT:    movdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; SSE42-NEXT:    pshufb %xmm2, %xmm1
; SSE42-NEXT:    pmovmskb %xmm1, %ecx
; SSE42-NEXT:    pshufb %xmm2, %xmm0
; SSE42-NEXT:    pmovmskb %xmm0, %eax
; SSE42-NEXT:    shll $16, %eax
; SSE42-NEXT:    orl %ecx, %eax
; SSE42-NEXT:    retq
;
; AVX2-LABEL: reverse_cmp_v32i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vpmovmskb %ymm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-V4-LABEL: reverse_cmp_v32i1:
; AVX512-V4:       # %bb.0:
; AVX512-V4-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0
; AVX512-V4-NEXT:    vpmovm2b %k0, %ymm0
; AVX512-V4-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
; AVX512-V4-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512-V4-NEXT:    vpmovb2m %ymm0, %k0
; AVX512-V4-NEXT:    kmovd %k0, %eax
; AVX512-V4-NEXT:    vzeroupper
; AVX512-V4-NEXT:    retq
;
; AVX512-VBMI-LABEL: reverse_cmp_v32i1:
; AVX512-VBMI:       # %bb.0:
; AVX512-VBMI-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0
; AVX512-VBMI-NEXT:    vpmovm2b %k0, %ymm0
; AVX512-VBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512-VBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0
; AVX512-VBMI-NEXT:    vpmovb2m %ymm0, %k0
; AVX512-VBMI-NEXT:    kmovd %k0, %eax
; AVX512-VBMI-NEXT:    vzeroupper
; AVX512-VBMI-NEXT:    retq
  %cmp = icmp eq <32 x i8> %a0, %a1
  %mask = bitcast <32 x i1> %cmp to i32
  %rev = tail call i32 @llvm.bitreverse.i32(i32 %mask)
  ret i32 %rev
}
declare i32 @llvm.bitreverse.i32(i32)

define i64 @reverse_cmp_v64i1(<64 x i8> %a0, <64 x i8> %a1) {
; SSE2-LABEL: reverse_cmp_v64i1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqb %xmm4, %xmm0
; SSE2-NEXT:    pmovmskb %xmm0, %eax
; SSE2-NEXT:    pcmpeqb %xmm5, %xmm1
; SSE2-NEXT:    pmovmskb %xmm1, %ecx
; SSE2-NEXT:    shll $16, %ecx
; SSE2-NEXT:    orl %eax, %ecx
; SSE2-NEXT:    pcmpeqb %xmm6, %xmm2
; SSE2-NEXT:    pmovmskb %xmm2, %eax
; SSE2-NEXT:    pcmpeqb %xmm7, %xmm3
; SSE2-NEXT:    pmovmskb %xmm3, %edx
; SSE2-NEXT:    shll $16, %edx
; SSE2-NEXT:    orl %eax, %edx
; SSE2-NEXT:    shlq $32, %rdx
; SSE2-NEXT:    orq %rcx, %rdx
; SSE2-NEXT:    bswapq %rdx
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $4, %rax
; SSE2-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; SSE2-NEXT:    andq %rcx, %rax
; SSE2-NEXT:    andq %rcx, %rdx
; SSE2-NEXT:    shlq $4, %rdx
; SSE2-NEXT:    orq %rax, %rdx
; SSE2-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; SSE2-NEXT:    movq %rdx, %rcx
; SSE2-NEXT:    andq %rax, %rcx
; SSE2-NEXT:    shrq $2, %rdx
; SSE2-NEXT:    andq %rax, %rdx
; SSE2-NEXT:    leaq (%rdx,%rcx,4), %rax
; SSE2-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; SSE2-NEXT:    movq %rax, %rdx
; SSE2-NEXT:    andq %rcx, %rdx
; SSE2-NEXT:    shrq %rax
; SSE2-NEXT:    andq %rcx, %rax
; SSE2-NEXT:    leaq (%rax,%rdx,2), %rax
; SSE2-NEXT:    retq
;
; SSE42-LABEL: reverse_cmp_v64i1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pcmpeqb %xmm4, %xmm0
; SSE42-NEXT:    pcmpeqb %xmm5, %xmm1
; SSE42-NEXT:    pcmpeqb %xmm6, %xmm2
; SSE42-NEXT:    pcmpeqb %xmm7, %xmm3
; SSE42-NEXT:    movdqa {{.*#+}} xmm4 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; SSE42-NEXT:    pshufb %xmm4, %xmm3
; SSE42-NEXT:    pmovmskb %xmm3, %eax
; SSE42-NEXT:    pshufb %xmm4, %xmm2
; SSE42-NEXT:    pmovmskb %xmm2, %ecx
; SSE42-NEXT:    shll $16, %ecx
; SSE42-NEXT:    orl %eax, %ecx
; SSE42-NEXT:    pshufb %xmm4, %xmm1
; SSE42-NEXT:    pmovmskb %xmm1, %edx
; SSE42-NEXT:    pshufb %xmm4, %xmm0
; SSE42-NEXT:    pmovmskb %xmm0, %eax
; SSE42-NEXT:    shll $16, %eax
; SSE42-NEXT:    orl %edx, %eax
; SSE42-NEXT:    shlq $32, %rax
; SSE42-NEXT:    orq %rcx, %rax
; SSE42-NEXT:    retq
;
; AVX2-LABEL: reverse_cmp_v64i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpeqb %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT:    vpmovmskb %ymm1, %ecx
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vpmovmskb %ymm0, %eax
; AVX2-NEXT:    shlq $32, %rax
; AVX2-NEXT:    orq %rcx, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-V4-LABEL: reverse_cmp_v64i1:
; AVX512-V4:       # %bb.0:
; AVX512-V4-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
; AVX512-V4-NEXT:    vpmovm2b %k0, %zmm0
; AVX512-V4-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48]
; AVX512-V4-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
; AVX512-V4-NEXT:    vpmovb2m %zmm0, %k0
; AVX512-V4-NEXT:    kmovq %k0, %rax
; AVX512-V4-NEXT:    vzeroupper
; AVX512-V4-NEXT:    retq
;
; AVX512-VBMI-LABEL: reverse_cmp_v64i1:
; AVX512-VBMI:       # %bb.0:
; AVX512-VBMI-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
; AVX512-VBMI-NEXT:    vpmovm2b %k0, %zmm0
; AVX512-VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512-VBMI-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512-VBMI-NEXT:    vpmovb2m %zmm0, %k0
; AVX512-VBMI-NEXT:    kmovq %k0, %rax
; AVX512-VBMI-NEXT:    vzeroupper
; AVX512-VBMI-NEXT:    retq
  %cmp = icmp eq <64 x i8> %a0, %a1
  %mask = bitcast <64 x i1> %cmp to i64
  %rev = tail call i64 @llvm.bitreverse.i64(i64 %mask)
  ret i64 %rev
}
declare i64 @llvm.bitreverse.i64(i64)

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; SSE: {{.*}}