; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=ALL,SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefixes=ALL,GFNISSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX512,GFNIAVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX512,GFNIAVX512BW

; Make sure we don't crash with avx512bw and xop
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw

define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-LABEL: test_bitreverse_i8:
; SSE:       # %bb.0:
; SSE-NEXT:    rolb $4, %dil
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andb $51, %al
; SSE-NEXT:    shlb $2, %al
; SSE-NEXT:    shrb $2, %dil
; SSE-NEXT:    andb $51, %dil
; SSE-NEXT:    orb %dil, %al
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andb $85, %cl
; SSE-NEXT:    addb %cl, %cl
; SSE-NEXT:    shrb %al
; SSE-NEXT:    andb $85, %al
; SSE-NEXT:    orb %cl, %al
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i8:
; AVX:       # %bb.0:
; AVX-NEXT:    rolb $4, %dil
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andb $51, %al
; AVX-NEXT:    shlb $2, %al
; AVX-NEXT:    shrb $2, %dil
; AVX-NEXT:    andb $51, %dil
; AVX-NEXT:    orb %dil, %al
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andb $85, %cl
; AVX-NEXT:    addb %cl, %cl
; AVX-NEXT:    shrb %al
; AVX-NEXT:    andb $85, %al
; AVX-NEXT:    orb %cl, %al
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    # kill: def $al killed $al killed $eax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i8:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movd %edi, %xmm0
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    movd %xmm0, %eax
; GFNISSE-NEXT:    # kill: def $al killed $al killed $eax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i8:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vmovd %edi, %xmm0
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    vmovd %xmm0, %eax
; GFNIAVX-NEXT:    # kill: def $al killed $al killed $eax
; GFNIAVX-NEXT:    retq
  %b = call i8 @llvm.bitreverse.i8(i8 %a)
  ret i8 %b
}

define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-LABEL: test_bitreverse_i16:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    rolw $8, %di
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $3855, %eax # imm = 0xF0F
; SSE-NEXT:    shll $4, %eax
; SSE-NEXT:    shrl $4, %edi
; SSE-NEXT:    andl $3855, %edi # imm = 0xF0F
; SSE-NEXT:    orl %eax, %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $13107, %eax # imm = 0x3333
; SSE-NEXT:    shrl $2, %edi
; SSE-NEXT:    andl $13107, %edi # imm = 0x3333
; SSE-NEXT:    leal (%rdi,%rax,4), %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $21845, %ecx # imm = 0x5555
; SSE-NEXT:    shrl %eax
; SSE-NEXT:    andl $21845, %eax # imm = 0x5555
; SSE-NEXT:    leal (%rax,%rcx,2), %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i16:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    rolw $8, %di
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $3855, %eax # imm = 0xF0F
; AVX-NEXT:    shll $4, %eax
; AVX-NEXT:    shrl $4, %edi
; AVX-NEXT:    andl $3855, %edi # imm = 0xF0F
; AVX-NEXT:    orl %eax, %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $13107, %eax # imm = 0x3333
; AVX-NEXT:    shrl $2, %edi
; AVX-NEXT:    andl $13107, %edi # imm = 0x3333
; AVX-NEXT:    leal (%rdi,%rax,4), %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $21845, %ecx # imm = 0x5555
; AVX-NEXT:    shrl %eax
; AVX-NEXT:    andl $21845, %eax # imm = 0x5555
; AVX-NEXT:    leal (%rax,%rcx,2), %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    # kill: def $ax killed $ax killed $eax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i16:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movd %edi, %xmm0
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    movd %xmm0, %eax
; GFNISSE-NEXT:    rolw $8, %ax
; GFNISSE-NEXT:    # kill: def $ax killed $ax killed $eax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i16:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vmovd %edi, %xmm0
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    vmovd %xmm0, %eax
; GFNIAVX-NEXT:    rolw $8, %ax
; GFNIAVX-NEXT:    # kill: def $ax killed $ax killed $eax
; GFNIAVX-NEXT:    retq
  %b = call i16 @llvm.bitreverse.i16(i16 %a)
  ret i16 %b
}

define i32 @test_bitreverse_i32(i32 %a) nounwind {
; SSE-LABEL: test_bitreverse_i32:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    bswapl %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; SSE-NEXT:    shll $4, %eax
; SSE-NEXT:    shrl $4, %edi
; SSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
; SSE-NEXT:    orl %eax, %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
; SSE-NEXT:    shrl $2, %edi
; SSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
; SSE-NEXT:    leal (%rdi,%rax,4), %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; SSE-NEXT:    shrl %eax
; SSE-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; SSE-NEXT:    leal (%rax,%rcx,2), %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    bswapl %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; AVX-NEXT:    shll $4, %eax
; AVX-NEXT:    shrl $4, %edi
; AVX-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
; AVX-NEXT:    orl %eax, %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
; AVX-NEXT:    shrl $2, %edi
; AVX-NEXT:    andl $858993459, %edi # imm = 0x33333333
; AVX-NEXT:    leal (%rdi,%rax,4), %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; AVX-NEXT:    shrl %eax
; AVX-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; AVX-NEXT:    leal (%rax,%rcx,2), %eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i32:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movd %edi, %xmm0
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    movd %xmm0, %eax
; GFNISSE-NEXT:    bswapl %eax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i32:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vmovd %edi, %xmm0
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    vmovd %xmm0, %eax
; GFNIAVX-NEXT:    bswapl %eax
; GFNIAVX-NEXT:    retq
  %b = call i32 @llvm.bitreverse.i32(i32 %a)
  ret i32 %b
}

define i64 @test_bitreverse_i64(i64 %a) nounwind {
; SSE-LABEL: test_bitreverse_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    bswapq %rdi
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    shrq $4, %rax
; SSE-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; SSE-NEXT:    andq %rcx, %rax
; SSE-NEXT:    andq %rcx, %rdi
; SSE-NEXT:    shlq $4, %rdi
; SSE-NEXT:    orq %rax, %rdi
; SSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    andq %rax, %rcx
; SSE-NEXT:    shrq $2, %rdi
; SSE-NEXT:    andq %rax, %rdi
; SSE-NEXT:    leaq (%rdi,%rcx,4), %rax
; SSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; SSE-NEXT:    movq %rax, %rdx
; SSE-NEXT:    andq %rcx, %rdx
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    andq %rcx, %rax
; SSE-NEXT:    leaq (%rax,%rdx,2), %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    bswapq %rdi
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    shrq $4, %rax
; AVX-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; AVX-NEXT:    andq %rcx, %rax
; AVX-NEXT:    andq %rcx, %rdi
; AVX-NEXT:    shlq $4, %rdi
; AVX-NEXT:    orq %rax, %rdi
; AVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    andq %rax, %rcx
; AVX-NEXT:    shrq $2, %rdi
; AVX-NEXT:    andq %rax, %rdi
; AVX-NEXT:    leaq (%rdi,%rcx,4), %rax
; AVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; AVX-NEXT:    movq %rax, %rdx
; AVX-NEXT:    andq %rcx, %rdx
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    andq %rcx, %rax
; AVX-NEXT:    leaq (%rax,%rdx,2), %rax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovq %rdi, %xmm0
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovq %xmm0, %rax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i64:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movq %rdi, %xmm0
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    movq %xmm0, %rax
; GFNISSE-NEXT:    bswapq %rax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i64:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vmovq %rdi, %xmm0
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    vmovq %xmm0, %rax
; GFNIAVX-NEXT:    bswapq %rax
; GFNIAVX-NEXT:    retq
  %b = call i64 @llvm.bitreverse.i64(i64 %a)
  ret i64 %b
}

define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v16i8:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v16i8:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
  ret <16 x i8> %b
}

define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    psllw $8, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v8i16:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v8i16:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    retq
  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
  ret <8 x i16> %b
}

define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v4i32:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v4i32:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    retq
  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
  ret <4 x i32> %b
}

define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v2i64:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v2i64:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    retq
  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
  ret <2 x i64> %b
}

define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrlw $4, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrlw $2, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psrlw $1, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    psrlw $4, %xmm5
; SSE2-NEXT:    pand %xmm2, %xmm5
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    psllw $4, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v32i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm4, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm5, %xmm6
; SSSE3-NEXT:    pshufb %xmm2, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm6, %xmm3
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm5
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; # ymm3 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; # ymm1 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; # ymm3 = mem[0,1,0,1]
; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; # ymm1 = mem[0,1,0,1]
; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v32i8:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm1
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v32i8:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; GFNIAVX-NEXT:    retq
  %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
  ret <32 x i8> %b
}

define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    psllw $8, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrlw $4, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrlw $2, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psrlw $1, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    psrlw $8, %xmm5
; SSE2-NEXT:    psllw $8, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    psrlw $4, %xmm5
; SSE2-NEXT:    pand %xmm2, %xmm5
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    psllw $4, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v16i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT:    pshufb %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm5, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm6, %xmm7
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm7, %xmm3
; SSSE3-NEXT:    pshufb %xmm4, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm5, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm6, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; # ymm3 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; # ymm1 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX512-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; # ymm3 = mem[0,1,0,1]
; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; # ymm1 = mem[0,1,0,1]
; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v16i16:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
; GFNISSE-NEXT:    retq
;
; GFNIAVX1-LABEL: test_bitreverse_v16i16:
; GFNIAVX1:       # %bb.0:
; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX1-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_v16i16:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512-LABEL: test_bitreverse_v16i16:
; GFNIAVX512:       # %bb.0:
; GFNIAVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; GFNIAVX512-NEXT:    retq
  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
  ret <16 x i16> %b
}

define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrlw $4, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psrlw $2, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm6, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $4, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    psllw $4, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm2
; SSE2-NEXT:    pand %xmm5, %xmm2
; SSE2-NEXT:    pand %xmm5, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v8i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT:    pshufb %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm5, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm6, %xmm7
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm7, %xmm3
; SSSE3-NEXT:    pshufb %xmm4, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm5, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm6, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; # ymm3 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; # ymm1 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; AVX512-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; # ymm3 = mem[0,1,0,1]
; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; # ymm1 = mem[0,1,0,1]
; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v8i32:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
; GFNISSE-NEXT:    retq
;
; GFNIAVX1-LABEL: test_bitreverse_v8i32:
; GFNIAVX1:       # %bb.0:
; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX1-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_v8i32:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512-LABEL: test_bitreverse_v8i32:
; GFNIAVX512:       # %bb.0:
; GFNIAVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; GFNIAVX512-NEXT:    retq
  %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
  ret <8 x i32> %b
}

define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrlw $4, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psrlw $2, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm6, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $4, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    psllw $4, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm2
; SSE2-NEXT:    pand %xmm5, %xmm2
; SSE2-NEXT:    pand %xmm5, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v4i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT:    pshufb %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
SSSE3-NEXT: movdqa %xmm0, %xmm2 1351; SSSE3-NEXT: pand %xmm5, %xmm2 1352; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1353; SSSE3-NEXT: movdqa %xmm6, %xmm7 1354; SSSE3-NEXT: pshufb %xmm2, %xmm7 1355; SSSE3-NEXT: psrlw $4, %xmm0 1356; SSSE3-NEXT: pand %xmm5, %xmm0 1357; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1358; SSSE3-NEXT: movdqa %xmm2, %xmm3 1359; SSSE3-NEXT: pshufb %xmm0, %xmm3 1360; SSSE3-NEXT: por %xmm7, %xmm3 1361; SSSE3-NEXT: pshufb %xmm4, %xmm1 1362; SSSE3-NEXT: movdqa %xmm1, %xmm0 1363; SSSE3-NEXT: pand %xmm5, %xmm0 1364; SSSE3-NEXT: pshufb %xmm0, %xmm6 1365; SSSE3-NEXT: psrlw $4, %xmm1 1366; SSSE3-NEXT: pand %xmm5, %xmm1 1367; SSSE3-NEXT: pshufb %xmm1, %xmm2 1368; SSSE3-NEXT: por %xmm6, %xmm2 1369; SSSE3-NEXT: movdqa %xmm3, %xmm0 1370; SSSE3-NEXT: movdqa %xmm2, %xmm1 1371; SSSE3-NEXT: retq 1372; 1373; AVX1-LABEL: test_bitreverse_v4i64: 1374; AVX1: # %bb.0: 1375; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1376; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1377; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1378; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1379; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1380; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1381; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1382; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1383; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1384; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1385; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1386; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1387; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1388; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 1389; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 1390; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1391; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1392; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1393; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1394; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1395; AVX1-NEXT: retq 1396; 1397; AVX2-LABEL: test_bitreverse_v4i64: 1398; AVX2: # %bb.0: 1399; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1400; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1401; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1402; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1403; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 1404; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1405; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1406; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1407; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1408; AVX2-NEXT: # ymm1 = mem[0,1,0,1] 1409; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1410; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1411; AVX2-NEXT: retq 1412; 1413; AVX512-LABEL: test_bitreverse_v4i64: 1414; AVX512: # %bb.0: 1415; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1416; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1417; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1418; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1419; AVX512-NEXT: # ymm3 = mem[0,1,0,1] 1420; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1421; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1422; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1423; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1424; AVX512-NEXT: # ymm1 = mem[0,1,0,1] 1425; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1426; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1427; AVX512-NEXT: retq 1428; 1429; XOPAVX1-LABEL: test_bitreverse_v4i64: 1430; XOPAVX1: # %bb.0: 1431; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1432; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 1433; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1434; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1435; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1436; XOPAVX1-NEXT: retq 1437; 1438; XOPAVX2-LABEL: test_bitreverse_v4i64: 1439; XOPAVX2: # %bb.0: 1440; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1441; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 1442; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1443; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1444; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1445; XOPAVX2-NEXT: retq 1446; 1447; GFNISSE-LABEL: test_bitreverse_v4i64: 1448; GFNISSE: # %bb.0: 1449; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1450; GFNISSE-NEXT: pshufb %xmm2, %xmm0 1451; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 1452; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 1453; GFNISSE-NEXT: pshufb %xmm2, %xmm1 1454; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 1455; GFNISSE-NEXT: retq 1456; 1457; GFNIAVX1-LABEL: test_bitreverse_v4i64: 1458; GFNIAVX1: # %bb.0: 1459; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1460; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 1461; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1462; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1463; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1464; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 1465; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 1466; GFNIAVX1-NEXT: retq 1467; 1468; GFNIAVX2-LABEL: test_bitreverse_v4i64: 1469; GFNIAVX2: # %bb.0: 1470; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1471; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1472; GFNIAVX2-NEXT: retq 1473; 1474; GFNIAVX512-LABEL: test_bitreverse_v4i64: 1475; GFNIAVX512: # %bb.0: 1476; GFNIAVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 1477; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1478; GFNIAVX512-NEXT: retq 1479 %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) 1480 ret <4 x i64> %b 1481} 1482 1483define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { 1484; SSE2-LABEL: test_bitreverse_v64i8: 1485; SSE2: # %bb.0: 1486; SSE2-NEXT: movdqa %xmm0, %xmm5 1487; SSE2-NEXT: psrlw $4, %xmm5 1488; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1489; SSE2-NEXT: pand %xmm4, %xmm5 1490; SSE2-NEXT: pand %xmm4, %xmm0 1491; 
SSE2-NEXT: psllw $4, %xmm0 1492; SSE2-NEXT: por %xmm5, %xmm0 1493; SSE2-NEXT: movdqa %xmm0, %xmm6 1494; SSE2-NEXT: psrlw $2, %xmm6 1495; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1496; SSE2-NEXT: pand %xmm5, %xmm6 1497; SSE2-NEXT: pand %xmm5, %xmm0 1498; SSE2-NEXT: psllw $2, %xmm0 1499; SSE2-NEXT: por %xmm6, %xmm0 1500; SSE2-NEXT: movdqa %xmm0, %xmm7 1501; SSE2-NEXT: psrlw $1, %xmm7 1502; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1503; SSE2-NEXT: pand %xmm6, %xmm7 1504; SSE2-NEXT: pand %xmm6, %xmm0 1505; SSE2-NEXT: paddb %xmm0, %xmm0 1506; SSE2-NEXT: por %xmm7, %xmm0 1507; SSE2-NEXT: movdqa %xmm1, %xmm7 1508; SSE2-NEXT: psrlw $4, %xmm7 1509; SSE2-NEXT: pand %xmm4, %xmm7 1510; SSE2-NEXT: pand %xmm4, %xmm1 1511; SSE2-NEXT: psllw $4, %xmm1 1512; SSE2-NEXT: por %xmm7, %xmm1 1513; SSE2-NEXT: movdqa %xmm1, %xmm7 1514; SSE2-NEXT: psrlw $2, %xmm7 1515; SSE2-NEXT: pand %xmm5, %xmm7 1516; SSE2-NEXT: pand %xmm5, %xmm1 1517; SSE2-NEXT: psllw $2, %xmm1 1518; SSE2-NEXT: por %xmm7, %xmm1 1519; SSE2-NEXT: movdqa %xmm1, %xmm7 1520; SSE2-NEXT: psrlw $1, %xmm7 1521; SSE2-NEXT: pand %xmm6, %xmm7 1522; SSE2-NEXT: pand %xmm6, %xmm1 1523; SSE2-NEXT: paddb %xmm1, %xmm1 1524; SSE2-NEXT: por %xmm7, %xmm1 1525; SSE2-NEXT: movdqa %xmm2, %xmm7 1526; SSE2-NEXT: psrlw $4, %xmm7 1527; SSE2-NEXT: pand %xmm4, %xmm7 1528; SSE2-NEXT: pand %xmm4, %xmm2 1529; SSE2-NEXT: psllw $4, %xmm2 1530; SSE2-NEXT: por %xmm7, %xmm2 1531; SSE2-NEXT: movdqa %xmm2, %xmm7 1532; SSE2-NEXT: psrlw $2, %xmm7 1533; SSE2-NEXT: pand %xmm5, %xmm7 1534; SSE2-NEXT: pand %xmm5, %xmm2 1535; SSE2-NEXT: psllw $2, %xmm2 1536; SSE2-NEXT: por %xmm7, %xmm2 1537; SSE2-NEXT: movdqa %xmm2, %xmm7 1538; SSE2-NEXT: psrlw $1, %xmm7 1539; SSE2-NEXT: pand %xmm6, %xmm7 1540; SSE2-NEXT: pand %xmm6, %xmm2 1541; SSE2-NEXT: paddb %xmm2, %xmm2 1542; SSE2-NEXT: por %xmm7, %xmm2 1543; SSE2-NEXT: movdqa %xmm3, %xmm7 1544; SSE2-NEXT: psrlw $4, %xmm7 1545; SSE2-NEXT: pand %xmm4, %xmm7 1546; SSE2-NEXT: pand %xmm4, %xmm3 1547; SSE2-NEXT: psllw $4, %xmm3 1548; SSE2-NEXT: por %xmm7, %xmm3 1549; SSE2-NEXT: movdqa %xmm3, %xmm4 1550; SSE2-NEXT: psrlw $2, %xmm4 1551; SSE2-NEXT: pand %xmm5, %xmm4 1552; SSE2-NEXT: pand %xmm5, %xmm3 1553; SSE2-NEXT: psllw $2, %xmm3 1554; SSE2-NEXT: por %xmm4, %xmm3 1555; SSE2-NEXT: movdqa %xmm3, %xmm4 1556; SSE2-NEXT: psrlw $1, %xmm4 1557; SSE2-NEXT: pand %xmm6, %xmm4 1558; SSE2-NEXT: pand %xmm6, %xmm3 1559; SSE2-NEXT: paddb %xmm3, %xmm3 1560; SSE2-NEXT: por %xmm4, %xmm3 1561; SSE2-NEXT: retq 1562; 1563; SSSE3-LABEL: test_bitreverse_v64i8: 1564; SSSE3: # %bb.0: 1565; SSSE3-NEXT: movdqa %xmm0, %xmm5 1566; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1567; SSSE3-NEXT: pand %xmm8, %xmm0 1568; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1569; SSSE3-NEXT: movdqa %xmm7, %xmm6 1570; SSSE3-NEXT: pshufb %xmm0, %xmm6 1571; SSSE3-NEXT: psrlw $4, %xmm5 1572; SSSE3-NEXT: pand %xmm8, %xmm5 1573; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1574; SSSE3-NEXT: movdqa %xmm4, %xmm0 1575; SSSE3-NEXT: pshufb %xmm5, %xmm0 1576; SSSE3-NEXT: por %xmm6, %xmm0 1577; SSSE3-NEXT: movdqa %xmm1, %xmm5 1578; SSSE3-NEXT: pand %xmm8, %xmm5 1579; SSSE3-NEXT: movdqa %xmm7, %xmm6 1580; SSSE3-NEXT: pshufb %xmm5, %xmm6 1581; SSSE3-NEXT: psrlw $4, %xmm1 1582; SSSE3-NEXT: pand %xmm8, %xmm1 1583; SSSE3-NEXT: movdqa %xmm4, %xmm5 1584; SSSE3-NEXT: pshufb %xmm1, %xmm5 1585; SSSE3-NEXT: por %xmm6, %xmm5 
1586; SSSE3-NEXT: movdqa %xmm2, %xmm1 1587; SSSE3-NEXT: pand %xmm8, %xmm1 1588; SSSE3-NEXT: movdqa %xmm7, %xmm9 1589; SSSE3-NEXT: pshufb %xmm1, %xmm9 1590; SSSE3-NEXT: psrlw $4, %xmm2 1591; SSSE3-NEXT: pand %xmm8, %xmm2 1592; SSSE3-NEXT: movdqa %xmm4, %xmm6 1593; SSSE3-NEXT: pshufb %xmm2, %xmm6 1594; SSSE3-NEXT: por %xmm9, %xmm6 1595; SSSE3-NEXT: movdqa %xmm3, %xmm1 1596; SSSE3-NEXT: pand %xmm8, %xmm1 1597; SSSE3-NEXT: pshufb %xmm1, %xmm7 1598; SSSE3-NEXT: psrlw $4, %xmm3 1599; SSSE3-NEXT: pand %xmm8, %xmm3 1600; SSSE3-NEXT: pshufb %xmm3, %xmm4 1601; SSSE3-NEXT: por %xmm7, %xmm4 1602; SSSE3-NEXT: movdqa %xmm5, %xmm1 1603; SSSE3-NEXT: movdqa %xmm6, %xmm2 1604; SSSE3-NEXT: movdqa %xmm4, %xmm3 1605; SSSE3-NEXT: retq 1606; 1607; AVX1-LABEL: test_bitreverse_v64i8: 1608; AVX1: # %bb.0: 1609; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1610; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1611; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 1612; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1613; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1614; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1615; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1616; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1617; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 1618; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 1619; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm4 1620; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1621; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1622; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1623; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1624; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0 1625; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1626; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1627; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 1628; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1629; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1630; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1631; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 1632; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 1633; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1634; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1635; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1636; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1637; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1638; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1639; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1640; AVX1-NEXT: retq 1641; 1642; AVX2-LABEL: test_bitreverse_v64i8: 1643; AVX2: # %bb.0: 1644; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1645; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 1646; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1647; AVX2-NEXT: # ymm4 = mem[0,1,0,1] 1648; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1649; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1650; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 1651; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1652; AVX2-NEXT: # ymm5 = mem[0,1,0,1] 1653; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0 1654; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 1655; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 1656; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1657; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 1658; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 1659; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1 1660; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 1661; AVX2-NEXT: retq 1662; 1663; AVX512F-LABEL: test_bitreverse_v64i8: 1664; AVX512F: # %bb.0: 1665; AVX512F-NEXT: vextracti64x4 $1, 
%zmm0, %ymm1 1666; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1667; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 1668; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1669; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] 1670; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1671; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 1672; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm4 1673; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 1674; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 1675; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 1676; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1677; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] 1678; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 1679; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 1680; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 1681; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 1682; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1683; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0 1684; AVX512F-NEXT: retq 1685; 1686; AVX512BW-LABEL: test_bitreverse_v64i8: 1687; AVX512BW: # %bb.0: 1688; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1689; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 1690; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1691; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1692; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 1693; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 1694; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 1695; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1696; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1697; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 1698; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 1699; AVX512BW-NEXT: retq 1700; 1701; XOPAVX1-LABEL: test_bitreverse_v64i8: 1702; XOPAVX1: # %bb.0: 1703; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1704; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1705; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1706; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1707; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1708; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1709; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1710; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1711; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1712; XOPAVX1-NEXT: retq 1713; 1714; XOPAVX2-LABEL: test_bitreverse_v64i8: 1715; XOPAVX2: # %bb.0: 1716; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1717; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1718; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1719; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1720; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 1721; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1722; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1723; 
XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1724; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 1725; XOPAVX2-NEXT: retq 1726; 1727; GFNISSE-LABEL: test_bitreverse_v64i8: 1728; GFNISSE: # %bb.0: 1729; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 1730; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 1731; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 1732; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 1733; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm3 1734; GFNISSE-NEXT: retq 1735; 1736; GFNIAVX1-LABEL: test_bitreverse_v64i8: 1737; GFNIAVX1: # %bb.0: 1738; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 1739; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 1740; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 1741; GFNIAVX1-NEXT: retq 1742; 1743; GFNIAVX2-LABEL: test_bitreverse_v64i8: 1744; GFNIAVX2: # %bb.0: 1745; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 1746; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 1747; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 1748; GFNIAVX2-NEXT: retq 1749; 1750; GFNIAVX512-LABEL: test_bitreverse_v64i8: 1751; GFNIAVX512: # %bb.0: 1752; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 1753; GFNIAVX512-NEXT: retq 1754 %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) 1755 ret <64 x i8> %b 1756} 1757 1758define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { 1759; SSE2-LABEL: test_bitreverse_v32i16: 1760; SSE2: # %bb.0: 1761; SSE2-NEXT: movdqa %xmm0, %xmm4 1762; SSE2-NEXT: psrlw $8, %xmm4 1763; SSE2-NEXT: psllw $8, %xmm0 1764; SSE2-NEXT: por %xmm4, %xmm0 1765; SSE2-NEXT: movdqa %xmm0, %xmm5 1766; SSE2-NEXT: psrlw $4, %xmm5 1767; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1768; SSE2-NEXT: pand %xmm4, %xmm5 1769; SSE2-NEXT: pand %xmm4, %xmm0 1770; SSE2-NEXT: psllw $4, %xmm0 1771; SSE2-NEXT: por %xmm5, %xmm0 1772; SSE2-NEXT: movdqa %xmm0, %xmm6 1773; SSE2-NEXT: psrlw $2, %xmm6 1774; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1775; SSE2-NEXT: pand %xmm5, %xmm6 1776; SSE2-NEXT: pand %xmm5, %xmm0 1777; SSE2-NEXT: psllw $2, %xmm0 1778; SSE2-NEXT: por %xmm6, %xmm0 1779; SSE2-NEXT: movdqa %xmm0, %xmm7 1780; SSE2-NEXT: psrlw $1, %xmm7 1781; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1782; SSE2-NEXT: pand %xmm6, %xmm7 1783; SSE2-NEXT: pand %xmm6, %xmm0 1784; SSE2-NEXT: paddb %xmm0, %xmm0 1785; SSE2-NEXT: por %xmm7, %xmm0 1786; SSE2-NEXT: movdqa %xmm1, %xmm7 1787; SSE2-NEXT: psrlw $8, %xmm7 1788; SSE2-NEXT: psllw $8, %xmm1 1789; SSE2-NEXT: por %xmm7, %xmm1 1790; SSE2-NEXT: movdqa %xmm1, %xmm7 1791; SSE2-NEXT: psrlw $4, %xmm7 1792; SSE2-NEXT: pand %xmm4, %xmm7 1793; SSE2-NEXT: pand %xmm4, %xmm1 1794; SSE2-NEXT: psllw $4, %xmm1 1795; SSE2-NEXT: por %xmm7, %xmm1 1796; SSE2-NEXT: movdqa %xmm1, %xmm7 1797; SSE2-NEXT: psrlw $2, %xmm7 1798; SSE2-NEXT: pand %xmm5, %xmm7 1799; SSE2-NEXT: pand %xmm5, %xmm1 1800; SSE2-NEXT: psllw $2, %xmm1 1801; SSE2-NEXT: por %xmm7, %xmm1 1802; SSE2-NEXT: movdqa %xmm1, %xmm7 1803; SSE2-NEXT: psrlw $1, %xmm7 1804; SSE2-NEXT: pand %xmm6, %xmm7 1805; SSE2-NEXT: pand %xmm6, %xmm1 1806; SSE2-NEXT: paddb %xmm1, %xmm1 1807; SSE2-NEXT: por %xmm7, %xmm1 1808; SSE2-NEXT: movdqa %xmm2, %xmm7 1809; SSE2-NEXT: psrlw $8, %xmm7 1810; SSE2-NEXT: psllw $8, 
%xmm2 1811; SSE2-NEXT: por %xmm7, %xmm2 1812; SSE2-NEXT: movdqa %xmm2, %xmm7 1813; SSE2-NEXT: psrlw $4, %xmm7 1814; SSE2-NEXT: pand %xmm4, %xmm7 1815; SSE2-NEXT: pand %xmm4, %xmm2 1816; SSE2-NEXT: psllw $4, %xmm2 1817; SSE2-NEXT: por %xmm7, %xmm2 1818; SSE2-NEXT: movdqa %xmm2, %xmm7 1819; SSE2-NEXT: psrlw $2, %xmm7 1820; SSE2-NEXT: pand %xmm5, %xmm7 1821; SSE2-NEXT: pand %xmm5, %xmm2 1822; SSE2-NEXT: psllw $2, %xmm2 1823; SSE2-NEXT: por %xmm7, %xmm2 1824; SSE2-NEXT: movdqa %xmm2, %xmm7 1825; SSE2-NEXT: psrlw $1, %xmm7 1826; SSE2-NEXT: pand %xmm6, %xmm7 1827; SSE2-NEXT: pand %xmm6, %xmm2 1828; SSE2-NEXT: paddb %xmm2, %xmm2 1829; SSE2-NEXT: por %xmm7, %xmm2 1830; SSE2-NEXT: movdqa %xmm3, %xmm7 1831; SSE2-NEXT: psrlw $8, %xmm7 1832; SSE2-NEXT: psllw $8, %xmm3 1833; SSE2-NEXT: por %xmm7, %xmm3 1834; SSE2-NEXT: movdqa %xmm3, %xmm7 1835; SSE2-NEXT: psrlw $4, %xmm7 1836; SSE2-NEXT: pand %xmm4, %xmm7 1837; SSE2-NEXT: pand %xmm4, %xmm3 1838; SSE2-NEXT: psllw $4, %xmm3 1839; SSE2-NEXT: por %xmm7, %xmm3 1840; SSE2-NEXT: movdqa %xmm3, %xmm4 1841; SSE2-NEXT: psrlw $2, %xmm4 1842; SSE2-NEXT: pand %xmm5, %xmm4 1843; SSE2-NEXT: pand %xmm5, %xmm3 1844; SSE2-NEXT: psllw $2, %xmm3 1845; SSE2-NEXT: por %xmm4, %xmm3 1846; SSE2-NEXT: movdqa %xmm3, %xmm4 1847; SSE2-NEXT: psrlw $1, %xmm4 1848; SSE2-NEXT: pand %xmm6, %xmm4 1849; SSE2-NEXT: pand %xmm6, %xmm3 1850; SSE2-NEXT: paddb %xmm3, %xmm3 1851; SSE2-NEXT: por %xmm4, %xmm3 1852; SSE2-NEXT: retq 1853; 1854; SSSE3-LABEL: test_bitreverse_v32i16: 1855; SSSE3: # %bb.0: 1856; SSSE3-NEXT: movdqa %xmm1, %xmm5 1857; SSSE3-NEXT: movdqa %xmm0, %xmm1 1858; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1859; SSSE3-NEXT: pshufb %xmm8, %xmm1 1860; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1861; SSSE3-NEXT: movdqa %xmm1, %xmm0 1862; SSSE3-NEXT: pand %xmm7, %xmm0 1863; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1864; SSSE3-NEXT: movdqa %xmm6, %xmm9 1865; SSSE3-NEXT: pshufb %xmm0, %xmm9 1866; SSSE3-NEXT: psrlw $4, %xmm1 1867; SSSE3-NEXT: pand %xmm7, %xmm1 1868; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1869; SSSE3-NEXT: movdqa %xmm4, %xmm0 1870; SSSE3-NEXT: pshufb %xmm1, %xmm0 1871; SSSE3-NEXT: por %xmm9, %xmm0 1872; SSSE3-NEXT: pshufb %xmm8, %xmm5 1873; SSSE3-NEXT: movdqa %xmm5, %xmm1 1874; SSSE3-NEXT: pand %xmm7, %xmm1 1875; SSSE3-NEXT: movdqa %xmm6, %xmm9 1876; SSSE3-NEXT: pshufb %xmm1, %xmm9 1877; SSSE3-NEXT: psrlw $4, %xmm5 1878; SSSE3-NEXT: pand %xmm7, %xmm5 1879; SSSE3-NEXT: movdqa %xmm4, %xmm1 1880; SSSE3-NEXT: pshufb %xmm5, %xmm1 1881; SSSE3-NEXT: por %xmm9, %xmm1 1882; SSSE3-NEXT: pshufb %xmm8, %xmm2 1883; SSSE3-NEXT: movdqa %xmm2, %xmm5 1884; SSSE3-NEXT: pand %xmm7, %xmm5 1885; SSSE3-NEXT: movdqa %xmm6, %xmm9 1886; SSSE3-NEXT: pshufb %xmm5, %xmm9 1887; SSSE3-NEXT: psrlw $4, %xmm2 1888; SSSE3-NEXT: pand %xmm7, %xmm2 1889; SSSE3-NEXT: movdqa %xmm4, %xmm5 1890; SSSE3-NEXT: pshufb %xmm2, %xmm5 1891; SSSE3-NEXT: por %xmm9, %xmm5 1892; SSSE3-NEXT: pshufb %xmm8, %xmm3 1893; SSSE3-NEXT: movdqa %xmm3, %xmm2 1894; SSSE3-NEXT: pand %xmm7, %xmm2 1895; SSSE3-NEXT: pshufb %xmm2, %xmm6 1896; SSSE3-NEXT: psrlw $4, %xmm3 1897; SSSE3-NEXT: pand %xmm7, %xmm3 1898; SSSE3-NEXT: pshufb %xmm3, %xmm4 1899; SSSE3-NEXT: por %xmm6, %xmm4 1900; SSSE3-NEXT: movdqa %xmm5, %xmm2 1901; SSSE3-NEXT: movdqa %xmm4, %xmm3 1902; SSSE3-NEXT: retq 1903; 1904; AVX1-LABEL: test_bitreverse_v32i16: 1905; AVX1: # %bb.0: 1906; AVX1-NEXT: vextractf128 $1, %ymm0, 
%xmm2 1907; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1908; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1909; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1910; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 1911; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1912; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1913; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1914; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1915; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1916; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 1917; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 1918; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1919; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 1920; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1921; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1922; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 1923; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 1924; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 1925; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1926; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1927; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1928; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 1929; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1930; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1931; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1932; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 1933; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 1934; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1935; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 1936; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 1937; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1938; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 1939; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 1940; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 1941; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1942; AVX1-NEXT: retq 1943; 1944; AVX2-LABEL: test_bitreverse_v32i16: 1945; AVX2: # %bb.0: 1946; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1947; AVX2-NEXT: # ymm2 = mem[0,1,0,1] 1948; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1949; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1950; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 1951; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1952; AVX2-NEXT: # ymm5 = mem[0,1,0,1] 1953; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 1954; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1955; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 1956; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1957; AVX2-NEXT: # ymm6 = mem[0,1,0,1] 1958; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 1959; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 1960; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 1961; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 1962; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 1963; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 1964; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 1965; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 1966; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 1967; AVX2-NEXT: retq 1968; 1969; AVX512F-LABEL: test_bitreverse_v32i16: 1970; AVX512F: # %bb.0: 1971; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1972; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1973; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] 1974; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 1975; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1976; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 1977; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1978; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] 1979; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 1980; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1981; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 1982; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 1983; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 1984; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 1985; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 1986; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1987; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] 1988; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 1989; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 1990; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 1991; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 1992; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1993; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 1994; AVX512F-NEXT: retq 1995; 1996; AVX512BW-LABEL: test_bitreverse_v32i16: 1997; AVX512BW: # %bb.0: 1998; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] 1999; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2000; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2001; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2002; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2003; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2004; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2005; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2006; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2007; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2008; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2009; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2010; AVX512BW-NEXT: retq 2011; 2012; XOPAVX1-LABEL: test_bitreverse_v32i16: 2013; XOPAVX1: # %bb.0: 2014; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2015; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 2016; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2017; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2018; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2019; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2020; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2021; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2022; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2023; XOPAVX1-NEXT: retq 2024; 2025; XOPAVX2-LABEL: test_bitreverse_v32i16: 2026; XOPAVX2: # %bb.0: 2027; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2028; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 2029; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 
2030; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2031; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2032; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2033; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2034; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2035; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2036; XOPAVX2-NEXT: retq 2037; 2038; GFNISSE-LABEL: test_bitreverse_v32i16: 2039; GFNISSE: # %bb.0: 2040; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2041; GFNISSE-NEXT: pshufb %xmm4, %xmm0 2042; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 2043; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 2044; GFNISSE-NEXT: pshufb %xmm4, %xmm1 2045; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 2046; GFNISSE-NEXT: pshufb %xmm4, %xmm2 2047; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 2048; GFNISSE-NEXT: pshufb %xmm4, %xmm3 2049; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 2050; GFNISSE-NEXT: retq 2051; 2052; GFNIAVX1-LABEL: test_bitreverse_v32i16: 2053; GFNIAVX1: # %bb.0: 2054; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2055; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2056; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2057; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2058; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2059; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 2060; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 2061; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 2062; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 2063; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2064; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 2065; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 2066; GFNIAVX1-NEXT: retq 2067; 2068; GFNIAVX2-LABEL: test_bitreverse_v32i16: 2069; GFNIAVX2: # %bb.0: 2070; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2071; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1] 2072; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2073; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 2074; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 2075; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2076; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2077; GFNIAVX2-NEXT: retq 2078; 2079; GFNIAVX512F-LABEL: test_bitreverse_v32i16: 2080; GFNIAVX512F: # %bb.0: 2081; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2082; GFNIAVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2083; GFNIAVX512F-NEXT: # ymm2 = mem[0,1,0,1] 2084; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2085; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2086; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2087; GFNIAVX512F-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 2088; GFNIAVX512F-NEXT: retq 2089; 2090; GFNIAVX512BW-LABEL: test_bitreverse_v32i16: 2091; GFNIAVX512BW: # %bb.0: 2092; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] 2093; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 2094; GFNIAVX512BW-NEXT: retq 2095 %b = call <32 x i16> 
@llvm.bitreverse.v32i16(<32 x i16> %a) 2096 ret <32 x i16> %b 2097} 2098 2099define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { 2100; SSE2-LABEL: test_bitreverse_v16i32: 2101; SSE2: # %bb.0: 2102; SSE2-NEXT: pxor %xmm4, %xmm4 2103; SSE2-NEXT: movdqa %xmm0, %xmm5 2104; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 2105; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7] 2106; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] 2107; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 2108; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 2109; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 2110; SSE2-NEXT: packuswb %xmm5, %xmm0 2111; SSE2-NEXT: movdqa %xmm0, %xmm6 2112; SSE2-NEXT: psrlw $4, %xmm6 2113; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2114; SSE2-NEXT: pand %xmm5, %xmm6 2115; SSE2-NEXT: pand %xmm5, %xmm0 2116; SSE2-NEXT: psllw $4, %xmm0 2117; SSE2-NEXT: por %xmm6, %xmm0 2118; SSE2-NEXT: movdqa %xmm0, %xmm7 2119; SSE2-NEXT: psrlw $2, %xmm7 2120; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 2121; SSE2-NEXT: pand %xmm6, %xmm7 2122; SSE2-NEXT: pand %xmm6, %xmm0 2123; SSE2-NEXT: psllw $2, %xmm0 2124; SSE2-NEXT: por %xmm7, %xmm0 2125; SSE2-NEXT: movdqa %xmm0, %xmm8 2126; SSE2-NEXT: psrlw $1, %xmm8 2127; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 2128; SSE2-NEXT: pand %xmm7, %xmm8 2129; SSE2-NEXT: pand %xmm7, %xmm0 2130; SSE2-NEXT: paddb %xmm0, %xmm0 2131; SSE2-NEXT: por %xmm8, %xmm0 2132; SSE2-NEXT: movdqa %xmm1, %xmm8 2133; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] 2134; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7] 2135; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4] 2136; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 2137; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 2138; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 2139; SSE2-NEXT: packuswb %xmm8, %xmm1 2140; SSE2-NEXT: movdqa %xmm1, %xmm8 2141; SSE2-NEXT: psrlw $4, %xmm8 2142; SSE2-NEXT: pand %xmm5, %xmm8 2143; SSE2-NEXT: pand %xmm5, %xmm1 2144; SSE2-NEXT: psllw $4, %xmm1 2145; SSE2-NEXT: por %xmm8, %xmm1 2146; SSE2-NEXT: movdqa %xmm1, %xmm8 2147; SSE2-NEXT: psrlw $2, %xmm8 2148; SSE2-NEXT: pand %xmm6, %xmm8 2149; SSE2-NEXT: pand %xmm6, %xmm1 2150; SSE2-NEXT: psllw $2, %xmm1 2151; SSE2-NEXT: por %xmm8, %xmm1 2152; SSE2-NEXT: movdqa %xmm1, %xmm8 2153; SSE2-NEXT: psrlw $1, %xmm8 2154; SSE2-NEXT: pand %xmm7, %xmm8 2155; SSE2-NEXT: pand %xmm7, %xmm1 2156; SSE2-NEXT: paddb %xmm1, %xmm1 2157; SSE2-NEXT: por %xmm8, %xmm1 2158; SSE2-NEXT: movdqa %xmm2, %xmm8 2159; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] 2160; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7] 2161; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4] 2162; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 2163; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 2164; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 2165; SSE2-NEXT: packuswb %xmm8, %xmm2 2166; SSE2-NEXT: movdqa %xmm2, %xmm8 2167; SSE2-NEXT: psrlw $4, %xmm8 2168; SSE2-NEXT: pand %xmm5, %xmm8 2169; SSE2-NEXT: pand %xmm5, %xmm2 2170; SSE2-NEXT: psllw $4, %xmm2 2171; SSE2-NEXT: por %xmm8, %xmm2 2172; SSE2-NEXT: movdqa %xmm2, %xmm8 2173; SSE2-NEXT: psrlw $2, %xmm8 2174; SSE2-NEXT: pand %xmm6, %xmm8 2175; SSE2-NEXT: pand %xmm6, %xmm2 2176; SSE2-NEXT: psllw $2, %xmm2 2177; SSE2-NEXT: por %xmm8, %xmm2 2178; SSE2-NEXT: movdqa %xmm2, %xmm8 2179; SSE2-NEXT: psrlw $1, %xmm8 2180; SSE2-NEXT: pand %xmm7, %xmm8 2181; SSE2-NEXT: pand %xmm7, %xmm2 2182; SSE2-NEXT: paddb %xmm2, %xmm2 2183; SSE2-NEXT: por %xmm8, %xmm2 2184; SSE2-NEXT: movdqa %xmm3, %xmm8 2185; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] 2186; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7] 2187; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4] 2188; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 2189; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 2190; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 2191; SSE2-NEXT: packuswb %xmm8, %xmm3 2192; SSE2-NEXT: movdqa %xmm3, %xmm4 2193; SSE2-NEXT: psrlw $4, %xmm4 2194; SSE2-NEXT: pand %xmm5, %xmm4 2195; SSE2-NEXT: pand %xmm5, %xmm3 2196; SSE2-NEXT: psllw $4, %xmm3 2197; SSE2-NEXT: por %xmm4, %xmm3 2198; SSE2-NEXT: movdqa %xmm3, %xmm4 2199; SSE2-NEXT: psrlw $2, %xmm4 2200; SSE2-NEXT: pand %xmm6, %xmm4 2201; SSE2-NEXT: pand %xmm6, %xmm3 2202; SSE2-NEXT: psllw $2, %xmm3 2203; SSE2-NEXT: por %xmm4, %xmm3 2204; SSE2-NEXT: movdqa %xmm3, %xmm4 2205; SSE2-NEXT: psrlw $1, %xmm4 2206; SSE2-NEXT: pand %xmm7, %xmm4 2207; SSE2-NEXT: pand %xmm7, %xmm3 2208; SSE2-NEXT: paddb %xmm3, %xmm3 2209; SSE2-NEXT: por %xmm4, %xmm3 2210; SSE2-NEXT: retq 2211; 2212; SSSE3-LABEL: test_bitreverse_v16i32: 2213; SSSE3: # %bb.0: 2214; SSSE3-NEXT: movdqa %xmm1, %xmm5 2215; SSSE3-NEXT: movdqa %xmm0, %xmm1 2216; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2217; SSSE3-NEXT: pshufb %xmm8, %xmm1 2218; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2219; SSSE3-NEXT: movdqa %xmm1, %xmm0 2220; SSSE3-NEXT: pand %xmm7, %xmm0 2221; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2222; SSSE3-NEXT: movdqa %xmm6, %xmm9 2223; SSSE3-NEXT: pshufb %xmm0, %xmm9 2224; SSSE3-NEXT: psrlw $4, %xmm1 2225; SSSE3-NEXT: pand %xmm7, %xmm1 2226; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2227; SSSE3-NEXT: movdqa %xmm4, %xmm0 2228; SSSE3-NEXT: pshufb %xmm1, %xmm0 2229; SSSE3-NEXT: por %xmm9, %xmm0 2230; SSSE3-NEXT: pshufb %xmm8, %xmm5 2231; SSSE3-NEXT: movdqa %xmm5, %xmm1 2232; SSSE3-NEXT: pand %xmm7, %xmm1 2233; SSSE3-NEXT: movdqa %xmm6, %xmm9 2234; SSSE3-NEXT: pshufb %xmm1, %xmm9 2235; SSSE3-NEXT: psrlw $4, %xmm5 2236; SSSE3-NEXT: pand %xmm7, %xmm5 2237; SSSE3-NEXT: movdqa %xmm4, %xmm1 2238; SSSE3-NEXT: pshufb %xmm5, %xmm1 2239; SSSE3-NEXT: por %xmm9, %xmm1 2240; SSSE3-NEXT: pshufb %xmm8, %xmm2 2241; SSSE3-NEXT: movdqa %xmm2, 
%xmm5 2242; SSSE3-NEXT: pand %xmm7, %xmm5 2243; SSSE3-NEXT: movdqa %xmm6, %xmm9 2244; SSSE3-NEXT: pshufb %xmm5, %xmm9 2245; SSSE3-NEXT: psrlw $4, %xmm2 2246; SSSE3-NEXT: pand %xmm7, %xmm2 2247; SSSE3-NEXT: movdqa %xmm4, %xmm5 2248; SSSE3-NEXT: pshufb %xmm2, %xmm5 2249; SSSE3-NEXT: por %xmm9, %xmm5 2250; SSSE3-NEXT: pshufb %xmm8, %xmm3 2251; SSSE3-NEXT: movdqa %xmm3, %xmm2 2252; SSSE3-NEXT: pand %xmm7, %xmm2 2253; SSSE3-NEXT: pshufb %xmm2, %xmm6 2254; SSSE3-NEXT: psrlw $4, %xmm3 2255; SSSE3-NEXT: pand %xmm7, %xmm3 2256; SSSE3-NEXT: pshufb %xmm3, %xmm4 2257; SSSE3-NEXT: por %xmm6, %xmm4 2258; SSSE3-NEXT: movdqa %xmm5, %xmm2 2259; SSSE3-NEXT: movdqa %xmm4, %xmm3 2260; SSSE3-NEXT: retq 2261; 2262; AVX1-LABEL: test_bitreverse_v16i32: 2263; AVX1: # %bb.0: 2264; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2265; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2266; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2267; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2268; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2269; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2270; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2271; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2272; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2273; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2274; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2275; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2276; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2277; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 2278; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2279; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2280; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2281; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 2282; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 2283; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2284; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2285; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2286; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2287; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2288; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2289; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2290; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2291; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2292; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2293; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 2294; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2295; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2296; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2297; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 2298; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 2299; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2300; AVX1-NEXT: retq 2301; 2302; AVX2-LABEL: test_bitreverse_v16i32: 2303; AVX2: # %bb.0: 2304; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2305; AVX2-NEXT: # ymm2 = mem[0,1,0,1] 2306; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2307; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2308; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 2309; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2310; AVX2-NEXT: # ymm5 = mem[0,1,0,1] 2311; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2312; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2313; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2314; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2315; AVX2-NEXT: # ymm6 = mem[0,1,0,1] 2316; AVX2-NEXT: vpshufb %ymm0, %ymm6, 
%ymm0 2317; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 2318; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2319; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 2320; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2321; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2322; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2323; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2324; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 2325; AVX2-NEXT: retq 2326; 2327; AVX512F-LABEL: test_bitreverse_v16i32: 2328; AVX512F: # %bb.0: 2329; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2330; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2331; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] 2332; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2333; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2334; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 2335; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2336; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] 2337; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2338; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2339; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 2340; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2341; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 2342; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 2343; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 2344; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2345; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] 2346; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 2347; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 2348; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 2349; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 2350; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2351; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 2352; AVX512F-NEXT: retq 2353; 2354; AVX512BW-LABEL: test_bitreverse_v16i32: 2355; AVX512BW: # %bb.0: 2356; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] 2357; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2358; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2359; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2360; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2361; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2362; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2363; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2364; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2365; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2366; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2367; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2368; AVX512BW-NEXT: retq 2369; 2370; XOPAVX1-LABEL: test_bitreverse_v16i32: 2371; XOPAVX1: # %bb.0: 2372; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 
2373; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 2374; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2375; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2376; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2377; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2378; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2379; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2380; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2381; XOPAVX1-NEXT: retq 2382; 2383; XOPAVX2-LABEL: test_bitreverse_v16i32: 2384; XOPAVX2: # %bb.0: 2385; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2386; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 2387; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2388; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2389; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2390; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2391; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2392; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2393; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2394; XOPAVX2-NEXT: retq 2395; 2396; GFNISSE-LABEL: test_bitreverse_v16i32: 2397; GFNISSE: # %bb.0: 2398; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2399; GFNISSE-NEXT: pshufb %xmm4, %xmm0 2400; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 2401; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 2402; GFNISSE-NEXT: pshufb %xmm4, %xmm1 2403; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 2404; GFNISSE-NEXT: pshufb %xmm4, %xmm2 2405; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 2406; GFNISSE-NEXT: pshufb %xmm4, %xmm3 2407; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 2408; GFNISSE-NEXT: retq 2409; 2410; GFNIAVX1-LABEL: test_bitreverse_v16i32: 2411; GFNIAVX1: # %bb.0: 2412; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2413; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2414; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2415; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2416; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2417; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 2418; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 2419; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 2420; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 2421; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2422; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 2423; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 2424; GFNIAVX1-NEXT: retq 2425; 2426; GFNIAVX2-LABEL: test_bitreverse_v16i32: 2427; GFNIAVX2: # %bb.0: 2428; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2429; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1] 2430; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2431; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 2432; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 2433; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2434; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2435; GFNIAVX2-NEXT: retq 2436; 2437; GFNIAVX512F-LABEL: test_bitreverse_v16i32: 2438; GFNIAVX512F: # %bb.0: 2439; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2440; GFNIAVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2441; GFNIAVX512F-NEXT: # ymm2 = 
mem[0,1,0,1] 2442; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2443; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2444; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2445; GFNIAVX512F-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 2446; GFNIAVX512F-NEXT: retq 2447; 2448; GFNIAVX512BW-LABEL: test_bitreverse_v16i32: 2449; GFNIAVX512BW: # %bb.0: 2450; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] 2451; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 2452; GFNIAVX512BW-NEXT: retq 2453 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) 2454 ret <16 x i32> %b 2455} 2456 2457define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { 2458; SSE2-LABEL: test_bitreverse_v8i64: 2459; SSE2: # %bb.0: 2460; SSE2-NEXT: pxor %xmm4, %xmm4 2461; SSE2-NEXT: movdqa %xmm0, %xmm5 2462; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 2463; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] 2464; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7] 2465; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] 2466; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 2467; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 2468; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 2469; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 2470; SSE2-NEXT: packuswb %xmm5, %xmm0 2471; SSE2-NEXT: movdqa %xmm0, %xmm6 2472; SSE2-NEXT: psrlw $4, %xmm6 2473; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2474; SSE2-NEXT: pand %xmm5, %xmm6 2475; SSE2-NEXT: pand %xmm5, %xmm0 2476; SSE2-NEXT: psllw $4, %xmm0 2477; SSE2-NEXT: por %xmm6, %xmm0 2478; SSE2-NEXT: movdqa %xmm0, %xmm7 2479; SSE2-NEXT: psrlw $2, %xmm7 2480; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 2481; SSE2-NEXT: pand %xmm6, %xmm7 2482; SSE2-NEXT: pand %xmm6, %xmm0 2483; SSE2-NEXT: psllw $2, %xmm0 2484; SSE2-NEXT: por %xmm7, %xmm0 2485; SSE2-NEXT: movdqa %xmm0, %xmm8 2486; SSE2-NEXT: psrlw $1, %xmm8 2487; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 2488; SSE2-NEXT: pand %xmm7, %xmm8 2489; SSE2-NEXT: pand %xmm7, %xmm0 2490; SSE2-NEXT: paddb %xmm0, %xmm0 2491; SSE2-NEXT: por %xmm8, %xmm0 2492; SSE2-NEXT: movdqa %xmm1, %xmm8 2493; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] 2494; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,0,1] 2495; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7] 2496; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4] 2497; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 2498; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2499; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 2500; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 2501; SSE2-NEXT: packuswb %xmm8, %xmm1 2502; SSE2-NEXT: movdqa %xmm1, %xmm8 2503; SSE2-NEXT: psrlw $4, 
%xmm8 2504; SSE2-NEXT: pand %xmm5, %xmm8 2505; SSE2-NEXT: pand %xmm5, %xmm1 2506; SSE2-NEXT: psllw $4, %xmm1 2507; SSE2-NEXT: por %xmm8, %xmm1 2508; SSE2-NEXT: movdqa %xmm1, %xmm8 2509; SSE2-NEXT: psrlw $2, %xmm8 2510; SSE2-NEXT: pand %xmm6, %xmm8 2511; SSE2-NEXT: pand %xmm6, %xmm1 2512; SSE2-NEXT: psllw $2, %xmm1 2513; SSE2-NEXT: por %xmm8, %xmm1 2514; SSE2-NEXT: movdqa %xmm1, %xmm8 2515; SSE2-NEXT: psrlw $1, %xmm8 2516; SSE2-NEXT: pand %xmm7, %xmm8 2517; SSE2-NEXT: pand %xmm7, %xmm1 2518; SSE2-NEXT: paddb %xmm1, %xmm1 2519; SSE2-NEXT: por %xmm8, %xmm1 2520; SSE2-NEXT: movdqa %xmm2, %xmm8 2521; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] 2522; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,0,1] 2523; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7] 2524; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4] 2525; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 2526; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 2527; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 2528; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 2529; SSE2-NEXT: packuswb %xmm8, %xmm2 2530; SSE2-NEXT: movdqa %xmm2, %xmm8 2531; SSE2-NEXT: psrlw $4, %xmm8 2532; SSE2-NEXT: pand %xmm5, %xmm8 2533; SSE2-NEXT: pand %xmm5, %xmm2 2534; SSE2-NEXT: psllw $4, %xmm2 2535; SSE2-NEXT: por %xmm8, %xmm2 2536; SSE2-NEXT: movdqa %xmm2, %xmm8 2537; SSE2-NEXT: psrlw $2, %xmm8 2538; SSE2-NEXT: pand %xmm6, %xmm8 2539; SSE2-NEXT: pand %xmm6, %xmm2 2540; SSE2-NEXT: psllw $2, %xmm2 2541; SSE2-NEXT: por %xmm8, %xmm2 2542; SSE2-NEXT: movdqa %xmm2, %xmm8 2543; SSE2-NEXT: psrlw $1, %xmm8 2544; SSE2-NEXT: pand %xmm7, %xmm8 2545; SSE2-NEXT: pand %xmm7, %xmm2 2546; SSE2-NEXT: paddb %xmm2, %xmm2 2547; SSE2-NEXT: por %xmm8, %xmm2 2548; SSE2-NEXT: movdqa %xmm3, %xmm8 2549; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] 2550; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,0,1] 2551; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7] 2552; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4] 2553; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 2554; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 2555; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 2556; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 2557; SSE2-NEXT: packuswb %xmm8, %xmm3 2558; SSE2-NEXT: movdqa %xmm3, %xmm4 2559; SSE2-NEXT: psrlw $4, %xmm4 2560; SSE2-NEXT: pand %xmm5, %xmm4 2561; SSE2-NEXT: pand %xmm5, %xmm3 2562; SSE2-NEXT: psllw $4, %xmm3 2563; SSE2-NEXT: por %xmm4, %xmm3 2564; SSE2-NEXT: movdqa %xmm3, %xmm4 2565; SSE2-NEXT: psrlw $2, %xmm4 2566; SSE2-NEXT: pand %xmm6, %xmm4 2567; SSE2-NEXT: pand %xmm6, %xmm3 2568; SSE2-NEXT: psllw $2, %xmm3 2569; SSE2-NEXT: por %xmm4, %xmm3 2570; SSE2-NEXT: movdqa %xmm3, %xmm4 2571; SSE2-NEXT: psrlw $1, %xmm4 2572; SSE2-NEXT: pand %xmm7, %xmm4 2573; SSE2-NEXT: pand %xmm7, %xmm3 2574; SSE2-NEXT: paddb %xmm3, %xmm3 2575; SSE2-NEXT: por %xmm4, %xmm3 2576; SSE2-NEXT: retq 2577; 2578; SSSE3-LABEL: test_bitreverse_v8i64: 2579; SSSE3: # %bb.0: 2580; SSSE3-NEXT: movdqa %xmm1, %xmm5 2581; SSSE3-NEXT: movdqa %xmm0, 
%xmm1 2582; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2583; SSSE3-NEXT: pshufb %xmm8, %xmm1 2584; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2585; SSSE3-NEXT: movdqa %xmm1, %xmm0 2586; SSSE3-NEXT: pand %xmm7, %xmm0 2587; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2588; SSSE3-NEXT: movdqa %xmm6, %xmm9 2589; SSSE3-NEXT: pshufb %xmm0, %xmm9 2590; SSSE3-NEXT: psrlw $4, %xmm1 2591; SSSE3-NEXT: pand %xmm7, %xmm1 2592; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2593; SSSE3-NEXT: movdqa %xmm4, %xmm0 2594; SSSE3-NEXT: pshufb %xmm1, %xmm0 2595; SSSE3-NEXT: por %xmm9, %xmm0 2596; SSSE3-NEXT: pshufb %xmm8, %xmm5 2597; SSSE3-NEXT: movdqa %xmm5, %xmm1 2598; SSSE3-NEXT: pand %xmm7, %xmm1 2599; SSSE3-NEXT: movdqa %xmm6, %xmm9 2600; SSSE3-NEXT: pshufb %xmm1, %xmm9 2601; SSSE3-NEXT: psrlw $4, %xmm5 2602; SSSE3-NEXT: pand %xmm7, %xmm5 2603; SSSE3-NEXT: movdqa %xmm4, %xmm1 2604; SSSE3-NEXT: pshufb %xmm5, %xmm1 2605; SSSE3-NEXT: por %xmm9, %xmm1 2606; SSSE3-NEXT: pshufb %xmm8, %xmm2 2607; SSSE3-NEXT: movdqa %xmm2, %xmm5 2608; SSSE3-NEXT: pand %xmm7, %xmm5 2609; SSSE3-NEXT: movdqa %xmm6, %xmm9 2610; SSSE3-NEXT: pshufb %xmm5, %xmm9 2611; SSSE3-NEXT: psrlw $4, %xmm2 2612; SSSE3-NEXT: pand %xmm7, %xmm2 2613; SSSE3-NEXT: movdqa %xmm4, %xmm5 2614; SSSE3-NEXT: pshufb %xmm2, %xmm5 2615; SSSE3-NEXT: por %xmm9, %xmm5 2616; SSSE3-NEXT: pshufb %xmm8, %xmm3 2617; SSSE3-NEXT: movdqa %xmm3, %xmm2 2618; SSSE3-NEXT: pand %xmm7, %xmm2 2619; SSSE3-NEXT: pshufb %xmm2, %xmm6 2620; SSSE3-NEXT: psrlw $4, %xmm3 2621; SSSE3-NEXT: pand %xmm7, %xmm3 2622; SSSE3-NEXT: pshufb %xmm3, %xmm4 2623; SSSE3-NEXT: por %xmm6, %xmm4 2624; SSSE3-NEXT: movdqa %xmm5, %xmm2 2625; SSSE3-NEXT: movdqa %xmm4, %xmm3 2626; SSSE3-NEXT: retq 2627; 2628; AVX1-LABEL: test_bitreverse_v8i64: 2629; AVX1: # %bb.0: 2630; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2631; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2632; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2633; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2634; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2635; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2636; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2637; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2638; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2639; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2640; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2641; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2642; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2643; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 2644; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2645; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2646; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2647; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 2648; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 2649; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2650; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2651; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2652; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2653; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2654; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2655; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2656; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2657; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2658; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2659; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 2660; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2661; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2662; AVX1-NEXT: vpand %xmm4, %xmm1, 
%xmm1 2663; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 2664; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 2665; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2666; AVX1-NEXT: retq 2667; 2668; AVX2-LABEL: test_bitreverse_v8i64: 2669; AVX2: # %bb.0: 2670; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2671; AVX2-NEXT: # ymm2 = mem[0,1,0,1] 2672; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2673; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2674; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 2675; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2676; AVX2-NEXT: # ymm5 = mem[0,1,0,1] 2677; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2678; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2679; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2680; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2681; AVX2-NEXT: # ymm6 = mem[0,1,0,1] 2682; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2683; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 2684; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2685; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 2686; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2687; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2688; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2689; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2690; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 2691; AVX2-NEXT: retq 2692; 2693; AVX512F-LABEL: test_bitreverse_v8i64: 2694; AVX512F: # %bb.0: 2695; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2696; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2697; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] 2698; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2699; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2700; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 2701; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2702; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] 2703; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2704; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2705; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 2706; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2707; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 2708; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 2709; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 2710; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2711; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] 2712; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 2713; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 2714; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 2715; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 2716; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2717; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 2718; AVX512F-NEXT: retq 2719; 2720; AVX512BW-LABEL: test_bitreverse_v8i64: 2721; AVX512BW: # %bb.0: 2722; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56] 2723; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2724; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2725; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2726; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2727; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2728; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2729; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2730; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2731; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2732; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2733; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2734; AVX512BW-NEXT: retq 2735; 2736; XOPAVX1-LABEL: test_bitreverse_v8i64: 2737; XOPAVX1: # %bb.0: 2738; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2739; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 2740; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2741; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2742; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2743; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2744; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2745; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2746; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2747; XOPAVX1-NEXT: retq 2748; 2749; XOPAVX2-LABEL: test_bitreverse_v8i64: 2750; XOPAVX2: # %bb.0: 2751; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2752; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 2753; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2754; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2755; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2756; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2757; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2758; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2759; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2760; XOPAVX2-NEXT: retq 2761; 2762; GFNISSE-LABEL: test_bitreverse_v8i64: 2763; GFNISSE: # %bb.0: 2764; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2765; GFNISSE-NEXT: pshufb %xmm4, %xmm0 2766; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 2767; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 2768; GFNISSE-NEXT: pshufb %xmm4, %xmm1 2769; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 2770; GFNISSE-NEXT: pshufb %xmm4, %xmm2 2771; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 2772; GFNISSE-NEXT: pshufb %xmm4, %xmm3 2773; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 2774; GFNISSE-NEXT: retq 2775; 2776; GFNIAVX1-LABEL: test_bitreverse_v8i64: 2777; GFNIAVX1: # %bb.0: 2778; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2779; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2780; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2781; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2782; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2783; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 2784; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, 
%ymm0, %ymm0 2785; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 2786; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 2787; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2788; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 2789; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 2790; GFNIAVX1-NEXT: retq 2791; 2792; GFNIAVX2-LABEL: test_bitreverse_v8i64: 2793; GFNIAVX2: # %bb.0: 2794; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2795; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1] 2796; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2797; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 2798; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 2799; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2800; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 2801; GFNIAVX2-NEXT: retq 2802; 2803; GFNIAVX512F-LABEL: test_bitreverse_v8i64: 2804; GFNIAVX512F: # %bb.0: 2805; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2806; GFNIAVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2807; GFNIAVX512F-NEXT: # ymm2 = mem[0,1,0,1] 2808; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2809; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2810; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2811; GFNIAVX512F-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 2812; GFNIAVX512F-NEXT: retq 2813; 2814; GFNIAVX512BW-LABEL: test_bitreverse_v8i64: 2815; GFNIAVX512BW: # %bb.0: 2816; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56] 2817; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 2818; GFNIAVX512BW-NEXT: retq 2819 %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) 2820 ret <8 x i64> %b 2821} 2822 2823; 2824; Constant Folding 2825; 2826 2827define i32 @fold_bitreverse_i32() nounwind { 2828; ALL-LABEL: fold_bitreverse_i32: 2829; ALL: # %bb.0: 2830; ALL-NEXT: movl $16711935, %eax # imm = 0xFF00FF 2831; ALL-NEXT: retq 2832 %b = call i32 @llvm.bitreverse.i32(i32 4278255360) 2833 ret i32 %b 2834} 2835 2836define <16 x i8> @fold_bitreverse_v16i8() nounwind { 2837; SSE-LABEL: fold_bitreverse_v16i8: 2838; SSE: # %bb.0: 2839; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 2840; SSE-NEXT: retq 2841; 2842; AVX-LABEL: fold_bitreverse_v16i8: 2843; AVX: # %bb.0: 2844; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 2845; AVX-NEXT: retq 2846; 2847; XOP-LABEL: fold_bitreverse_v16i8: 2848; XOP: # %bb.0: 2849; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 2850; XOP-NEXT: retq 2851; 2852; GFNISSE-LABEL: fold_bitreverse_v16i8: 2853; GFNISSE: # %bb.0: 2854; GFNISSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 2855; GFNISSE-NEXT: retq 2856; 2857; GFNIAVX-LABEL: fold_bitreverse_v16i8: 2858; GFNIAVX: # %bb.0: 2859; GFNIAVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] 2860; GFNIAVX-NEXT: retq 2861 %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 
-15>) 2862 ret <16 x i8> %b 2863} 2864 2865define <16 x i16> @fold_bitreverse_v16i16() nounwind { 2866; SSE-LABEL: fold_bitreverse_v16i16: 2867; SSE: # %bb.0: 2868; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959] 2869; SSE-NEXT: movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863] 2870; SSE-NEXT: retq 2871; 2872; AVX-LABEL: fold_bitreverse_v16i16: 2873; AVX: # %bb.0: 2874; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 2875; AVX-NEXT: retq 2876; 2877; XOP-LABEL: fold_bitreverse_v16i16: 2878; XOP: # %bb.0: 2879; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 2880; XOP-NEXT: retq 2881; 2882; GFNISSE-LABEL: fold_bitreverse_v16i16: 2883; GFNISSE: # %bb.0: 2884; GFNISSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959] 2885; GFNISSE-NEXT: movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863] 2886; GFNISSE-NEXT: retq 2887; 2888; GFNIAVX-LABEL: fold_bitreverse_v16i16: 2889; GFNIAVX: # %bb.0: 2890; GFNIAVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] 2891; GFNIAVX-NEXT: retq 2892 %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>) 2893 ret <16 x i16> %b 2894} 2895 2896define <16 x i32> @fold_bitreverse_v16i32() nounwind { 2897; SSE-LABEL: fold_bitreverse_v16i32: 2898; SSE: # %bb.0: 2899; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471] 2900; SSE-NEXT: movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559] 2901; SSE-NEXT: movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015] 2902; SSE-NEXT: movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103] 2903; SSE-NEXT: retq 2904; 2905; AVX1-LABEL: fold_bitreverse_v16i32: 2906; AVX1: # %bb.0: 2907; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 2908; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 2909; AVX1-NEXT: retq 2910; 2911; AVX2-LABEL: fold_bitreverse_v16i32: 2912; AVX2: # %bb.0: 2913; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 2914; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 2915; AVX2-NEXT: retq 2916; 2917; AVX512-LABEL: fold_bitreverse_v16i32: 2918; AVX512: # %bb.0: 2919; AVX512-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 2920; AVX512-NEXT: retq 2921; 2922; XOP-LABEL: fold_bitreverse_v16i32: 2923; XOP: # %bb.0: 2924; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 2925; XOP-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 2926; XOP-NEXT: retq 2927; 2928; GFNISSE-LABEL: fold_bitreverse_v16i32: 2929; GFNISSE: # %bb.0: 2930; GFNISSE-NEXT: movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471] 2931; GFNISSE-NEXT: movaps {{.*#+}} xmm1 = 
[536870912,3758096383,1610612736,2684354559] 2932; GFNISSE-NEXT: movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015] 2933; GFNISSE-NEXT: movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103] 2934; GFNISSE-NEXT: retq 2935; 2936; GFNIAVX1-LABEL: fold_bitreverse_v16i32: 2937; GFNIAVX1: # %bb.0: 2938; GFNIAVX1-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 2939; GFNIAVX1-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 2940; GFNIAVX1-NEXT: retq 2941; 2942; GFNIAVX2-LABEL: fold_bitreverse_v16i32: 2943; GFNIAVX2: # %bb.0: 2944; GFNIAVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] 2945; GFNIAVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 2946; GFNIAVX2-NEXT: retq 2947; 2948; GFNIAVX512-LABEL: fold_bitreverse_v16i32: 2949; GFNIAVX512: # %bb.0: 2950; GFNIAVX512-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] 2951; GFNIAVX512-NEXT: retq 2952 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>) 2953 ret <16 x i32> %b 2954} 2955 2956declare i8 @llvm.bitreverse.i8(i8) readnone 2957declare i16 @llvm.bitreverse.i16(i16) readnone 2958declare i32 @llvm.bitreverse.i32(i32) readnone 2959declare i64 @llvm.bitreverse.i64(i64) readnone 2960 2961declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone 2962declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone 2963declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone 2964declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone 2965 2966declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) readnone 2967declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone 2968declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) readnone 2969declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone 2970 2971declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) readnone 2972declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone 2973declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone 2974declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>) readnone 2975
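
; The vector lowerings checked above follow a few strategies: GFNI targets
; reverse the bits of every byte with a single gf2p8affineqb against the
; [1,2,4,8,16,32,64,128] matrix (plus a pshufb to reverse byte order for
; elements wider than i8); SSSE3/AVX targets use pshufb nibble lookup tables;
; and plain SSE2 falls back to a shift-and-mask sequence that swaps nibbles
; (mask 15), bit pairs (mask 51) and adjacent bits (mask 85). The
; constant-folding checks exercise the same semantics at compile time, e.g.
; bitreverse(0xFF00FF00) == 0x00FF00FF (16711935).
;
; Illustrative scalar sketch of the shift-and-mask expansion. This helper is
; not covered by the generated checks and its name is arbitrary; it only
; spells out the byte-level swaps that the SSE2 vector code performs.
define i8 @bitreverse_i8_swar_sketch(i8 %a) nounwind {
  ; Swap the two nibbles: (a << 4) | (a >> 4).
  %lo4 = shl i8 %a, 4
  %hi4 = lshr i8 %a, 4
  %n = or i8 %lo4, %hi4
  ; Swap bit pairs within each nibble using the 0x33 (51) mask.
  %lo2m = and i8 %n, 51
  %lo2 = shl i8 %lo2m, 2
  %hi2s = lshr i8 %n, 2
  %hi2 = and i8 %hi2s, 51
  %p = or i8 %lo2, %hi2
  ; Swap adjacent bits using the 0x55 (85) mask.
  %lo1m = and i8 %p, 85
  %lo1 = shl i8 %lo1m, 1
  %hi1s = lshr i8 %p, 1
  %hi1 = and i8 %hi1s, 85
  %r = or i8 %lo1, %hi1
  ret i8 %r
}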