; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2NOBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX512BW

;
; sdiv by 7
;

define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_div7_2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE2-NEXT: imulq %rcx
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: movq %rdx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: imulq %rcx
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE41-NEXT: imulq %rcx
; SSE41-NEXT: movq %rdx, %rax
; SSE41-NEXT: shrq $63, %rax
; SSE41-NEXT: sarq %rdx
; SSE41-NEXT: addq %rax, %rdx
; SSE41-NEXT: movq %rdx, %xmm1
; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: imulq %rcx
; SSE41-NEXT: movq %rdx, %rax
; SSE41-NEXT: shrq $63, %rax
; SSE41-NEXT: sarq %rdx
; SSE41-NEXT: addq %rax, %rdx
; SSE41-NEXT: movq %rdx, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: test_div7_2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX-NEXT: imulq %rcx
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: imulq %rcx
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
  %res = sdiv <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}
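
; Illustrative note (ours, not part of the autogenerated assertions): the
; 64-bit magic constant above satisfies 0x4924924924924925 = (2^65 + 3) / 7,
; so imulq leaves (n * M) >> 64 in %rdx, sarq %rdx completes the >> 65, and
; adding the sign bit extracted by shrq $63 rounds the quotient toward zero,
; giving q = n / 7 for every i64 n.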

define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm3
; SSE2-NEXT: psubd %xmm3, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $31, %xmm1
; SSE2-NEXT: psrad $2, %xmm0
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT: pmuldq %xmm2, %xmm1
; SSE41-NEXT: pmuldq %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: paddd %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $31, %xmm1
; SSE41-NEXT: psrad $2, %xmm0
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_div7_4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrld $31, %xmm0, %xmm1
; AVX2-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
  %res = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}
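
; Illustrative note (ours): the i32 magic is 2454267027 = 0x92492493 =
; (2^34 + 5) / 7, which only fits as a negative i32, so the standard fixup
; applies: multiply by M - 2^32 and add the dividend back (the paddd above)
; before shifting right by 2 (psrad) and adding the sign bit (psrld $31).
; SSE2 also has to build a signed high multiply out of the unsigned pmuludq,
; hence the pcmpgtd/pand/psubd corrections; SSE4.1 and AVX use pmuldq.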

define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_div7_8i16:
; SSE: # %bb.0:
; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [18725,18725,18725,18725,18725,18725,18725,18725]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $15, %xmm1
; SSE-NEXT: psraw $1, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_div7_8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18725,18725,18725,18725,18725,18725,18725,18725]
; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %res = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}
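
; Illustrative note (ours): 18725 = (2^17 + 3) / 7, so pmulhw yields
; (n * M) >> 16, psraw $1 completes the >> 17, and psrlw $15 supplies the
; sign bit that rounds the quotient toward zero.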

define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: test_div7_16i8:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; SSE-NEXT: pmulhw %xmm3, %xmm2
; SSE-NEXT: psrlw $8, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: pmulhw %xmm3, %xmm1
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: packuswb %xmm2, %xmm1
; SSE-NEXT: paddb %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $2, %xmm1
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: psrlw $7, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: paddb %xmm1, %xmm0
; SSE-NEXT: psubb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_div7_16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpmulhw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_div7_16i8:
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
;
; AVX512BW-LABEL: test_div7_16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %res = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}
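
; Illustrative note (ours): there is no byte multiply-high, so each half of
; the vector is widened to i16 and multiplied by 37632 = 0x9300, i.e. the i8
; magic -109 pre-shifted into the high byte; psrlw $8 then picks out the
; high product byte. Bytes also lack an arithmetic shift, so the >> 2 is
; emulated by a logical shift (psrlw $2 plus a pand with 0x3f in every byte)
; followed by the sign-extension identity (x ^ 32) - 32, with the rounding
; sign bit from psrlw $7 added in between.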

;
; sdiv by non-splat constant
;

define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: test_divconstant_16i8:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
; SSE-NEXT: psrlw $8, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [37632,33024,14592,26368,47872,11008,20224,37632]
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: packuswb %xmm2, %xmm1
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: paddb %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE-NEXT: psraw $8, %xmm1
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,64,128,32,64,128,128,64]
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE-NEXT: psraw $8, %xmm2
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,64,128,64,32,128,64,32]
; SSE-NEXT: psrlw $8, %xmm2
; SSE-NEXT: packuswb %xmm1, %xmm2
; SSE-NEXT: psrlw $7, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: paddb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_divconstant_16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [37632,33024,14592,26368,47872,11008,20224,37632]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,64,128,32,64,128,128,64]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [64,64,128,64,32,128,64,32]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_divconstant_16i8:
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [65427,65409,57,103,65467,43,79,65427,65427,79,43,65467,103,57,57,65427]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [64,64,128,64,32,128,64,32,32,64,128,32,64,128,128,64]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
;
; AVX512BW-LABEL: test_divconstant_16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2]
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [65427,65409,57,103,65467,43,79,65427,65427,79,43,65467,103,57,57,65427]
; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT: vpsravw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %res = sdiv <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10,i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9,i8 9, i8 7>
  ret <16 x i8> %res
}
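
; Illustrative note (ours): with distinct divisors every lane carries its
; own magic and shift. The pmulhw constants pack the per-lane magics, the
; pand mask adds the dividend back only in lanes whose magic is negative,
; and the per-lane arithmetic shifts are built from psraw $8 plus a pmullw
; by 2^(8-shift) followed by psrlw $8; AVX-512BW can use vpsravw with a
; per-lane shift vector instead.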

;
; srem by 7
;

define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_rem7_2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: imulq %rsi
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: leaq (,%rdx,8), %rax
; SSE2-NEXT: subq %rax, %rdx
; SSE2-NEXT: addq %rcx, %rdx
; SSE2-NEXT: movq %rdx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: imulq %rsi
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: leaq (,%rdx,8), %rax
; SSE2-NEXT: subq %rax, %rdx
; SSE2-NEXT: addq %rcx, %rdx
; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: pextrq $1, %xmm0, %rcx
; SSE41-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: imulq %rsi
; SSE41-NEXT: movq %rdx, %rax
; SSE41-NEXT: shrq $63, %rax
; SSE41-NEXT: sarq %rdx
; SSE41-NEXT: addq %rax, %rdx
; SSE41-NEXT: leaq (,%rdx,8), %rax
; SSE41-NEXT: subq %rax, %rdx
; SSE41-NEXT: addq %rcx, %rdx
; SSE41-NEXT: movq %rdx, %xmm1
; SSE41-NEXT: movq %xmm0, %rcx
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: imulq %rsi
; SSE41-NEXT: movq %rdx, %rax
; SSE41-NEXT: shrq $63, %rax
; SSE41-NEXT: sarq %rdx
; SSE41-NEXT: addq %rax, %rdx
; SSE41-NEXT: leaq (,%rdx,8), %rax
; SSE41-NEXT: subq %rax, %rdx
; SSE41-NEXT: addq %rcx, %rdx
; SSE41-NEXT: movq %rdx, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: test_rem7_2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpextrq $1, %xmm0, %rcx
; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %rcx, %rdx
; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: shrq $63, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %rcx, %rdx
; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
  %res = srem <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}
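
; Illustrative note (ours): srem reuses the sdiv sequence and then forms
; n - 7*q without a multiply: leaq (,%rdx,8) computes 8*q, the subq turns
; the quotient register into q - 8*q = -7*q, and the final addq of the
; saved dividend yields the remainder.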

define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_rem7_4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm3
; SSE2-NEXT: psubd %xmm3, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrld $31, %xmm1
; SSE2-NEXT: psrad $2, %xmm2
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pslld $3, %xmm1
; SSE2-NEXT: psubd %xmm1, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT: pmuldq %xmm2, %xmm1
; SSE41-NEXT: pmuldq %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: paddd %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psrld $31, %xmm1
; SSE41-NEXT: psrad $2, %xmm2
; SSE41-NEXT: paddd %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: pslld $3, %xmm1
; SSE41-NEXT: psubd %xmm1, %xmm2
; SSE41-NEXT: paddd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_rem7_4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpslld $3, %xmm1, %xmm2
; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
; AVX2-NEXT: vpsrad $2, %xmm1, %xmm1
; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpslld $3, %xmm1, %xmm2
; AVX2-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
  %res = srem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}
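
; Illustrative note (ours): the vector form of the same fixup: pslld $3 and
; psubd give q - 8*q = -7*q, which the final paddd folds into the dividend.
; A direct multiply by 7 would need pmulld (SSE4.1 and later), so the
; shift-and-subtract also works on plain SSE2.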

define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_rem7_8i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [18725,18725,18725,18725,18725,18725,18725,18725]
; SSE-NEXT: pmulhw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrlw $15, %xmm2
; SSE-NEXT: psraw $1, %xmm1
; SSE-NEXT: paddw %xmm2, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psllw $3, %xmm2
; SSE-NEXT: psubw %xmm2, %xmm1
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_rem7_8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [18725,18725,18725,18725,18725,18725,18725,18725]
; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2
; AVX-NEXT: vpsraw $1, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %res = srem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}

define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: test_rem7_16i8:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; SSE-NEXT: pmulhw %xmm3, %xmm2
; SSE-NEXT: psrlw $8, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: pmulhw %xmm3, %xmm1
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: packuswb %xmm2, %xmm1
; SSE-NEXT: paddb %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrlw $2, %xmm2
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE-NEXT: pxor %xmm3, %xmm2
; SSE-NEXT: psrlw $7, %xmm1
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: paddb %xmm2, %xmm1
; SSE-NEXT: psubb %xmm3, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psllw $3, %xmm2
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: psubb %xmm2, %xmm1
; SSE-NEXT: paddb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_rem7_16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpmulhw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_rem7_16i8:
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
;
; AVX512BW-LABEL: test_rem7_16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %res = srem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}
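
; Illustrative note (ours): for bytes even the 7*q step needs emulation,
; since there is no per-byte shift: psllw $3 followed by a pand with 0xf8 in
; every byte gives 8*q lane-wise modulo 256, and psubb plus the final paddb
; then produce n - 7*q, which is exact because the remainder only needs
; arithmetic modulo 256.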

;
; srem by non-splat constant
;

define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_remconstant_16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [37632,20224,11008,47872,26368,14592,14592,37632]
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,33024,14592,26368,47872,11008,20224,37632]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: packuswb %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,64,128,32,64,128,128,64]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [64,64,128,64,32,128,64,32]
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: packuswb %xmm2, %xmm3
; SSE2-NEXT: psrlw $7, %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: paddb %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [14,13,12,11,10,9,9,7]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [7,8,9,10,11,12,13,14]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: psubb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_remconstant_16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [37632,20224,11008,47872,26368,14592,14592,37632]
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,33024,14592,26368,47872,11008,20224,37632]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm1, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE41-NEXT: psraw $8, %xmm2
; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,64,128,32,64,128,128,64]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE41-NEXT: psraw $8, %xmm3
; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [64,64,128,64,32,128,64,32]
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: packuswb %xmm2, %xmm3
; SSE41-NEXT: psrlw $7, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: paddb %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; SSE41-NEXT: psllw $8, %xmm2
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: psubb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_remconstant_16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [37632,33024,14592,26368,47872,11008,20224,37632]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [32,64,128,32,64,128,128,64]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [64,64,128,64,32,128,64,32]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_remconstant_16i8:
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [65427,65409,57,103,65467,43,79,65427,65427,79,43,65467,103,57,57,65427]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX2NOBW-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm2
; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [64,64,128,64,32,128,64,32,32,64,128,32,64,128,128,64]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2NOBW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
;
; AVX512BW-LABEL: test_remconstant_16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2]
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [65427,65409,57,103,65467,43,79,65427,65427,79,43,65467,103,57,57,65427]
; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
; AVX512BW-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpmovsxbw %xmm2, %ymm3
; AVX512BW-NEXT: vpsravw %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpsrlw $7, %xmm2, %xmm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %res = srem <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10,i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9,i8 9, i8 7>
  ret <16 x i8> %res
}
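
; Illustrative note (ours): multiplying the per-lane quotients back by the
; non-uniform divisors also needs widening tricks. SSE2 unpacks to words and
; uses two pmullw with the divisor vectors before packing, while SSE4.1 and
; AVX1 split the divisors across even and odd byte positions and combine two
; pmaddubsw results with psllw $8 and por.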

; This test is just to show what a scalarized v16i8 division looks like.
define <16 x i8> @test_rem_variable_16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: test_rem_variable_16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm4
; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movsbl %ah, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem_variable_16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pextrb $1, %xmm1, %ecx
; SSE41-NEXT: pextrb $1, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %ecx
; SSE41-NEXT: movd %xmm1, %edx
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %dl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: movd %eax, %xmm2
; SSE41-NEXT: pinsrb $1, %ecx, %xmm2
; SSE41-NEXT: pextrb $2, %xmm1, %ecx
; SSE41-NEXT: pextrb $2, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $2, %eax, %xmm2
; SSE41-NEXT: pextrb $3, %xmm1, %ecx
; SSE41-NEXT: pextrb $3, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $3, %eax, %xmm2
; SSE41-NEXT: pextrb $4, %xmm1, %ecx
; SSE41-NEXT: pextrb $4, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $4, %eax, %xmm2
; SSE41-NEXT: pextrb $5, %xmm1, %ecx
; SSE41-NEXT: pextrb $5, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $5, %eax, %xmm2
; SSE41-NEXT: pextrb $6, %xmm1, %ecx
; SSE41-NEXT: pextrb $6, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $6, %eax, %xmm2
; SSE41-NEXT: pextrb $7, %xmm1, %ecx
; SSE41-NEXT: pextrb $7, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $7, %eax, %xmm2
; SSE41-NEXT: pextrb $8, %xmm1, %ecx
; SSE41-NEXT: pextrb $8, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $8, %eax, %xmm2
; SSE41-NEXT: pextrb $9, %xmm1, %ecx
; SSE41-NEXT: pextrb $9, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $9, %eax, %xmm2
; SSE41-NEXT: pextrb $10, %xmm1, %ecx
; SSE41-NEXT: pextrb $10, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $10, %eax, %xmm2
; SSE41-NEXT: pextrb $11, %xmm1, %ecx
; SSE41-NEXT: pextrb $11, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $11, %eax, %xmm2
; SSE41-NEXT: pextrb $12, %xmm1, %ecx
; SSE41-NEXT: pextrb $12, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $12, %eax, %xmm2
; SSE41-NEXT: pextrb $13, %xmm1, %ecx
; SSE41-NEXT: pextrb $13, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $13, %eax, %xmm2
; SSE41-NEXT: pextrb $14, %xmm1, %ecx
; SSE41-NEXT: pextrb $14, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $14, %eax, %xmm2
; SSE41-NEXT: pextrb $15, %xmm1, %ecx
; SSE41-NEXT: pextrb $15, %xmm0, %eax
; SSE41-NEXT: cbtw
; SSE41-NEXT: idivb %cl
; SSE41-NEXT: movsbl %ah, %eax
; SSE41-NEXT: pinsrb $15, %eax, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_rem_variable_16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpextrb $1, %xmm1, %ecx
; AVX-NEXT: vpextrb $1, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %ecx
; AVX-NEXT: vmovd %xmm1, %edx
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %dl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vmovd %eax, %xmm2
; AVX-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
; AVX-NEXT: vpextrb $2, %xmm1, %ecx
; AVX-NEXT: vpextrb $2, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $3, %xmm1, %ecx
; AVX-NEXT: vpextrb $3, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $4, %xmm1, %ecx
; AVX-NEXT: vpextrb $4, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $5, %xmm1, %ecx
; AVX-NEXT: vpextrb $5, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $6, %xmm1, %ecx
; AVX-NEXT: vpextrb $6, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $7, %xmm1, %ecx
; AVX-NEXT: vpextrb $7, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $8, %xmm1, %ecx
; AVX-NEXT: vpextrb $8, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $9, %xmm1, %ecx
; AVX-NEXT: vpextrb $9, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $10, %xmm1, %ecx
; AVX-NEXT: vpextrb $10, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $11, %xmm1, %ecx
; AVX-NEXT: vpextrb $11, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $12, %xmm1, %ecx
; AVX-NEXT: vpextrb $12, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $13, %xmm1, %ecx
; AVX-NEXT: vpextrb $13, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $14, %xmm1, %ecx
; AVX-NEXT: vpextrb $14, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
; AVX-NEXT: vpextrb $15, %xmm1, %ecx
; AVX-NEXT: vpextrb $15, %xmm0, %eax
; AVX-NEXT: cbtw
; AVX-NEXT: idivb %cl
; AVX-NEXT: movsbl %ah, %eax
; AVX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0
; AVX-NEXT: retq
  %res = srem <16 x i8> %a, %b
  ret <16 x i8> %res
}
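
; Illustrative note (ours): in the fully variable case each lane pair is
; divided with idivb, which leaves the quotient in %al and the remainder in
; %ah; cbtw sign-extends the dividend into %ax first, and movsbl %ah
; recovers the remainder. SSE2 lacks pextrb/pinsrb, so it round-trips both
; vectors through stack slots instead.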