; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512BW

define void @avg_v4i8(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v4i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <4 x i8>, ptr %a
  %2 = load <4 x i8>, ptr %b
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <4 x i32> %5, %4
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i8>
  store <4 x i8> %8, ptr undef, align 4
  ret void
}

define void @avg_v8i8(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <8 x i8>, ptr %a
  %2 = load <8 x i8>, ptr %b
  %3 = zext <8 x i8> %1 to <8 x i32>
  %4 = zext <8 x i8> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <8 x i32> %5, %4
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i8>
  store <8 x i8> %8, ptr undef, align 4
  ret void
}

define void @avg_v16i8(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgb (%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <16 x i8>, ptr %a
  %2 = load <16 x i8>, ptr %b
  %3 = zext <16 x i8> %1 to <16 x i32>
  %4 = zext <16 x i8> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <16 x i32> %5, %4
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i8>
  store <16 x i8> %8, ptr undef, align 4
  ret void
}

define void @avg_v24i8(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v24i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: pavgb (%rsi), %xmm0
; SSE2-NEXT: pavgb 16(%rsi), %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v24i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0
; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
; AVX1-NEXT: vmovq %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v24i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovq %xmm1, (%rax)
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v24i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmovq %xmm1, (%rax)
; AVX512-NEXT: vmovdqu %xmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = load <24 x i8>, ptr %a
  %2 = load <24 x i8>, ptr %b
  %3 = zext <24 x i8> %1 to <24 x i32>
  %4 = zext <24 x i8> %2 to <24 x i32>
  %5 = add nuw nsw <24 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <24 x i32> %5, %4
  %7 = lshr <24 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <24 x i32> %7 to <24 x i8>
  store <24 x i8> %8, ptr undef, align 4
  ret void
}

define void @avg_v32i8(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: pavgb (%rsi), %xmm0
; SSE2-NEXT: pavgb 16(%rsi), %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0
; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = load <32 x i8>, ptr %a
  %2 = load <32 x i8>, ptr %b
  %3 = zext <32 x i8> %1 to <32 x i32>
  %4 = zext <32 x i8> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <32 x i32> %5, %4
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i8>
  store <32 x i8> %8, ptr undef, align 4
  ret void
}

define void @avg_v48i8(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v48i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: movdqa 32(%rdi), %xmm2
; SSE2-NEXT: pavgb (%rsi), %xmm0
; SSE2-NEXT: pavgb 16(%rsi), %xmm1
; SSE2-NEXT: pavgb 32(%rsi), %xmm2
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v48i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0
; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
; AVX1-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vmovdqu %xmm2, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v48i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX2-NEXT: vpavgb 32(%rsi), %xmm1, %xmm1
; AVX2-NEXT: vmovdqu %xmm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v48i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512F-NEXT: vpavgb 32(%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vmovdqu %xmm1, (%rax)
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v48i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, (%rax)
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %1 = load <48 x i8>, ptr %a
  %2 = load <48 x i8>, ptr %b
  %3 = zext <48 x i8> %1 to <48 x i32>
  %4 = zext <48 x i8> %2 to <48 x i32>
  %5 = add nuw nsw <48 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <48 x i32> %5, %4
  %7 = lshr <48 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <48 x i32> %7 to <48 x i8>
  store <48 x i8> %8, ptr undef, align 4
  ret void
}

define void @avg_v64i8(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: movdqa 32(%rdi), %xmm2
; SSE2-NEXT: movdqa 48(%rdi), %xmm3
; SSE2-NEXT: pavgb (%rsi), %xmm0
; SSE2-NEXT: pavgb 16(%rsi), %xmm1
; SSE2-NEXT: pavgb 32(%rsi), %xmm2
; SSE2-NEXT: pavgb 48(%rsi), %xmm3
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0
; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
; AVX1-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2
; AVX1-NEXT: vpavgb 48(%rsi), %xmm3, %xmm3
; AVX1-NEXT: vmovdqu %xmm3, (%rax)
; AVX1-NEXT: vmovdqu %xmm2, (%rax)
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v64i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpavgb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpavgb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %1 = load <64 x i8>, ptr %a
  %2 = load <64 x i8>, ptr %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <64 x i32> %5, %4
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, ptr undef, align 4
  ret void
}

define void @avg_v4i16(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgw %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v4i16:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <4 x i16>, ptr %a
  %2 = load <4 x i16>, ptr %b
  %3 = zext <4 x i16> %1 to <4 x i32>
  %4 = zext <4 x i16> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <4 x i32> %5, %4
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i16>
  store <4 x i16> %8, ptr undef, align 4
  ret void
}

define void @avg_v8i16(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgw (%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <8 x i16>, ptr %a
  %2 = load <8 x i16>, ptr %b
  %3 = zext <8 x i16> %1 to <8 x i32>
  %4 = zext <8 x i16> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <8 x i32> %5, %4
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  store <8 x i16> %8, ptr undef, align 4
  ret void
}

define void @avg_v16i16(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: pavgw (%rsi), %xmm0
; SSE2-NEXT: pavgw 16(%rsi), %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0
; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = load <16 x i16>, ptr %a
  %2 = load <16 x i16>, ptr %b
  %3 = zext <16 x i16> %1 to <16 x i32>
  %4 = zext <16 x i16> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <16 x i32> %5, %4
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i16>
  store <16 x i16> %8, ptr undef, align 4
  ret void
}

define void @avg_v32i16(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v32i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: movdqa 32(%rdi), %xmm2
; SSE2-NEXT: movdqa 48(%rdi), %xmm3
; SSE2-NEXT: pavgw (%rsi), %xmm0
; SSE2-NEXT: pavgw 16(%rsi), %xmm1
; SSE2-NEXT: pavgw 32(%rsi), %xmm2
; SSE2-NEXT: pavgw 48(%rsi), %xmm3
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0
; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1
; AVX1-NEXT: vpavgw 32(%rsi), %xmm2, %xmm2
; AVX1-NEXT: vpavgw 48(%rsi), %xmm3, %xmm3
; AVX1-NEXT: vmovdqu %xmm3, (%rax)
; AVX1-NEXT: vmovdqu %xmm2, (%rax)
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %1 = load <32 x i16>, ptr %a
  %2 = load <32 x i16>, ptr %b
  %3 = zext <32 x i16> %1 to <32 x i32>
  %4 = zext <32 x i16> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <32 x i32> %5, %4
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i16>
  store <32 x i16> %8, ptr undef, align 4
  ret void
}

define void @avg_v40i16(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v40i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa 64(%rdi), %xmm0
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movdqa 16(%rdi), %xmm2
; SSE2-NEXT: movdqa 32(%rdi), %xmm3
; SSE2-NEXT: movdqa 48(%rdi), %xmm4
; SSE2-NEXT: pavgw (%rsi), %xmm1
; SSE2-NEXT: pavgw 16(%rsi), %xmm2
; SSE2-NEXT: pavgw 32(%rsi), %xmm3
; SSE2-NEXT: pavgw 48(%rsi), %xmm4
; SSE2-NEXT: pavgw 64(%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm4, (%rax)
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v40i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0
; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1
; AVX1-NEXT: vpavgw 32(%rsi), %xmm2, %xmm2
; AVX1-NEXT: vpavgw 48(%rsi), %xmm3, %xmm3
; AVX1-NEXT: vmovdqa 64(%rdi), %xmm4
; AVX1-NEXT: vpavgw 64(%rsi), %xmm4, %xmm4
; AVX1-NEXT: vmovdqu %xmm3, (%rax)
; AVX1-NEXT: vmovdqu %xmm2, (%rax)
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vmovdqu %xmm4, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v40i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX2-NEXT: vpavgw 64(%rsi), %xmm2, %xmm2
; AVX2-NEXT: vmovdqu %xmm2, (%rax)
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v40i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512F-NEXT: vpavgw 64(%rsi), %xmm2, %xmm2
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vmovdqu %xmm2, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v40i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm1
; AVX512BW-NEXT: vpavgw 64(%rsi), %xmm1, %xmm1
; AVX512BW-NEXT: vmovdqu %xmm1, (%rax)
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %1 = load <40 x i16>, ptr %a
  %2 = load <40 x i16>, ptr %b
  %3 = zext <40 x i16> %1 to <40 x i32>
  %4 = zext <40 x i16> %2 to <40 x i32>
  %5 = add nuw nsw <40 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <40 x i32> %5, %4
  %7 = lshr <40 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <40 x i32> %7 to <40 x i16>
  store <40 x i16> %8, ptr undef, align 4
  ret void
}

define void @avg_v4i8_2(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v4i8_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v4i8_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <4 x i8>, ptr %a
  %2 = load <4 x i8>, ptr %b
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, %4
  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i8>
  store <4 x i8> %8, ptr undef, align 4
  ret void
}

define void @avg_v8i8_2(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v8i8_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i8_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <8 x i8>, ptr %a
  %2 = load <8 x i8>, ptr %b
  %3 = zext <8 x i8> %1 to <8 x i32>
  %4 = zext <8 x i8> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, %4
  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i8>
  store <8 x i8> %8, ptr undef, align 4
  ret void
}

define void @avg_v16i8_2(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v16i8_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgb (%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v16i8_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <16 x i8>, ptr %a
  %2 = load <16 x i8>, ptr %b
  %3 = zext <16 x i8> %1 to <16 x i32>
  %4 = zext <16 x i8> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, %4
  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i8>
  store <16 x i8> %8, ptr undef, align 4
  ret void
}

define void @avg_v32i8_2(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v32i8_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: pavgb (%rsi), %xmm0
; SSE2-NEXT: pavgb 16(%rsi), %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0
; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i8_2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v32i8_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = load <32 x i8>, ptr %a
  %2 = load <32 x i8>, ptr %b
  %3 = zext <32 x i8> %1 to <32 x i32>
  %4 = zext <32 x i8> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, %4
  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i8>
  store <32 x i8> %8, ptr undef, align 4
  ret void
}

define void @avg_v64i8_2(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v64i8_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rsi), %xmm0
; SSE2-NEXT: movaps 16(%rsi), %xmm1
; SSE2-NEXT: movaps 32(%rsi), %xmm2
; SSE2-NEXT: movaps 48(%rsi), %xmm3
; SSE2-NEXT: movups %xmm3, (%rax)
; SSE2-NEXT: movups %xmm2, (%rax)
; SSE2-NEXT: movups %xmm1, (%rax)
; SSE2-NEXT: movups %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps (%rsi), %ymm0
; AVX1-NEXT: vmovaps 32(%rsi), %ymm1
; AVX1-NEXT: vmovups %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v64i8_2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rsi), %ymm0
; AVX2-NEXT: vmovaps 32(%rsi), %ymm1
; AVX2-NEXT: vmovups %ymm1, (%rax)
; AVX2-NEXT: vmovups %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v64i8_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps (%rsi), %zmm0
; AVX512-NEXT: vmovups %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = load <64 x i8>, ptr %a
  %2 = load <64 x i8>, ptr %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %4, %4
  %6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, ptr undef, align 4
  ret void
}


define void @avg_v4i16_2(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v4i16_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgw %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v4i16_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <4 x i16>, ptr %a
  %2 = load <4 x i16>, ptr %b
  %3 = zext <4 x i16> %1 to <4 x i32>
  %4 = zext <4 x i16> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, %4
  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i16>
  store <4 x i16> %8, ptr undef, align 4
  ret void
}

define void @avg_v8i16_2(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v8i16_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgw (%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i16_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <8 x i16>, ptr %a
  %2 = load <8 x i16>, ptr %b
  %3 = zext <8 x i16> %1 to <8 x i32>
  %4 = zext <8 x i16> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, %4
  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  store <8 x i16> %8, ptr undef, align 4
  ret void
}

define void @avg_v16i16_2(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v16i16_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: pavgw (%rsi), %xmm0
; SSE2-NEXT: pavgw 16(%rsi), %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v16i16_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0
; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v16i16_2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v16i16_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = load <16 x i16>, ptr %a
  %2 = load <16 x i16>, ptr %b
  %3 = zext <16 x i16> %1 to <16 x i32>
  %4 = zext <16 x i16> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, %4
  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i16>
  store <16 x i16> %8, ptr undef, align 4
  ret void
}

define void @avg_v32i16_2(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v32i16_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: movdqa 32(%rdi), %xmm2
; SSE2-NEXT: movdqa 48(%rdi), %xmm3
; SSE2-NEXT: pavgw (%rsi), %xmm0
; SSE2-NEXT: pavgw 16(%rsi), %xmm1
; SSE2-NEXT: pavgw 32(%rsi), %xmm2
; SSE2-NEXT: pavgw 48(%rsi), %xmm3
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i16_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0
; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1
; AVX1-NEXT: vpavgw 32(%rsi), %xmm2, %xmm2
; AVX1-NEXT: vpavgw 48(%rsi), %xmm3, %xmm3
; AVX1-NEXT: vmovdqu %xmm3, (%rax)
; AVX1-NEXT: vmovdqu %xmm2, (%rax)
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i16_2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v32i16_2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i16_2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %1 = load <32 x i16>, ptr %a
  %2 = load <32 x i16>, ptr %b
  %3 = zext <32 x i16> %1 to <32 x i32>
  %4 = zext <32 x i16> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, %4
  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i16>
  store <32 x i16> %8, ptr undef, align 4
  ret void
}

define void @avg_v4i8_const(ptr %a) nounwind {
; SSE2-LABEL: avg_v4i8_const:
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: pavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: movd %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v4i8_const:
; AVX: # %bb.0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <4 x i8>, ptr %a
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <4 x i32> %4 to <4 x i8>
  store <4 x i8> %5, ptr undef, align 4
  ret void
}

define void @avg_v8i8_const(ptr %a) nounwind {
; SSE2-LABEL: avg_v8i8_const:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: movq %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i8_const:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <8 x i8>, ptr %a
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <8 x i32> %4 to <8 x i8>
  store <8 x i8> %5, ptr undef, align 4
  ret void
}

define void @avg_v16i8_const(ptr %a) nounwind {
; SSE2-LABEL: avg_v16i8_const:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v16i8_const:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <16 x i8>, ptr %a
  %2 = zext <16 x i8> %1 to <16 x i32>
  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <16 x i32> %4 to <16 x i8>
  store <16 x i8> %5, ptr undef, align 4
  ret void
}

define void @avg_v32i8_const(ptr %a) nounwind {
; SSE2-LABEL: avg_v32i8_const:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: pavgb 16(%rdi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8_const:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX1-NEXT: # xmm0 = mem[0,0]
; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm1
; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i8_const:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v32i8_const:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = load <32 x i8>, ptr %a
  %2 = zext <32 x i8> %1 to <32 x i32>
  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <32 x i32> %4 to <32 x i8>
  store <32 x i8> %5, ptr undef, align 4
  ret void
}

define void @avg_v64i8_const(ptr %a) nounwind {
; SSE2-LABEL: avg_v64i8_const:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movdqa 16(%rdi), %xmm2
; SSE2-NEXT: pavgb %xmm0, %xmm2
; SSE2-NEXT: movdqa 32(%rdi), %xmm3
; SSE2-NEXT: pavgb %xmm0, %xmm3
; SSE2-NEXT: pavgb 48(%rdi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8_const:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX1-NEXT: # xmm0 = mem[0,0]
; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm1
; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm2
; AVX1-NEXT: vpavgb 32(%rdi), %xmm0, %xmm3
; AVX1-NEXT: vpavgb 48(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vmovdqu %xmm3, (%rax)
; AVX1-NEXT: vmovdqu %xmm2, (%rax)
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v64i8_const:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm1
; AVX2-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v64i8_const:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm1
; AVX512F-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v64i8_const:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %1 = load <64 x i8>, ptr %a
  %2 = zext <64 x i8> %1 to <64 x i32>
  %3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <64 x i32> %4 to <64 x i8>
  store <64 x i8> %5, ptr undef, align 4
  ret void
}

define void @avg_v4i16_const(ptr %a) nounwind {
; SSE2-LABEL: avg_v4i16_const:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: movq %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v4i16_const:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <4 x i16>, ptr %a
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <4 x i32> %4 to <4 x i16>
  store <4 x i16> %5, ptr undef, align 4
  ret void
}

define void @avg_v8i16_const(ptr %a) nounwind {
; SSE2-LABEL: avg_v8i16_const:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i16_const:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <8 x i16>, ptr %a
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <8 x i32> %4 to <8 x i16>
  store <8 x i16> %5, ptr undef, align 4
  ret void
}

define void @avg_v16i16_const(ptr %a) nounwind {
; SSE2-LABEL: avg_v16i16_const:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: pavgw %xmm0, %xmm1
; SSE2-NEXT: pavgw 16(%rdi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v16i16_const:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm1
; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v16i16_const:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v16i16_const:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = load <16 x i16>, ptr %a
  %2 = zext <16 x i16> %1 to <16 x i32>
  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <16 x i32> %4 to <16 x i16>
  store <16 x i16> %5, ptr undef, align 4
  ret void
}

define void @avg_v32i16_const(ptr %a) nounwind {
; SSE2-LABEL: avg_v32i16_const:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: pavgw %xmm0, %xmm1
; SSE2-NEXT: movdqa 16(%rdi), %xmm2
; SSE2-NEXT: pavgw %xmm0, %xmm2
; SSE2-NEXT: movdqa 32(%rdi), %xmm3
; SSE2-NEXT: pavgw %xmm0, %xmm3
; SSE2-NEXT: pavgw 48(%rdi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i16_const:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm1
; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm2
; AVX1-NEXT: vpavgw 32(%rdi), %xmm0, %xmm3
; AVX1-NEXT: vpavgw 48(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vmovdqu %xmm3, (%rax)
; AVX1-NEXT: vmovdqu %xmm2, (%rax)
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i16_const:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX2-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm1
; AVX2-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v32i16_const:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX512F-NEXT: # ymm0 = mem[0,1,0,1]
; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm1
; AVX512F-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i16_const:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %1 = load <32 x i16>, ptr %a
  %2 = zext <32 x i16> %1 to <32 x i32>
  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <32 x i32> %4 to <32 x i16>
  store <32 x i16> %5, ptr undef, align 4
  ret void
}

define <16 x i8> @avg_v16i8_3(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: avg_v16i8_3:
; SSE2: # %bb.0:
; SSE2-NEXT: pavgb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v16i8_3:
; AVX: # %bb.0:
; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %za = zext <16 x i8> %a to <16 x i16>
  %zb = zext <16 x i8> %b to <16 x i16>
  %add = add nuw nsw <16 x i16> %za, %zb
  %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %res = trunc <16 x i16> %lshr to <16 x i8>
  ret <16 x i8> %res
}

define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind {
; SSE2-LABEL: avg_v32i8_3:
; SSE2: # %bb.0:
; SSE2-NEXT: pavgb %xmm2, %xmm0
; SSE2-NEXT: pavgb %xmm3, %xmm1
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8_3:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i8_3:
; AVX2: # %bb.0:
; AVX2-NEXT: vpavgb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v32i8_3:
; AVX512: # %bb.0:
; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
  %za = zext <32 x i8> %a to <32 x i16>
  %zb = zext <32 x i8> %b to <32 x i16>
  %add = add nuw nsw <32 x i16> %za, %zb
  %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %res = trunc <32 x i16> %lshr to <32 x i8>
  ret <32 x i8> %res
}

define <64 x i8> @avg_v64i8_3(<64 x i8> %a, <64 x i8> %b) nounwind {
; SSE2-LABEL: avg_v64i8_3:
; SSE2: # %bb.0:
; SSE2-NEXT: pavgb %xmm4, %xmm0
; SSE2-NEXT: pavgb %xmm5, %xmm1
; SSE2-NEXT: pavgb %xmm6, %xmm2
; SSE2-NEXT: pavgb %xmm7, %xmm3
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8_3:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpavgb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpavgb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v64i8_3:
; AVX2: # %bb.0:
; AVX2-NEXT: vpavgb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpavgb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v64i8_3:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT: vpavgb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v64i8_3:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %za = zext <64 x i8> %a to <64 x i16>
  %zb = zext <64 x i8> %b to <64 x i16>
  %add = add nuw nsw <64 x i16> %za, %zb
  %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %res = trunc <64 x i16> %lshr to <64 x i8>
  ret <64 x i8> %res
}

define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind {
; SSE2-LABEL: avg_v512i8_3:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 496(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 480(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 464(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 448(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 432(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 416(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 400(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 384(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 368(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 352(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 336(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 320(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 304(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 288(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 272(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 256(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 240(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 224(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 208(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 192(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 176(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 160(%rdi)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movdqa %xmm8, 144(%rdi)
movdqa %xmm8, 144(%rdi) 1470; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1471; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1472; SSE2-NEXT: movdqa %xmm8, 128(%rdi) 1473; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm7 1474; SSE2-NEXT: movdqa %xmm7, 112(%rdi) 1475; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm6 1476; SSE2-NEXT: movdqa %xmm6, 96(%rdi) 1477; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm5 1478; SSE2-NEXT: movdqa %xmm5, 80(%rdi) 1479; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm4 1480; SSE2-NEXT: movdqa %xmm4, 64(%rdi) 1481; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm3 1482; SSE2-NEXT: movdqa %xmm3, 48(%rdi) 1483; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm2 1484; SSE2-NEXT: movdqa %xmm2, 32(%rdi) 1485; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm1 1486; SSE2-NEXT: movdqa %xmm1, 16(%rdi) 1487; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm0 1488; SSE2-NEXT: movdqa %xmm0, (%rdi) 1489; SSE2-NEXT: retq 1490; 1491; AVX1-LABEL: avg_v512i8_3: 1492; AVX1: # %bb.0: 1493; AVX1-NEXT: pushq %rbp 1494; AVX1-NEXT: movq %rsp, %rbp 1495; AVX1-NEXT: andq $-32, %rsp 1496; AVX1-NEXT: subq $32, %rsp 1497; AVX1-NEXT: movq %rdi, %rax 1498; AVX1-NEXT: vmovdqa 256(%rbp), %xmm8 1499; AVX1-NEXT: vpavgb 768(%rbp), %xmm8, %xmm8 1500; AVX1-NEXT: vmovdqa %xmm8, 496(%rdi) 1501; AVX1-NEXT: vmovdqa 240(%rbp), %xmm8 1502; AVX1-NEXT: vpavgb 752(%rbp), %xmm8, %xmm8 1503; AVX1-NEXT: vmovdqa %xmm8, 480(%rdi) 1504; AVX1-NEXT: vmovdqa 224(%rbp), %xmm8 1505; AVX1-NEXT: vpavgb 736(%rbp), %xmm8, %xmm8 1506; AVX1-NEXT: vmovdqa %xmm8, 464(%rdi) 1507; AVX1-NEXT: vmovdqa 208(%rbp), %xmm8 1508; AVX1-NEXT: vpavgb 720(%rbp), %xmm8, %xmm8 1509; AVX1-NEXT: vmovdqa %xmm8, 448(%rdi) 1510; AVX1-NEXT: vmovdqa 192(%rbp), %xmm8 1511; AVX1-NEXT: vpavgb 704(%rbp), %xmm8, %xmm8 1512; AVX1-NEXT: vmovdqa %xmm8, 432(%rdi) 1513; AVX1-NEXT: vmovdqa 176(%rbp), %xmm8 1514; AVX1-NEXT: vpavgb 688(%rbp), %xmm8, %xmm8 1515; AVX1-NEXT: vmovdqa %xmm8, 416(%rdi) 1516; AVX1-NEXT: vmovdqa 160(%rbp), %xmm8 1517; AVX1-NEXT: vpavgb 672(%rbp), %xmm8, %xmm8 1518; AVX1-NEXT: vmovdqa %xmm8, 400(%rdi) 1519; AVX1-NEXT: vmovdqa 144(%rbp), %xmm8 1520; AVX1-NEXT: vpavgb 656(%rbp), %xmm8, %xmm8 1521; AVX1-NEXT: vmovdqa %xmm8, 384(%rdi) 1522; AVX1-NEXT: vmovdqa 128(%rbp), %xmm8 1523; AVX1-NEXT: vpavgb 640(%rbp), %xmm8, %xmm8 1524; AVX1-NEXT: vmovdqa %xmm8, 368(%rdi) 1525; AVX1-NEXT: vmovdqa 112(%rbp), %xmm8 1526; AVX1-NEXT: vpavgb 624(%rbp), %xmm8, %xmm8 1527; AVX1-NEXT: vmovdqa %xmm8, 352(%rdi) 1528; AVX1-NEXT: vmovdqa 96(%rbp), %xmm8 1529; AVX1-NEXT: vpavgb 608(%rbp), %xmm8, %xmm8 1530; AVX1-NEXT: vmovdqa %xmm8, 336(%rdi) 1531; AVX1-NEXT: vmovdqa 80(%rbp), %xmm8 1532; AVX1-NEXT: vpavgb 592(%rbp), %xmm8, %xmm8 1533; AVX1-NEXT: vmovdqa %xmm8, 320(%rdi) 1534; AVX1-NEXT: vmovdqa 64(%rbp), %xmm8 1535; AVX1-NEXT: vpavgb 576(%rbp), %xmm8, %xmm8 1536; AVX1-NEXT: vmovdqa %xmm8, 304(%rdi) 1537; AVX1-NEXT: vmovdqa 48(%rbp), %xmm8 1538; AVX1-NEXT: vpavgb 560(%rbp), %xmm8, %xmm8 1539; AVX1-NEXT: vmovdqa %xmm8, 288(%rdi) 1540; AVX1-NEXT: vmovdqa 32(%rbp), %xmm8 1541; AVX1-NEXT: vpavgb 544(%rbp), %xmm8, %xmm8 1542; AVX1-NEXT: vmovdqa %xmm8, 272(%rdi) 1543; AVX1-NEXT: vmovdqa 16(%rbp), %xmm8 1544; AVX1-NEXT: vpavgb 528(%rbp), %xmm8, %xmm8 1545; AVX1-NEXT: vmovdqa %xmm8, 256(%rdi) 1546; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8 1547; AVX1-NEXT: vpavgb 512(%rbp), %xmm8, %xmm8 1548; AVX1-NEXT: vmovdqa %xmm8, 240(%rdi) 1549; AVX1-NEXT: vpavgb 496(%rbp), %xmm7, %xmm7 1550; AVX1-NEXT: vmovdqa %xmm7, 224(%rdi) 1551; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7 1552; AVX1-NEXT: vpavgb 480(%rbp), %xmm7, %xmm7 1553; AVX1-NEXT: vmovdqa %xmm7, 
208(%rdi) 1554; AVX1-NEXT: vpavgb 464(%rbp), %xmm6, %xmm6 1555; AVX1-NEXT: vmovdqa %xmm6, 192(%rdi) 1556; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 1557; AVX1-NEXT: vpavgb 448(%rbp), %xmm6, %xmm6 1558; AVX1-NEXT: vmovdqa %xmm6, 176(%rdi) 1559; AVX1-NEXT: vpavgb 432(%rbp), %xmm5, %xmm5 1560; AVX1-NEXT: vmovdqa %xmm5, 160(%rdi) 1561; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 1562; AVX1-NEXT: vpavgb 416(%rbp), %xmm5, %xmm5 1563; AVX1-NEXT: vmovdqa %xmm5, 144(%rdi) 1564; AVX1-NEXT: vpavgb 400(%rbp), %xmm4, %xmm4 1565; AVX1-NEXT: vmovdqa %xmm4, 128(%rdi) 1566; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 1567; AVX1-NEXT: vpavgb 384(%rbp), %xmm4, %xmm4 1568; AVX1-NEXT: vmovdqa %xmm4, 112(%rdi) 1569; AVX1-NEXT: vpavgb 368(%rbp), %xmm3, %xmm3 1570; AVX1-NEXT: vmovdqa %xmm3, 96(%rdi) 1571; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 1572; AVX1-NEXT: vpavgb 352(%rbp), %xmm3, %xmm3 1573; AVX1-NEXT: vmovdqa %xmm3, 80(%rdi) 1574; AVX1-NEXT: vpavgb 336(%rbp), %xmm2, %xmm2 1575; AVX1-NEXT: vmovdqa %xmm2, 64(%rdi) 1576; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1577; AVX1-NEXT: vpavgb 320(%rbp), %xmm2, %xmm2 1578; AVX1-NEXT: vmovdqa %xmm2, 48(%rdi) 1579; AVX1-NEXT: vpavgb 304(%rbp), %xmm1, %xmm1 1580; AVX1-NEXT: vmovdqa %xmm1, 32(%rdi) 1581; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1582; AVX1-NEXT: vpavgb 288(%rbp), %xmm1, %xmm1 1583; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi) 1584; AVX1-NEXT: vpavgb 272(%rbp), %xmm0, %xmm0 1585; AVX1-NEXT: vmovdqa %xmm0, (%rdi) 1586; AVX1-NEXT: movq %rbp, %rsp 1587; AVX1-NEXT: popq %rbp 1588; AVX1-NEXT: vzeroupper 1589; AVX1-NEXT: retq 1590; 1591; AVX2-LABEL: avg_v512i8_3: 1592; AVX2: # %bb.0: 1593; AVX2-NEXT: pushq %rbp 1594; AVX2-NEXT: movq %rsp, %rbp 1595; AVX2-NEXT: andq $-32, %rsp 1596; AVX2-NEXT: subq $32, %rsp 1597; AVX2-NEXT: movq %rdi, %rax 1598; AVX2-NEXT: vmovdqa 240(%rbp), %ymm8 1599; AVX2-NEXT: vmovdqa 208(%rbp), %ymm9 1600; AVX2-NEXT: vmovdqa 176(%rbp), %ymm10 1601; AVX2-NEXT: vmovdqa 144(%rbp), %ymm11 1602; AVX2-NEXT: vmovdqa 112(%rbp), %ymm12 1603; AVX2-NEXT: vmovdqa 80(%rbp), %ymm13 1604; AVX2-NEXT: vmovdqa 48(%rbp), %ymm14 1605; AVX2-NEXT: vmovdqa 16(%rbp), %ymm15 1606; AVX2-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0 1607; AVX2-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1 1608; AVX2-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2 1609; AVX2-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3 1610; AVX2-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4 1611; AVX2-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5 1612; AVX2-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6 1613; AVX2-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7 1614; AVX2-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15 1615; AVX2-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14 1616; AVX2-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13 1617; AVX2-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12 1618; AVX2-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11 1619; AVX2-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10 1620; AVX2-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9 1621; AVX2-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8 1622; AVX2-NEXT: vmovdqa %ymm8, 480(%rdi) 1623; AVX2-NEXT: vmovdqa %ymm9, 448(%rdi) 1624; AVX2-NEXT: vmovdqa %ymm10, 416(%rdi) 1625; AVX2-NEXT: vmovdqa %ymm11, 384(%rdi) 1626; AVX2-NEXT: vmovdqa %ymm12, 352(%rdi) 1627; AVX2-NEXT: vmovdqa %ymm13, 320(%rdi) 1628; AVX2-NEXT: vmovdqa %ymm14, 288(%rdi) 1629; AVX2-NEXT: vmovdqa %ymm15, 256(%rdi) 1630; AVX2-NEXT: vmovdqa %ymm7, 224(%rdi) 1631; AVX2-NEXT: vmovdqa %ymm6, 192(%rdi) 1632; AVX2-NEXT: vmovdqa %ymm5, 160(%rdi) 1633; AVX2-NEXT: vmovdqa %ymm4, 128(%rdi) 1634; AVX2-NEXT: vmovdqa %ymm3, 96(%rdi) 1635; AVX2-NEXT: vmovdqa %ymm2, 64(%rdi) 1636; AVX2-NEXT: vmovdqa %ymm1, 32(%rdi) 1637; AVX2-NEXT: 
vmovdqa %ymm0, (%rdi) 1638; AVX2-NEXT: movq %rbp, %rsp 1639; AVX2-NEXT: popq %rbp 1640; AVX2-NEXT: vzeroupper 1641; AVX2-NEXT: retq 1642; 1643; AVX512F-LABEL: avg_v512i8_3: 1644; AVX512F: # %bb.0: 1645; AVX512F-NEXT: pushq %rbp 1646; AVX512F-NEXT: movq %rsp, %rbp 1647; AVX512F-NEXT: andq $-64, %rsp 1648; AVX512F-NEXT: subq $64, %rsp 1649; AVX512F-NEXT: movq %rdi, %rax 1650; AVX512F-NEXT: vpavgb 16(%rbp), %ymm0, %ymm8 1651; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1652; AVX512F-NEXT: vpavgb 48(%rbp), %ymm0, %ymm0 1653; AVX512F-NEXT: vpavgb 80(%rbp), %ymm1, %ymm9 1654; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1655; AVX512F-NEXT: vpavgb 112(%rbp), %ymm1, %ymm1 1656; AVX512F-NEXT: vpavgb 144(%rbp), %ymm2, %ymm10 1657; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 1658; AVX512F-NEXT: vpavgb 176(%rbp), %ymm2, %ymm2 1659; AVX512F-NEXT: vpavgb 208(%rbp), %ymm3, %ymm11 1660; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3 1661; AVX512F-NEXT: vpavgb 240(%rbp), %ymm3, %ymm3 1662; AVX512F-NEXT: vpavgb 272(%rbp), %ymm4, %ymm12 1663; AVX512F-NEXT: vextracti64x4 $1, %zmm4, %ymm4 1664; AVX512F-NEXT: vpavgb 304(%rbp), %ymm4, %ymm4 1665; AVX512F-NEXT: vpavgb 336(%rbp), %ymm5, %ymm13 1666; AVX512F-NEXT: vextracti64x4 $1, %zmm5, %ymm5 1667; AVX512F-NEXT: vpavgb 368(%rbp), %ymm5, %ymm5 1668; AVX512F-NEXT: vpavgb 400(%rbp), %ymm6, %ymm14 1669; AVX512F-NEXT: vextracti64x4 $1, %zmm6, %ymm6 1670; AVX512F-NEXT: vpavgb 432(%rbp), %ymm6, %ymm6 1671; AVX512F-NEXT: vpavgb 464(%rbp), %ymm7, %ymm15 1672; AVX512F-NEXT: vextracti64x4 $1, %zmm7, %ymm7 1673; AVX512F-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7 1674; AVX512F-NEXT: vmovdqa %ymm7, 480(%rdi) 1675; AVX512F-NEXT: vmovdqa %ymm15, 448(%rdi) 1676; AVX512F-NEXT: vmovdqa %ymm6, 416(%rdi) 1677; AVX512F-NEXT: vmovdqa %ymm14, 384(%rdi) 1678; AVX512F-NEXT: vmovdqa %ymm5, 352(%rdi) 1679; AVX512F-NEXT: vmovdqa %ymm13, 320(%rdi) 1680; AVX512F-NEXT: vmovdqa %ymm4, 288(%rdi) 1681; AVX512F-NEXT: vmovdqa %ymm12, 256(%rdi) 1682; AVX512F-NEXT: vmovdqa %ymm3, 224(%rdi) 1683; AVX512F-NEXT: vmovdqa %ymm11, 192(%rdi) 1684; AVX512F-NEXT: vmovdqa %ymm2, 160(%rdi) 1685; AVX512F-NEXT: vmovdqa %ymm10, 128(%rdi) 1686; AVX512F-NEXT: vmovdqa %ymm1, 96(%rdi) 1687; AVX512F-NEXT: vmovdqa %ymm9, 64(%rdi) 1688; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdi) 1689; AVX512F-NEXT: vmovdqa %ymm8, (%rdi) 1690; AVX512F-NEXT: movq %rbp, %rsp 1691; AVX512F-NEXT: popq %rbp 1692; AVX512F-NEXT: vzeroupper 1693; AVX512F-NEXT: retq 1694; 1695; AVX512BW-LABEL: avg_v512i8_3: 1696; AVX512BW: # %bb.0: 1697; AVX512BW-NEXT: pushq %rbp 1698; AVX512BW-NEXT: movq %rsp, %rbp 1699; AVX512BW-NEXT: andq $-64, %rsp 1700; AVX512BW-NEXT: subq $64, %rsp 1701; AVX512BW-NEXT: movq %rdi, %rax 1702; AVX512BW-NEXT: vpavgb 16(%rbp), %zmm0, %zmm0 1703; AVX512BW-NEXT: vpavgb 80(%rbp), %zmm1, %zmm1 1704; AVX512BW-NEXT: vpavgb 144(%rbp), %zmm2, %zmm2 1705; AVX512BW-NEXT: vpavgb 208(%rbp), %zmm3, %zmm3 1706; AVX512BW-NEXT: vpavgb 272(%rbp), %zmm4, %zmm4 1707; AVX512BW-NEXT: vpavgb 336(%rbp), %zmm5, %zmm5 1708; AVX512BW-NEXT: vpavgb 400(%rbp), %zmm6, %zmm6 1709; AVX512BW-NEXT: vpavgb 464(%rbp), %zmm7, %zmm7 1710; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdi) 1711; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdi) 1712; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdi) 1713; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdi) 1714; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdi) 1715; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdi) 1716; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdi) 1717; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdi) 1718; AVX512BW-NEXT: movq %rbp, %rsp 1719; AVX512BW-NEXT: popq %rbp 1720; 
AVX512BW-NEXT: vzeroupper 1721; AVX512BW-NEXT: retq 1722 %za = zext <512 x i8> %a to <512 x i16> 1723 %zb = zext <512 x i8> %b to <512 x i16> 1724 %add = add nuw nsw <512 x i16> %za, %zb 1725 %add1 = add nuw nsw <512 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 
1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 1726 %lshr = lshr <512 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, 
i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 1727 %res = trunc <512 x i16> %lshr to <512 x i8> 1728 ret <512 x i8> %res 1729} 1730 1731; This is not an avgceilu, but its structurally similar and previously caused a crash 1732; because the constants can't be read with APInt::getZExtValue. 1733define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { 1734; SSE2-LABEL: not_avg_v16i8_wide_constants: 1735; SSE2: # %bb.0: 1736; SSE2-NEXT: movaps (%rdi), %xmm1 1737; SSE2-NEXT: movdqa (%rsi), %xmm0 1738; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) 1739; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1740; SSE2-NEXT: decl %eax 1741; SSE2-NEXT: movd %eax, %xmm2 1742; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1743; SSE2-NEXT: decl %eax 1744; SSE2-NEXT: movd %eax, %xmm1 1745; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1746; SSE2-NEXT: decl %eax 1747; SSE2-NEXT: movd %eax, %xmm3 1748; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1749; SSE2-NEXT: decl %eax 1750; SSE2-NEXT: movd %eax, %xmm4 1751; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1752; SSE2-NEXT: decl %eax 1753; SSE2-NEXT: movd %eax, %xmm5 1754; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1755; SSE2-NEXT: decl %eax 1756; SSE2-NEXT: movd %eax, %xmm6 1757; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1758; SSE2-NEXT: decl %eax 1759; SSE2-NEXT: movd %eax, %xmm7 1760; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1761; SSE2-NEXT: decl %eax 1762; SSE2-NEXT: movd %eax, %xmm8 1763; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1764; SSE2-NEXT: decl %eax 1765; SSE2-NEXT: movd %eax, %xmm10 1766; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1767; SSE2-NEXT: decl %eax 1768; SSE2-NEXT: movd %eax, %xmm9 1769; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1770; SSE2-NEXT: decl %eax 1771; SSE2-NEXT: movd %eax, %xmm11 1772; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1773; SSE2-NEXT: decl %eax 1774; SSE2-NEXT: movd %eax, %xmm12 1775; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1776; SSE2-NEXT: decl %eax 1777; SSE2-NEXT: movd %eax, %xmm13 1778; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1779; SSE2-NEXT: decl %eax 1780; SSE2-NEXT: movd %eax, %xmm14 1781; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1782; SSE2-NEXT: decl %eax 1783; SSE2-NEXT: movd %eax, %xmm15 1784; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 1785; SSE2-NEXT: decl %eax 1786; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 1787; SSE2-NEXT: movd %eax, %xmm2 1788; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 1789; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] 1790; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 1791; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] 1792; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0] 1793; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 1794; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,0,0] 1795; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] 1796; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] 1797; SSE2-NEXT: pxor %xmm3, %xmm3 1798; SSE2-NEXT: movdqa %xmm0, %xmm1 1799; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 1800; SSE2-NEXT: movapd %xmm4, %xmm5 1801; SSE2-NEXT: andpd %xmm1, %xmm5 1802; SSE2-NEXT: xorpd %xmm4, %xmm1 1803; SSE2-NEXT: psrlw $1, %xmm1 1804; SSE2-NEXT: paddw %xmm5, %xmm1 1805; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] 1806; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] 1807; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,0,0,0] 1808; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] 1809; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] 1810; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0] 1811; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] 1812; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1813; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] 1814; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] 1815; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] 1816; SSE2-NEXT: movapd %xmm2, %xmm3 1817; SSE2-NEXT: andpd %xmm0, %xmm3 1818; SSE2-NEXT: xorpd %xmm2, %xmm0 1819; SSE2-NEXT: psrlw $1, %xmm0 1820; SSE2-NEXT: paddw %xmm3, %xmm0 1821; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 1822; SSE2-NEXT: pand %xmm2, %xmm0 1823; SSE2-NEXT: pand %xmm2, %xmm1 1824; SSE2-NEXT: packuswb %xmm0, %xmm1 1825; SSE2-NEXT: movdqu %xmm1, (%rax) 1826; SSE2-NEXT: retq 1827; 1828; AVX1-LABEL: not_avg_v16i8_wide_constants: 1829; AVX1: # %bb.0: 1830; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1831; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1832; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1833; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1834; AVX1-NEXT: vpextrw $7, %xmm3, %edx 1835; AVX1-NEXT: vpextrw $6, %xmm3, %ecx 1836; AVX1-NEXT: vpextrw $5, %xmm3, %eax 1837; AVX1-NEXT: decl %edx 1838; AVX1-NEXT: vmovd %edx, %xmm4 1839; AVX1-NEXT: vpextrw $4, %xmm3, %edx 1840; AVX1-NEXT: decl %ecx 1841; AVX1-NEXT: vmovd %ecx, %xmm5 1842; AVX1-NEXT: vpextrw $1, %xmm3, %ecx 1843; AVX1-NEXT: decl %eax 1844; AVX1-NEXT: vmovd %eax, %xmm6 1845; AVX1-NEXT: vpextrw $0, %xmm3, %eax 1846; AVX1-NEXT: decl %edx 1847; AVX1-NEXT: vmovd %edx, %xmm7 1848; AVX1-NEXT: vpextrw $3, %xmm3, %edx 1849; AVX1-NEXT: decq %rcx 1850; AVX1-NEXT: vmovq %rcx, %xmm8 1851; AVX1-NEXT: vpextrw $2, %xmm3, %ecx 1852; AVX1-NEXT: decq %rax 1853; AVX1-NEXT: vmovq %rax, %xmm3 1854; AVX1-NEXT: vpextrw $7, %xmm2, %eax 1855; AVX1-NEXT: decl %edx 1856; AVX1-NEXT: vmovd %edx, %xmm9 1857; AVX1-NEXT: vpextrw $6, %xmm2, %edx 1858; AVX1-NEXT: decl %ecx 1859; AVX1-NEXT: vmovd %ecx, %xmm10 1860; AVX1-NEXT: vpextrw $5, %xmm2, %ecx 1861; AVX1-NEXT: decl %eax 1862; AVX1-NEXT: vmovd %eax, %xmm11 1863; AVX1-NEXT: vpextrw $4, %xmm2, %eax 1864; AVX1-NEXT: decl %edx 1865; AVX1-NEXT: vmovd %edx, %xmm12 1866; AVX1-NEXT: vpextrw $1, %xmm2, %edx 1867; AVX1-NEXT: 
decl %ecx 1868; AVX1-NEXT: vmovd %ecx, %xmm13 1869; AVX1-NEXT: vpextrw $0, %xmm2, %ecx 1870; AVX1-NEXT: decl %eax 1871; AVX1-NEXT: vmovd %eax, %xmm14 1872; AVX1-NEXT: vpextrw $3, %xmm2, %eax 1873; AVX1-NEXT: decq %rdx 1874; AVX1-NEXT: vmovq %rdx, %xmm15 1875; AVX1-NEXT: vpextrw $2, %xmm2, %edx 1876; AVX1-NEXT: decq %rcx 1877; AVX1-NEXT: vmovq %rcx, %xmm2 1878; AVX1-NEXT: decl %eax 1879; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 1880; AVX1-NEXT: vmovd %eax, %xmm5 1881; AVX1-NEXT: decl %edx 1882; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 1883; AVX1-NEXT: vmovd %edx, %xmm7 1884; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] 1885; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] 1886; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7] 1887; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] 1888; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] 1889; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1] 1890; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5,6,7] 1891; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] 1892; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] 1893; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] 1894; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] 1895; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] 1896; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7] 1897; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] 1898; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] 1899; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] 1900; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5,6,7] 1901; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] 1902; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 1903; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1904; AVX1-NEXT: vandps %ymm0, %ymm2, %ymm1 1905; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0 1906; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2 1907; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm2 1908; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1909; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1910; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 1911; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 1912; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] 1913; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 1914; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 1915; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 1916; AVX1-NEXT: vmovdqu %xmm0, (%rax) 1917; AVX1-NEXT: vzeroupper 1918; AVX1-NEXT: retq 1919; 1920; AVX2-LABEL: not_avg_v16i8_wide_constants: 1921; AVX2: # %bb.0: 1922; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1923; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1924; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 1925; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1926; AVX2-NEXT: 
vpaddw %ymm1, %ymm0, %ymm0 1927; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 1928; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1929; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1930; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1931; AVX2-NEXT: vmovdqu %xmm0, (%rax) 1932; AVX2-NEXT: vzeroupper 1933; AVX2-NEXT: retq 1934; 1935; AVX512F-LABEL: not_avg_v16i8_wide_constants: 1936; AVX512F: # %bb.0: 1937; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1938; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1939; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0 1940; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1941; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0 1942; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 1943; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1944; AVX512F-NEXT: vpmovdb %zmm0, (%rax) 1945; AVX512F-NEXT: vzeroupper 1946; AVX512F-NEXT: retq 1947; 1948; AVX512BW-LABEL: not_avg_v16i8_wide_constants: 1949; AVX512BW: # %bb.0: 1950; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1951; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1952; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0 1953; AVX512BW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1954; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0 1955; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm0 1956; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1957; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) 1958; AVX512BW-NEXT: vzeroupper 1959; AVX512BW-NEXT: retq 1960 %1 = load <16 x i8>, ptr %a 1961 %2 = load <16 x i8>, ptr %b 1962 %3 = zext <16 x i8> %1 to <16 x i128> 1963 %4 = zext <16 x i8> %2 to <16 x i128> 1964 %5 = add <16 x i128> %3, <i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1> 1965 %6 = add <16 x i128> %5, %4 1966 %7 = lshr <16 x i128> %6, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1> 1967 %8 = trunc <16 x i128> %7 to <16 x i8> 1968 store <16 x i8> %8, ptr undef, align 4 1969 ret void 1970} 1971 1972; Make sure we don't fail on single element vectors. 
1973define <1 x i8> @avg_v1i8(<1 x i8> %x, <1 x i8> %y) { 1974; CHECK-LABEL: avg_v1i8: 1975; CHECK: # %bb.0: 1976; CHECK-NEXT: movzbl %sil, %eax 1977; CHECK-NEXT: movzbl %dil, %ecx 1978; CHECK-NEXT: leal 1(%rcx,%rax), %eax 1979; CHECK-NEXT: shrl %eax 1980; CHECK-NEXT: # kill: def $al killed $al killed $eax 1981; CHECK-NEXT: retq 1982 %a = zext <1 x i8> %x to <1 x i16> 1983 %b = zext <1 x i8> %y to <1 x i16> 1984 %c = add <1 x i16> %a, %b 1985 %d = add <1 x i16> %c, <i16 1> 1986 %e = lshr <1 x i16> %d, <i16 1> 1987 %f = trunc <1 x i16> %e to <1 x i8> 1988 ret <1 x i8> %f 1989} 1990 1991; _mm_avg_epu16( _mm_slli_epi16(a, 2), _mm_slli_epi16(b, 2)) 1992define <2 x i64> @PR41316(<2 x i64>, <2 x i64>) { 1993; SSE2-LABEL: PR41316: 1994; SSE2: # %bb.0: 1995; SSE2-NEXT: psllw $2, %xmm0 1996; SSE2-NEXT: psllw $2, %xmm1 1997; SSE2-NEXT: pavgw %xmm1, %xmm0 1998; SSE2-NEXT: retq 1999; 2000; AVX-LABEL: PR41316: 2001; AVX: # %bb.0: 2002; AVX-NEXT: vpsllw $2, %xmm0, %xmm0 2003; AVX-NEXT: vpsllw $2, %xmm1, %xmm1 2004; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0 2005; AVX-NEXT: retq 2006 %3 = bitcast <2 x i64> %0 to <8 x i16> 2007 %4 = shl <8 x i16> %3, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2> 2008 %5 = bitcast <2 x i64> %1 to <8 x i16> 2009 %6 = shl <8 x i16> %5, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2> 2010 %7 = zext <8 x i16> %6 to <8 x i32> 2011 %8 = or <8 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 2012 %9 = zext <8 x i16> %8 to <8 x i32> 2013 %10 = add nuw nsw <8 x i32> %9, %7 2014 %11 = lshr <8 x i32> %10, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2015 %12 = trunc <8 x i32> %11 to <8 x i16> 2016 %13 = bitcast <8 x i16> %12 to <2 x i64> 2017 ret <2 x i64> %13 2018} 2019 2020; shuffle(avg(shuffle(),shuffle())) -> avg(shuffle(),shuffle()) 2021define <16 x i8> @fold_avgb_shuffles(<16 x i8> %x, <16 x i8> %y) { 2022; SSE2-LABEL: fold_avgb_shuffles: 2023; SSE2: # %bb.0: # %entry 2024; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] 2025; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 2026; SSE2-NEXT: pavgb %xmm1, %xmm0 2027; SSE2-NEXT: retq 2028; 2029; AVX-LABEL: fold_avgb_shuffles: 2030; AVX: # %bb.0: # %entry 2031; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] 2032; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 2033; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0 2034; AVX-NEXT: retq 2035entry: 2036 %0 = shufflevector <16 x i8> %x, <16 x i8> poison, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7> 2037 %1 = shufflevector <16 x i8> %y, <16 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 2038 %2 = tail call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %0, <16 x i8> %1) 2039 %3 = shufflevector <16 x i8> %2, <16 x i8> poison, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> 2040 ret <16 x i8> %3 2041} 2042declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) 2043 2044define <8 x i16> @fold_avgw_shuffles(<8 x i16> %x, <8 x i16> %y) { 2045; SSE2-LABEL: fold_avgw_shuffles: 2046; SSE2: # %bb.0: # %entry 2047; SSE2-NEXT: pavgw %xmm1, %xmm0 2048; SSE2-NEXT: retq 2049; 2050; AVX-LABEL: fold_avgw_shuffles: 2051; AVX: # %bb.0: # %entry 2052; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 2053; AVX-NEXT: retq 2054entry: 2055 %0 = shufflevector <8 x i16> %x, <8 x i16> poison, <8 x i32> <i32 6, i32 7, i32 4, i32 5, 
i32 2, i32 3, i32 0, i32 1> 2056 %1 = shufflevector <8 x i16> %y, <8 x i16> poison, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1> 2057 %2 = tail call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %0, <8 x i16> %1) 2058 %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1> 2059 ret <8 x i16> %3 2060} 2061declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) 2062 2063define <8 x i16> @PR52131_pavg_chain(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { 2064; SSE2-LABEL: PR52131_pavg_chain: 2065; SSE2: # %bb.0: 2066; SSE2-NEXT: pavgw %xmm1, %xmm0 2067; SSE2-NEXT: pavgw %xmm2, %xmm0 2068; SSE2-NEXT: retq 2069; 2070; AVX-LABEL: PR52131_pavg_chain: 2071; AVX: # %bb.0: 2072; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 2073; AVX-NEXT: vpavgw %xmm0, %xmm2, %xmm0 2074; AVX-NEXT: retq 2075 %i = zext <8 x i16> %a to <8 x i32> 2076 %i1 = zext <8 x i16> %b to <8 x i32> 2077 %i2 = add nuw nsw <8 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2078 %i3 = add nuw nsw <8 x i32> %i2, %i1 2079 %i4 = lshr <8 x i32> %i3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2080 %i5 = and <8 x i32> %i4, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> 2081 %i6 = zext <8 x i16> %c to <8 x i32> 2082 %i7 = add nuw nsw <8 x i32> %i6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2083 %i8 = add nuw nsw <8 x i32> %i7, %i5 2084 %i9 = lshr <8 x i32> %i8, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2085 %i10 = trunc <8 x i32> %i9 to <8 x i16> 2086 ret <8 x i16> %i10 2087} 2088 2089define <8 x i16> @PR52131_pavg_chainlike_but_not_zext(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { 2090; SSE2-LABEL: PR52131_pavg_chainlike_but_not_zext: 2091; SSE2: # %bb.0: 2092; SSE2-NEXT: pavgw %xmm1, %xmm0 2093; SSE2-NEXT: pavgw %xmm2, %xmm0 2094; SSE2-NEXT: retq 2095; 2096; AVX-LABEL: PR52131_pavg_chainlike_but_not_zext: 2097; AVX: # %bb.0: 2098; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 2099; AVX-NEXT: vpavgw %xmm0, %xmm2, %xmm0 2100; AVX-NEXT: retq 2101 %i = zext <8 x i16> %a to <8 x i32> 2102 %i1 = zext <8 x i16> %b to <8 x i32> 2103 %i2 = add nuw nsw <8 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2104 %i3 = add nuw nsw <8 x i32> %i2, %i1 2105 %i4 = lshr <8 x i32> %i3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2106 %i5 = and <8 x i32> %i4, <i32 131071, i32 131071, i32 131071, i32 131071, i32 131071, i32 131071, i32 131071, i32 131071> 2107 %i6 = zext <8 x i16> %c to <8 x i32> 2108 %i7 = add nuw nsw <8 x i32> %i6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2109 %i8 = add nuw nsw <8 x i32> %i7, %i5 2110 %i9 = lshr <8 x i32> %i8, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2111 %i10 = trunc <8 x i32> %i9 to <8 x i16> 2112 ret <8 x i16> %i10 2113} 2114 2115define <8 x i16> @PR52131_pavg_with_mask(<8 x i32> %a, <8 x i16> %b) { 2116; SSE2-LABEL: PR52131_pavg_with_mask: 2117; SSE2: # %bb.0: 2118; SSE2-NEXT: pslld $16, %xmm1 2119; SSE2-NEXT: psrad $16, %xmm1 2120; SSE2-NEXT: pslld $16, %xmm0 2121; SSE2-NEXT: psrad $16, %xmm0 2122; SSE2-NEXT: packssdw %xmm1, %xmm0 2123; SSE2-NEXT: pavgw %xmm2, %xmm0 2124; SSE2-NEXT: retq 2125; 2126; AVX1-LABEL: PR52131_pavg_with_mask: 2127; AVX1: # %bb.0: 2128; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2129; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2130; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 2131; AVX1-NEXT: vpavgw %xmm0, %xmm1, %xmm0 2132; AVX1-NEXT: vzeroupper 2133; 
AVX1-NEXT: retq 2134; 2135; AVX2-LABEL: PR52131_pavg_with_mask: 2136; AVX2: # %bb.0: 2137; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2138; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2139; AVX2-NEXT: vpavgw %xmm0, %xmm1, %xmm0 2140; AVX2-NEXT: vzeroupper 2141; AVX2-NEXT: retq 2142; 2143; AVX512-LABEL: PR52131_pavg_with_mask: 2144; AVX512: # %bb.0: 2145; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2146; AVX512-NEXT: vpmovdw %zmm0, %ymm0 2147; AVX512-NEXT: vpavgw %xmm0, %xmm1, %xmm0 2148; AVX512-NEXT: vzeroupper 2149; AVX512-NEXT: retq 2150 %i = and <8 x i32> %a, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> 2151 %i3 = zext <8 x i16> %b to <8 x i32> 2152 %i4 = add nuw nsw <8 x i32> %i3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2153 %i5 = add nuw nsw <8 x i32> %i4, %i 2154 %i6 = lshr <8 x i32> %i5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2155 %i7 = trunc <8 x i32> %i6 to <8 x i16> 2156 ret <8 x i16> %i7 2157} 2158 2159define <8 x i16> @PR52131_not_zext_with_constant(<8 x i32> %a) { 2160; SSE2-LABEL: PR52131_not_zext_with_constant: 2161; SSE2: # %bb.0: 2162; SSE2-NEXT: pslld $16, %xmm1 2163; SSE2-NEXT: psrad $16, %xmm1 2164; SSE2-NEXT: pslld $16, %xmm0 2165; SSE2-NEXT: psrad $16, %xmm0 2166; SSE2-NEXT: packssdw %xmm1, %xmm0 2167; SSE2-NEXT: pavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2168; SSE2-NEXT: retq 2169; 2170; AVX1-LABEL: PR52131_not_zext_with_constant: 2171; AVX1: # %bb.0: 2172; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2173; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2174; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2175; AVX1-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2176; AVX1-NEXT: vzeroupper 2177; AVX1-NEXT: retq 2178; 2179; AVX2-LABEL: PR52131_not_zext_with_constant: 2180; AVX2: # %bb.0: 2181; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2182; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2183; AVX2-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2184; AVX2-NEXT: vzeroupper 2185; AVX2-NEXT: retq 2186; 2187; AVX512-LABEL: PR52131_not_zext_with_constant: 2188; AVX512: # %bb.0: 2189; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2190; AVX512-NEXT: vpmovdw %zmm0, %ymm0 2191; AVX512-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2192; AVX512-NEXT: vzeroupper 2193; AVX512-NEXT: retq 2194 %i = and <8 x i32> %a, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> 2195 %i1 = add nuw nsw <8 x i32> %i, <i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43> 2196 %i2 = lshr <8 x i32> %i1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2197 %i3 = trunc <8 x i32> %i2 to <8 x i16> 2198 ret <8 x i16> %i3 2199} 2200 2201define i64 @PR95284(i32 %a0) { 2202; CHECK-LABEL: PR95284: 2203; CHECK: # %bb.0: 2204; CHECK-NEXT: movl %edi, %eax 2205; CHECK-NEXT: decq %rax 2206; CHECK-NEXT: shrq %rax 2207; CHECK-NEXT: incq %rax 2208; CHECK-NEXT: andq $-2, %rax 2209; CHECK-NEXT: retq 2210 %ext = zext nneg i32 %a0 to i64 2211 %dec = add i64 %ext, -1 2212 %srl = lshr i64 %dec, 1 2213 %inc = add nuw nsw i64 %srl, 1 2214 %res = and i64 %inc, 9223372036854775806 2215 ret i64 %res 2216} 2217
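; Illustrative sketch only (not part of the autogenerated assertions above, no
; CHECK lines, and the function name below is made up for illustration): this
; restates the rounded-average idiom the avg_* tests in this file exercise --
; widen both operands, add them plus a rounding bias of 1, shift right by one,
; and truncate back. On the SSE2/AVX configurations tested here this pattern is
; expected to select a single PAVGB, as the avg_v16i8 checks above show.
define <16 x i8> @illustrative_avgceilu_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
  %za = zext <16 x i8> %a to <16 x i16>
  %zb = zext <16 x i8> %b to <16 x i16>
  %sum = add nuw nsw <16 x i16> %za, %zb
  %rnd = add nuw nsw <16 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %srl = lshr <16 x i16> %rnd, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %res = trunc <16 x i16> %srl to <16 x i8>
  ret <16 x i8> %res
}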