1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 2; ## Support i8x4 instructions 3; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 \ 4; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ 5; RUN: | FileCheck -allow-deprecated-dag-overlap %s 6; RUN: %if ptxas %{ \ 7; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 \ 8; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ 9; RUN: | %ptxas-verify -arch=sm_90 \ 10; RUN: %} 11 12target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" 13 14define <4 x i8> @test_ret_const() #0 { 15; CHECK-LABEL: test_ret_const( 16; CHECK: { 17; CHECK-NEXT: .reg .b32 %r<2>; 18; CHECK-EMPTY: 19; CHECK-NEXT: // %bb.0: 20; CHECK-NEXT: mov.b32 %r1, -66911489; 21; CHECK-NEXT: st.param.b32 [func_retval0], %r1; 22; CHECK-NEXT: ret; 23 ret <4 x i8> <i8 -1, i8 2, i8 3, i8 -4> 24} 25 26define i8 @test_extract_0(<4 x i8> %a) #0 { 27; CHECK-LABEL: test_extract_0( 28; CHECK: { 29; CHECK-NEXT: .reg .b32 %r<3>; 30; CHECK-EMPTY: 31; CHECK-NEXT: // %bb.0: 32; CHECK-NEXT: ld.param.u32 %r1, [test_extract_0_param_0]; 33; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; 34; CHECK-NEXT: st.param.b32 [func_retval0], %r2; 35; CHECK-NEXT: ret; 36 %e = extractelement <4 x i8> %a, i32 0 37 ret i8 %e 38} 39 40define i8 @test_extract_1(<4 x i8> %a) #0 { 41; CHECK-LABEL: test_extract_1( 42; CHECK: { 43; CHECK-NEXT: .reg .b32 %r<3>; 44; CHECK-EMPTY: 45; CHECK-NEXT: // %bb.0: 46; CHECK-NEXT: ld.param.u32 %r1, [test_extract_1_param_0]; 47; CHECK-NEXT: bfe.u32 %r2, %r1, 8, 8; 48; CHECK-NEXT: st.param.b32 [func_retval0], %r2; 49; CHECK-NEXT: ret; 50 %e = extractelement <4 x i8> %a, i32 1 51 ret i8 %e 52} 53 54define i8 @test_extract_2(<4 x i8> %a) #0 { 55; CHECK-LABEL: test_extract_2( 56; CHECK: { 57; CHECK-NEXT: .reg .b32 %r<3>; 58; CHECK-EMPTY: 59; CHECK-NEXT: // %bb.0: 60; CHECK-NEXT: ld.param.u32 %r1, [test_extract_2_param_0]; 61; CHECK-NEXT: bfe.u32 %r2, %r1, 16, 8; 
62; CHECK-NEXT: st.param.b32 [func_retval0], %r2; 63; CHECK-NEXT: ret; 64 %e = extractelement <4 x i8> %a, i32 2 65 ret i8 %e 66} 67 68define i8 @test_extract_3(<4 x i8> %a) #0 { 69; CHECK-LABEL: test_extract_3( 70; CHECK: { 71; CHECK-NEXT: .reg .b32 %r<3>; 72; CHECK-EMPTY: 73; CHECK-NEXT: // %bb.0: 74; CHECK-NEXT: ld.param.u32 %r1, [test_extract_3_param_0]; 75; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; 76; CHECK-NEXT: st.param.b32 [func_retval0], %r2; 77; CHECK-NEXT: ret; 78 %e = extractelement <4 x i8> %a, i32 3 79 ret i8 %e 80} 81 82define i8 @test_extract_i(<4 x i8> %a, i64 %idx) #0 { 83; CHECK-LABEL: test_extract_i( 84; CHECK: { 85; CHECK-NEXT: .reg .b32 %r<5>; 86; CHECK-NEXT: .reg .b64 %rd<2>; 87; CHECK-EMPTY: 88; CHECK-NEXT: // %bb.0: 89; CHECK-NEXT: ld.param.u64 %rd1, [test_extract_i_param_1]; 90; CHECK-NEXT: ld.param.u32 %r1, [test_extract_i_param_0]; 91; CHECK-NEXT: cvt.u32.u64 %r2, %rd1; 92; CHECK-NEXT: shl.b32 %r3, %r2, 3; 93; CHECK-NEXT: bfe.u32 %r4, %r1, %r3, 8; 94; CHECK-NEXT: st.param.b32 [func_retval0], %r4; 95; CHECK-NEXT: ret; 96 %e = extractelement <4 x i8> %a, i64 %idx 97 ret i8 %e 98} 99 100define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 { 101; CHECK-LABEL: test_add( 102; CHECK: { 103; CHECK-NEXT: .reg .b16 %rs<13>; 104; CHECK-NEXT: .reg .b32 %r<18>; 105; CHECK-EMPTY: 106; CHECK-NEXT: // %bb.0: 107; CHECK-NEXT: ld.param.u32 %r2, [test_add_param_1]; 108; CHECK-NEXT: ld.param.u32 %r1, [test_add_param_0]; 109; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; 110; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; 111; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; 112; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; 113; CHECK-NEXT: add.s16 %rs3, %rs2, %rs1; 114; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; 115; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; 116; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; 117; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; 118; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; 119; CHECK-NEXT: add.s16 %rs6, %rs5, %rs4; 120; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; 121; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; 122; 
CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; 123; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; 124; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; 125; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; 126; CHECK-NEXT: add.s16 %rs9, %rs8, %rs7; 127; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; 128; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; 129; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; 130; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; 131; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; 132; CHECK-NEXT: add.s16 %rs12, %rs11, %rs10; 133; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; 134; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; 135; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; 136; CHECK-NEXT: st.param.b32 [func_retval0], %r17; 137; CHECK-NEXT: ret; 138 %r = add <4 x i8> %a, %b 139 ret <4 x i8> %r 140} 141 142define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 { 143; CHECK-LABEL: test_add_imm_0( 144; CHECK: { 145; CHECK-NEXT: .reg .b16 %rs<9>; 146; CHECK-NEXT: .reg .b32 %r<13>; 147; CHECK-EMPTY: 148; CHECK-NEXT: // %bb.0: 149; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0]; 150; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; 151; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; 152; CHECK-NEXT: add.s16 %rs2, %rs1, 4; 153; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; 154; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; 155; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; 156; CHECK-NEXT: add.s16 %rs4, %rs3, 3; 157; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; 158; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; 159; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; 160; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; 161; CHECK-NEXT: add.s16 %rs6, %rs5, 2; 162; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; 163; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; 164; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; 165; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 166; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; 167; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; 168; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; 169; CHECK-NEXT: st.param.b32 [func_retval0], %r12; 170; CHECK-NEXT: ret; 171 %r = add <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a 172 ret <4 x i8> %r 173} 174 175define <4 x i8> 
@test_add_imm_1(<4 x i8> %a) #0 { 176; CHECK-LABEL: test_add_imm_1( 177; CHECK: { 178; CHECK-NEXT: .reg .b16 %rs<9>; 179; CHECK-NEXT: .reg .b32 %r<13>; 180; CHECK-EMPTY: 181; CHECK-NEXT: // %bb.0: 182; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0]; 183; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; 184; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; 185; CHECK-NEXT: add.s16 %rs2, %rs1, 4; 186; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; 187; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; 188; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; 189; CHECK-NEXT: add.s16 %rs4, %rs3, 3; 190; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; 191; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; 192; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; 193; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; 194; CHECK-NEXT: add.s16 %rs6, %rs5, 2; 195; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; 196; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; 197; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; 198; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 199; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; 200; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; 201; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; 202; CHECK-NEXT: st.param.b32 [func_retval0], %r12; 203; CHECK-NEXT: ret; 204 %r = add <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4> 205 ret <4 x i8> %r 206} 207 208define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 { 209; CHECK-LABEL: test_sub( 210; CHECK: { 211; CHECK-NEXT: .reg .b16 %rs<13>; 212; CHECK-NEXT: .reg .b32 %r<18>; 213; CHECK-EMPTY: 214; CHECK-NEXT: // %bb.0: 215; CHECK-NEXT: ld.param.u32 %r2, [test_sub_param_1]; 216; CHECK-NEXT: ld.param.u32 %r1, [test_sub_param_0]; 217; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; 218; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; 219; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; 220; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; 221; CHECK-NEXT: sub.s16 %rs3, %rs2, %rs1; 222; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; 223; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; 224; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; 225; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; 226; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; 227; CHECK-NEXT: sub.s16 %rs6, %rs5, %rs4; 228; 
CHECK-NEXT: cvt.u32.u16 %r8, %rs6; 229; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; 230; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; 231; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; 232; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; 233; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; 234; CHECK-NEXT: sub.s16 %rs9, %rs8, %rs7; 235; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; 236; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; 237; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; 238; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; 239; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; 240; CHECK-NEXT: sub.s16 %rs12, %rs11, %rs10; 241; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; 242; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; 243; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; 244; CHECK-NEXT: st.param.b32 [func_retval0], %r17; 245; CHECK-NEXT: ret; 246 %r = sub <4 x i8> %a, %b 247 ret <4 x i8> %r 248} 249 250define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 { 251; CHECK-LABEL: test_smax( 252; CHECK: { 253; CHECK-NEXT: .reg .pred %p<5>; 254; CHECK-NEXT: .reg .b32 %r<26>; 255; CHECK-EMPTY: 256; CHECK-NEXT: // %bb.0: 257; CHECK-NEXT: ld.param.u32 %r2, [test_smax_param_1]; 258; CHECK-NEXT: ld.param.u32 %r1, [test_smax_param_0]; 259; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; 260; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; 261; CHECK-NEXT: setp.gt.s32 %p1, %r4, %r3; 262; CHECK-NEXT: bfe.s32 %r5, %r2, 8, 8; 263; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; 264; CHECK-NEXT: setp.gt.s32 %p2, %r6, %r5; 265; CHECK-NEXT: bfe.s32 %r7, %r2, 16, 8; 266; CHECK-NEXT: bfe.s32 %r8, %r1, 16, 8; 267; CHECK-NEXT: setp.gt.s32 %p3, %r8, %r7; 268; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8; 269; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; 270; CHECK-NEXT: setp.gt.s32 %p4, %r10, %r9; 271; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; 272; CHECK-NEXT: bfe.u32 %r12, %r1, 8, 8; 273; CHECK-NEXT: bfe.u32 %r13, %r1, 16, 8; 274; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; 275; CHECK-NEXT: bfe.u32 %r15, %r2, 24, 8; 276; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4; 277; CHECK-NEXT: bfe.u32 %r17, %r2, 16, 8; 278; CHECK-NEXT: selp.b32 
%r18, %r13, %r17, %p3; 279; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U; 280; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; 281; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2; 282; CHECK-NEXT: bfe.u32 %r22, %r2, 0, 8; 283; CHECK-NEXT: selp.b32 %r23, %r11, %r22, %p1; 284; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U; 285; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U; 286; CHECK-NEXT: st.param.b32 [func_retval0], %r25; 287; CHECK-NEXT: ret; 288 %cmp = icmp sgt <4 x i8> %a, %b 289 %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b 290 ret <4 x i8> %r 291} 292 293define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 { 294; CHECK-LABEL: test_umax( 295; CHECK: { 296; CHECK-NEXT: .reg .pred %p<5>; 297; CHECK-NEXT: .reg .b32 %r<18>; 298; CHECK-EMPTY: 299; CHECK-NEXT: // %bb.0: 300; CHECK-NEXT: ld.param.u32 %r2, [test_umax_param_1]; 301; CHECK-NEXT: ld.param.u32 %r1, [test_umax_param_0]; 302; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; 303; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; 304; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3; 305; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; 306; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; 307; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5; 308; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; 309; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; 310; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7; 311; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; 312; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; 313; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9; 314; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; 315; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; 316; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; 317; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; 318; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; 319; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; 320; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; 321; CHECK-NEXT: st.param.b32 [func_retval0], %r17; 322; CHECK-NEXT: ret; 323 %cmp = icmp ugt <4 x i8> %a, %b 324 %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b 325 ret <4 x i8> %r 326} 327 328define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) 
#0 { 329; CHECK-LABEL: test_smin( 330; CHECK: { 331; CHECK-NEXT: .reg .pred %p<5>; 332; CHECK-NEXT: .reg .b32 %r<26>; 333; CHECK-EMPTY: 334; CHECK-NEXT: // %bb.0: 335; CHECK-NEXT: ld.param.u32 %r2, [test_smin_param_1]; 336; CHECK-NEXT: ld.param.u32 %r1, [test_smin_param_0]; 337; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; 338; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; 339; CHECK-NEXT: setp.le.s32 %p1, %r4, %r3; 340; CHECK-NEXT: bfe.s32 %r5, %r2, 8, 8; 341; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; 342; CHECK-NEXT: setp.le.s32 %p2, %r6, %r5; 343; CHECK-NEXT: bfe.s32 %r7, %r2, 16, 8; 344; CHECK-NEXT: bfe.s32 %r8, %r1, 16, 8; 345; CHECK-NEXT: setp.le.s32 %p3, %r8, %r7; 346; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8; 347; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; 348; CHECK-NEXT: setp.le.s32 %p4, %r10, %r9; 349; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; 350; CHECK-NEXT: bfe.u32 %r12, %r1, 8, 8; 351; CHECK-NEXT: bfe.u32 %r13, %r1, 16, 8; 352; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; 353; CHECK-NEXT: bfe.u32 %r15, %r2, 24, 8; 354; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4; 355; CHECK-NEXT: bfe.u32 %r17, %r2, 16, 8; 356; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3; 357; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U; 358; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; 359; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2; 360; CHECK-NEXT: bfe.u32 %r22, %r2, 0, 8; 361; CHECK-NEXT: selp.b32 %r23, %r11, %r22, %p1; 362; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U; 363; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U; 364; CHECK-NEXT: st.param.b32 [func_retval0], %r25; 365; CHECK-NEXT: ret; 366 %cmp = icmp sle <4 x i8> %a, %b 367 %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b 368 ret <4 x i8> %r 369} 370 371define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 { 372; CHECK-LABEL: test_umin( 373; CHECK: { 374; CHECK-NEXT: .reg .pred %p<5>; 375; CHECK-NEXT: .reg .b32 %r<18>; 376; CHECK-EMPTY: 377; CHECK-NEXT: // %bb.0: 378; CHECK-NEXT: ld.param.u32 %r2, [test_umin_param_1]; 379; CHECK-NEXT: ld.param.u32 %r1, 
[test_umin_param_0]; 380; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; 381; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; 382; CHECK-NEXT: setp.ls.u32 %p1, %r4, %r3; 383; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; 384; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; 385; CHECK-NEXT: setp.ls.u32 %p2, %r6, %r5; 386; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; 387; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; 388; CHECK-NEXT: setp.ls.u32 %p3, %r8, %r7; 389; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; 390; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; 391; CHECK-NEXT: setp.ls.u32 %p4, %r10, %r9; 392; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; 393; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; 394; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; 395; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; 396; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; 397; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; 398; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; 399; CHECK-NEXT: st.param.b32 [func_retval0], %r17; 400; CHECK-NEXT: ret; 401 %cmp = icmp ule <4 x i8> %a, %b 402 %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b 403 ret <4 x i8> %r 404} 405 406define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { 407; CHECK-LABEL: test_eq( 408; CHECK: { 409; CHECK-NEXT: .reg .pred %p<5>; 410; CHECK-NEXT: .reg .b32 %r<23>; 411; CHECK-EMPTY: 412; CHECK-NEXT: // %bb.0: 413; CHECK-NEXT: ld.param.u32 %r3, [test_eq_param_2]; 414; CHECK-NEXT: ld.param.u32 %r2, [test_eq_param_1]; 415; CHECK-NEXT: ld.param.u32 %r1, [test_eq_param_0]; 416; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8; 417; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; 418; CHECK-NEXT: setp.eq.u32 %p1, %r5, %r4; 419; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; 420; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; 421; CHECK-NEXT: setp.eq.u32 %p2, %r7, %r6; 422; CHECK-NEXT: bfe.u32 %r8, %r2, 16, 8; 423; CHECK-NEXT: bfe.u32 %r9, %r1, 16, 8; 424; CHECK-NEXT: setp.eq.u32 %p3, %r9, %r8; 425; CHECK-NEXT: bfe.u32 %r10, %r2, 24, 8; 426; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; 427; CHECK-NEXT: setp.eq.u32 %p4, %r11, %r10; 428; CHECK-NEXT: 
bfe.u32 %r12, %r3, 24, 8; 429; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; 430; CHECK-NEXT: bfe.u32 %r14, %r3, 16, 8; 431; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; 432; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 0x3340U; 433; CHECK-NEXT: bfe.u32 %r17, %r3, 8, 8; 434; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; 435; CHECK-NEXT: bfe.u32 %r19, %r3, 0, 8; 436; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1; 437; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 0x3340U; 438; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 0x5410U; 439; CHECK-NEXT: st.param.b32 [func_retval0], %r22; 440; CHECK-NEXT: ret; 441 %cmp = icmp eq <4 x i8> %a, %b 442 %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %c 443 ret <4 x i8> %r 444} 445 446define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { 447; CHECK-LABEL: test_ne( 448; CHECK: { 449; CHECK-NEXT: .reg .pred %p<5>; 450; CHECK-NEXT: .reg .b32 %r<23>; 451; CHECK-EMPTY: 452; CHECK-NEXT: // %bb.0: 453; CHECK-NEXT: ld.param.u32 %r3, [test_ne_param_2]; 454; CHECK-NEXT: ld.param.u32 %r2, [test_ne_param_1]; 455; CHECK-NEXT: ld.param.u32 %r1, [test_ne_param_0]; 456; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8; 457; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; 458; CHECK-NEXT: setp.ne.u32 %p1, %r5, %r4; 459; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; 460; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; 461; CHECK-NEXT: setp.ne.u32 %p2, %r7, %r6; 462; CHECK-NEXT: bfe.u32 %r8, %r2, 16, 8; 463; CHECK-NEXT: bfe.u32 %r9, %r1, 16, 8; 464; CHECK-NEXT: setp.ne.u32 %p3, %r9, %r8; 465; CHECK-NEXT: bfe.u32 %r10, %r2, 24, 8; 466; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; 467; CHECK-NEXT: setp.ne.u32 %p4, %r11, %r10; 468; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; 469; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; 470; CHECK-NEXT: bfe.u32 %r14, %r3, 16, 8; 471; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; 472; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 0x3340U; 473; CHECK-NEXT: bfe.u32 %r17, %r3, 8, 8; 474; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; 475; CHECK-NEXT: bfe.u32 %r19, %r3, 0, 8; 476; CHECK-NEXT: selp.b32 
%r20, %r5, %r19, %p1; 477; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 0x3340U; 478; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 0x5410U; 479; CHECK-NEXT: st.param.b32 [func_retval0], %r22; 480; CHECK-NEXT: ret; 481 %cmp = icmp ne <4 x i8> %a, %b 482 %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %c 483 ret <4 x i8> %r 484} 485 486define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 { 487; CHECK-LABEL: test_mul( 488; CHECK: { 489; CHECK-NEXT: .reg .b16 %rs<13>; 490; CHECK-NEXT: .reg .b32 %r<18>; 491; CHECK-EMPTY: 492; CHECK-NEXT: // %bb.0: 493; CHECK-NEXT: ld.param.u32 %r2, [test_mul_param_1]; 494; CHECK-NEXT: ld.param.u32 %r1, [test_mul_param_0]; 495; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; 496; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; 497; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; 498; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; 499; CHECK-NEXT: mul.lo.s16 %rs3, %rs2, %rs1; 500; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; 501; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; 502; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; 503; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; 504; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; 505; CHECK-NEXT: mul.lo.s16 %rs6, %rs5, %rs4; 506; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; 507; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; 508; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; 509; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; 510; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; 511; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; 512; CHECK-NEXT: mul.lo.s16 %rs9, %rs8, %rs7; 513; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; 514; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; 515; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; 516; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; 517; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; 518; CHECK-NEXT: mul.lo.s16 %rs12, %rs11, %rs10; 519; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; 520; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; 521; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; 522; CHECK-NEXT: st.param.b32 [func_retval0], %r17; 523; CHECK-NEXT: ret; 524 %r = mul <4 x i8> %a, %b 525 ret <4 x i8> %r 526} 527 528define <4 x i8> @test_or(<4 x i8> %a, <4 x i8> 
%b) #0 { 529; CHECK-LABEL: test_or( 530; CHECK: { 531; CHECK-NEXT: .reg .b32 %r<4>; 532; CHECK-EMPTY: 533; CHECK-NEXT: // %bb.0: 534; CHECK-NEXT: ld.param.u32 %r2, [test_or_param_1]; 535; CHECK-NEXT: ld.param.u32 %r1, [test_or_param_0]; 536; CHECK-NEXT: or.b32 %r3, %r1, %r2; 537; CHECK-NEXT: st.param.b32 [func_retval0], %r3; 538; CHECK-NEXT: ret; 539 %r = or <4 x i8> %a, %b 540 ret <4 x i8> %r 541} 542 543define <4 x i8> @test_or_computed(i8 %a) { 544; CHECK-LABEL: test_or_computed( 545; CHECK: { 546; CHECK-NEXT: .reg .b16 %rs<2>; 547; CHECK-NEXT: .reg .b32 %r<8>; 548; CHECK-EMPTY: 549; CHECK-NEXT: // %bb.0: 550; CHECK-NEXT: ld.param.u8 %rs1, [test_or_computed_param_0]; 551; CHECK-NEXT: mov.b32 %r1, 0; 552; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; 553; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; 554; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; 555; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; 556; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; 557; CHECK-NEXT: or.b32 %r7, %r6, %r5; 558; CHECK-NEXT: st.param.b32 [func_retval0], %r7; 559; CHECK-NEXT: ret; 560 %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 561 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 562 %r = or <4 x i8> %ins.1, %ins.0 563 ret <4 x i8> %r 564} 565 566define <4 x i8> @test_or_imm_0(<4 x i8> %a) #0 { 567; CHECK-LABEL: test_or_imm_0( 568; CHECK: { 569; CHECK-NEXT: .reg .b32 %r<3>; 570; CHECK-EMPTY: 571; CHECK-NEXT: // %bb.0: 572; CHECK-NEXT: ld.param.u32 %r1, [test_or_imm_0_param_0]; 573; CHECK-NEXT: or.b32 %r2, %r1, 67305985; 574; CHECK-NEXT: st.param.b32 [func_retval0], %r2; 575; CHECK-NEXT: ret; 576 %r = or <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a 577 ret <4 x i8> %r 578} 579 580define <4 x i8> @test_or_imm_1(<4 x i8> %a) #0 { 581; CHECK-LABEL: test_or_imm_1( 582; CHECK: { 583; CHECK-NEXT: .reg .b32 %r<3>; 584; CHECK-EMPTY: 585; CHECK-NEXT: // %bb.0: 586; CHECK-NEXT: ld.param.u32 %r1, [test_or_imm_1_param_0]; 587; CHECK-NEXT: or.b32 %r2, %r1, 67305985; 588; CHECK-NEXT: st.param.b32 
[func_retval0], %r2; 589; CHECK-NEXT: ret; 590 %r = or <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4> 591 ret <4 x i8> %r 592} 593 594define <4 x i8> @test_xor(<4 x i8> %a, <4 x i8> %b) #0 { 595; CHECK-LABEL: test_xor( 596; CHECK: { 597; CHECK-NEXT: .reg .b32 %r<4>; 598; CHECK-EMPTY: 599; CHECK-NEXT: // %bb.0: 600; CHECK-NEXT: ld.param.u32 %r2, [test_xor_param_1]; 601; CHECK-NEXT: ld.param.u32 %r1, [test_xor_param_0]; 602; CHECK-NEXT: xor.b32 %r3, %r1, %r2; 603; CHECK-NEXT: st.param.b32 [func_retval0], %r3; 604; CHECK-NEXT: ret; 605 %r = xor <4 x i8> %a, %b 606 ret <4 x i8> %r 607} 608 609define <4 x i8> @test_xor_computed(i8 %a) { 610; CHECK-LABEL: test_xor_computed( 611; CHECK: { 612; CHECK-NEXT: .reg .b16 %rs<2>; 613; CHECK-NEXT: .reg .b32 %r<8>; 614; CHECK-EMPTY: 615; CHECK-NEXT: // %bb.0: 616; CHECK-NEXT: ld.param.u8 %rs1, [test_xor_computed_param_0]; 617; CHECK-NEXT: mov.b32 %r1, 0; 618; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; 619; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; 620; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; 621; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; 622; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; 623; CHECK-NEXT: xor.b32 %r7, %r6, %r5; 624; CHECK-NEXT: st.param.b32 [func_retval0], %r7; 625; CHECK-NEXT: ret; 626 %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 627 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 628 %r = xor <4 x i8> %ins.1, %ins.0 629 ret <4 x i8> %r 630} 631 632define <4 x i8> @test_xor_imm_0(<4 x i8> %a) #0 { 633; CHECK-LABEL: test_xor_imm_0( 634; CHECK: { 635; CHECK-NEXT: .reg .b32 %r<3>; 636; CHECK-EMPTY: 637; CHECK-NEXT: // %bb.0: 638; CHECK-NEXT: ld.param.u32 %r1, [test_xor_imm_0_param_0]; 639; CHECK-NEXT: xor.b32 %r2, %r1, 67305985; 640; CHECK-NEXT: st.param.b32 [func_retval0], %r2; 641; CHECK-NEXT: ret; 642 %r = xor <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a 643 ret <4 x i8> %r 644} 645 646define <4 x i8> @test_xor_imm_1(<4 x i8> %a) #0 { 647; CHECK-LABEL: test_xor_imm_1( 648; CHECK: { 649; CHECK-NEXT: .reg .b32 %r<3>; 
650; CHECK-EMPTY: 651; CHECK-NEXT: // %bb.0: 652; CHECK-NEXT: ld.param.u32 %r1, [test_xor_imm_1_param_0]; 653; CHECK-NEXT: xor.b32 %r2, %r1, 67305985; 654; CHECK-NEXT: st.param.b32 [func_retval0], %r2; 655; CHECK-NEXT: ret; 656 %r = xor <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4> 657 ret <4 x i8> %r 658} 659 660define <4 x i8> @test_and(<4 x i8> %a, <4 x i8> %b) #0 { 661; CHECK-LABEL: test_and( 662; CHECK: { 663; CHECK-NEXT: .reg .b32 %r<4>; 664; CHECK-EMPTY: 665; CHECK-NEXT: // %bb.0: 666; CHECK-NEXT: ld.param.u32 %r2, [test_and_param_1]; 667; CHECK-NEXT: ld.param.u32 %r1, [test_and_param_0]; 668; CHECK-NEXT: and.b32 %r3, %r1, %r2; 669; CHECK-NEXT: st.param.b32 [func_retval0], %r3; 670; CHECK-NEXT: ret; 671 %r = and <4 x i8> %a, %b 672 ret <4 x i8> %r 673} 674 675define <4 x i8> @test_and_computed(i8 %a) { 676; CHECK-LABEL: test_and_computed( 677; CHECK: { 678; CHECK-NEXT: .reg .b16 %rs<2>; 679; CHECK-NEXT: .reg .b32 %r<8>; 680; CHECK-EMPTY: 681; CHECK-NEXT: // %bb.0: 682; CHECK-NEXT: ld.param.u8 %rs1, [test_and_computed_param_0]; 683; CHECK-NEXT: mov.b32 %r1, 0; 684; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; 685; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; 686; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; 687; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; 688; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; 689; CHECK-NEXT: and.b32 %r7, %r6, %r5; 690; CHECK-NEXT: st.param.b32 [func_retval0], %r7; 691; CHECK-NEXT: ret; 692 %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 693 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 694 %r = and <4 x i8> %ins.1, %ins.0 695 ret <4 x i8> %r 696} 697 698define <4 x i8> @test_and_imm_0(<4 x i8> %a) #0 { 699; CHECK-LABEL: test_and_imm_0( 700; CHECK: { 701; CHECK-NEXT: .reg .b32 %r<3>; 702; CHECK-EMPTY: 703; CHECK-NEXT: // %bb.0: 704; CHECK-NEXT: ld.param.u32 %r1, [test_and_imm_0_param_0]; 705; CHECK-NEXT: and.b32 %r2, %r1, 67305985; 706; CHECK-NEXT: st.param.b32 [func_retval0], %r2; 707; CHECK-NEXT: ret; 708 %r = and <4 x i8> <i8 1, i8 2, 
i8 3, i8 4>, %a 709 ret <4 x i8> %r 710} 711 712define <4 x i8> @test_and_imm_1(<4 x i8> %a) #0 { 713; CHECK-LABEL: test_and_imm_1( 714; CHECK: { 715; CHECK-NEXT: .reg .b32 %r<3>; 716; CHECK-EMPTY: 717; CHECK-NEXT: // %bb.0: 718; CHECK-NEXT: ld.param.u32 %r1, [test_and_imm_1_param_0]; 719; CHECK-NEXT: and.b32 %r2, %r1, 67305985; 720; CHECK-NEXT: st.param.b32 [func_retval0], %r2; 721; CHECK-NEXT: ret; 722 %r = and <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4> 723 ret <4 x i8> %r 724} 725; NOTE(review): despite its name, @test_ldst_v2i8 loads and stores <4 x i8>, the same IR and the same checked PTX as @test_ldst_v4i8, so <2 x i8> load/store is not exercised here; confirm whether <2 x i8> was intended and regenerate the assertions if the IR is changed. 726define void @test_ldst_v2i8(ptr %a, ptr %b) { 727; CHECK-LABEL: test_ldst_v2i8( 728; CHECK: { 729; CHECK-NEXT: .reg .b32 %r<2>; 730; CHECK-NEXT: .reg .b64 %rd<3>; 731; CHECK-EMPTY: 732; CHECK-NEXT: // %bb.0: 733; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v2i8_param_1]; 734; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2i8_param_0]; 735; CHECK-NEXT: ld.u32 %r1, [%rd1]; 736; CHECK-NEXT: st.u32 [%rd2], %r1; 737; CHECK-NEXT: ret; 738 %t1 = load <4 x i8>, ptr %a 739 store <4 x i8> %t1, ptr %b, align 16 740 ret void 741} 742 743define void @test_ldst_v3i8(ptr %a, ptr %b) { 744; CHECK-LABEL: test_ldst_v3i8( 745; CHECK: { 746; CHECK-NEXT: .reg .b32 %r<3>; 747; CHECK-NEXT: .reg .b64 %rd<3>; 748; CHECK-EMPTY: 749; CHECK-NEXT: // %bb.0: 750; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v3i8_param_1]; 751; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3i8_param_0]; 752; CHECK-NEXT: ld.u32 %r1, [%rd1]; 753; CHECK-NEXT: st.u16 [%rd2], %r1; 754; CHECK-NEXT: bfe.u32 %r2, %r1, 16, 8; 755; CHECK-NEXT: st.u8 [%rd2+2], %r2; 756; CHECK-NEXT: ret; 757 %t1 = load <3 x i8>, ptr %a 758 store <3 x i8> %t1, ptr %b, align 16 759 ret void 760} 761 762define void @test_ldst_v4i8(ptr %a, ptr %b) { 763; CHECK-LABEL: test_ldst_v4i8( 764; CHECK: { 765; CHECK-NEXT: .reg .b32 %r<2>; 766; CHECK-NEXT: .reg .b64 %rd<3>; 767; CHECK-EMPTY: 768; CHECK-NEXT: // %bb.0: 769; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4i8_param_1]; 770; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4i8_param_0]; 771; CHECK-NEXT: ld.u32 %r1, [%rd1]; 772; 
CHECK-NEXT: st.u32 [%rd2], %r1; 773; CHECK-NEXT: ret; 774 %t1 = load <4 x i8>, ptr %a 775 store <4 x i8> %t1, ptr %b, align 16 776 ret void 777} 778 779define void @test_ldst_v4i8_unaligned(ptr %a, ptr %b) { 780; CHECK-LABEL: test_ldst_v4i8_unaligned( 781; CHECK: { 782; CHECK-NEXT: .reg .b32 %r<5>; 783; CHECK-NEXT: .reg .b64 %rd<3>; 784; CHECK-EMPTY: 785; CHECK-NEXT: // %bb.0: 786; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4i8_unaligned_param_1]; 787; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4i8_unaligned_param_0]; 788; CHECK-NEXT: ld.u8 %r1, [%rd1]; 789; CHECK-NEXT: ld.u8 %r2, [%rd1+1]; 790; CHECK-NEXT: ld.u8 %r3, [%rd1+2]; 791; CHECK-NEXT: ld.u8 %r4, [%rd1+3]; 792; CHECK-NEXT: st.u8 [%rd2+3], %r4; 793; CHECK-NEXT: st.u8 [%rd2+2], %r3; 794; CHECK-NEXT: st.u8 [%rd2+1], %r2; 795; CHECK-NEXT: st.u8 [%rd2], %r1; 796; CHECK-NEXT: ret; 797 %t1 = load <4 x i8>, ptr %a, align 1 798 store <4 x i8> %t1, ptr %b, align 1 799 ret void 800} 801 802 803define void @test_ldst_v8i8(ptr %a, ptr %b) { 804; CHECK-LABEL: test_ldst_v8i8( 805; CHECK: { 806; CHECK-NEXT: .reg .b32 %r<3>; 807; CHECK-NEXT: .reg .b64 %rd<3>; 808; CHECK-EMPTY: 809; CHECK-NEXT: // %bb.0: 810; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v8i8_param_1]; 811; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v8i8_param_0]; 812; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1]; 813; CHECK-NEXT: st.v2.b32 [%rd2], {%r1, %r2}; 814; CHECK-NEXT: ret; 815 %t1 = load <8 x i8>, ptr %a 816 store <8 x i8> %t1, ptr %b, align 16 817 ret void 818} 819 820declare <4 x i8> @test_callee(<4 x i8> %a, <4 x i8> %b) #0 821 822define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 { 823; CHECK-LABEL: test_call( 824; CHECK: { 825; CHECK-NEXT: .reg .b32 %r<5>; 826; CHECK-EMPTY: 827; CHECK-NEXT: // %bb.0: 828; CHECK-NEXT: ld.param.u32 %r2, [test_call_param_1]; 829; CHECK-NEXT: ld.param.u32 %r1, [test_call_param_0]; 830; CHECK-NEXT: { // callseq 0, 0 831; CHECK-NEXT: .param .align 4 .b8 param0[4]; 832; CHECK-NEXT: st.param.b32 [param0], %r1; 833; 
; CHECK-NEXT:    .param .align 4 .b8 param1[4];
; CHECK-NEXT:    st.param.b32 [param1], %r2;
; CHECK-NEXT:    .param .align 4 .b8 retval0[4];
; CHECK-NEXT:    call.uni (retval0),
; CHECK-NEXT:    test_callee,
; CHECK-NEXT:    (
; CHECK-NEXT:    param0,
; CHECK-NEXT:    param1
; CHECK-NEXT:    );
; CHECK-NEXT:    ld.param.b32 %r3, [retval0];
; CHECK-NEXT:    } // callseq 0
; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
; CHECK-NEXT:    ret;
  %r = call <4 x i8> @test_callee(<4 x i8> %a, <4 x i8> %b)
  ret <4 x i8> %r
}

;; Calls @test_callee with the two <4 x i8> arguments swapped (%b first, then
;; %a) and returns the callee's result unchanged.
define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_call_flipped(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.u32 %r2, [test_call_flipped_param_1];
; CHECK-NEXT:    ld.param.u32 %r1, [test_call_flipped_param_0];
; CHECK-NEXT:    { // callseq 1, 0
; CHECK-NEXT:    .param .align 4 .b8 param0[4];
; CHECK-NEXT:    st.param.b32 [param0], %r2;
; CHECK-NEXT:    .param .align 4 .b8 param1[4];
; CHECK-NEXT:    st.param.b32 [param1], %r1;
; CHECK-NEXT:    .param .align 4 .b8 retval0[4];
; CHECK-NEXT:    call.uni (retval0),
; CHECK-NEXT:    test_callee,
; CHECK-NEXT:    (
; CHECK-NEXT:    param0,
; CHECK-NEXT:    param1
; CHECK-NEXT:    );
; CHECK-NEXT:    ld.param.b32 %r3, [retval0];
; CHECK-NEXT:    } // callseq 1
; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
; CHECK-NEXT:    ret;
  %r = call <4 x i8> @test_callee(<4 x i8> %b, <4 x i8> %a)
  ret <4 x i8> %r
}

;; Same swapped-argument call as test_call_flipped, but the call is marked
;; `tail`. The expected PTX is still an ordinary call sequence.
define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_tailcall_flipped(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.u32 %r2, [test_tailcall_flipped_param_1];
; CHECK-NEXT:    ld.param.u32 %r1, [test_tailcall_flipped_param_0];
; CHECK-NEXT:    { // callseq 2, 0
; CHECK-NEXT:    .param .align 4 .b8 param0[4];
; CHECK-NEXT:    st.param.b32 [param0], %r2;
; CHECK-NEXT:    .param .align 4 .b8 param1[4];
; CHECK-NEXT:    st.param.b32 [param1], %r1;
; CHECK-NEXT:    .param .align 4 .b8 retval0[4];
; CHECK-NEXT:    call.uni (retval0),
; CHECK-NEXT:    test_callee,
; CHECK-NEXT:    (
; CHECK-NEXT:    param0,
; CHECK-NEXT:    param1
; CHECK-NEXT:    );
; CHECK-NEXT:    ld.param.b32 %r3, [retval0];
; CHECK-NEXT:    } // callseq 2
; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
; CHECK-NEXT:    ret;
  %r = tail call <4 x i8> @test_callee(<4 x i8> %b, <4 x i8> %a)
  ret <4 x i8> %r
}

;; Whole-vector select on a scalar i1 condition. The expected PTX picks between
;; the two packed 32-bit values with a single selp.b32.
define <4 x i8> @test_select(<4 x i8> %a, <4 x i8> %b, i1 zeroext %c) #0 {
; CHECK-LABEL: test_select(
; CHECK:       {
; CHECK-NEXT:    .reg .pred %p<2>;
; CHECK-NEXT:    .reg .b16 %rs<3>;
; CHECK-NEXT:    .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.u8 %rs1, [test_select_param_2];
; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
; CHECK-NEXT:    setp.eq.b16 %p1, %rs2, 1;
; CHECK-NEXT:    ld.param.u32 %r2, [test_select_param_1];
; CHECK-NEXT:    ld.param.u32 %r1, [test_select_param_0];
; CHECK-NEXT:    selp.b32 %r3, %r1, %r2, %p1;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
; CHECK-NEXT:    ret;
  %r = select i1 %c, <4 x i8> %a, <4 x i8> %b
  ret <4 x i8> %r
}

;; Per-lane select: lanes of %a or %b are chosen by the lane-wise result of
;; `icmp ne` on %c and %d (all operands are <4 x i8>).
define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) #0 {
; CHECK-LABEL: test_select_cc(
; CHECK:       {
; CHECK-NEXT:    .reg .pred %p<5>;
; CHECK-NEXT:    .reg .b32 %r<28>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.u32 %r4, [test_select_cc_param_3];
; CHECK-NEXT:    ld.param.u32 %r3, [test_select_cc_param_2];
; CHECK-NEXT:    ld.param.u32 %r2, [test_select_cc_param_1];
; CHECK-NEXT:    ld.param.u32 %r1, [test_select_cc_param_0];
; CHECK-NEXT:    bfe.u32 %r5, %r4, 0, 8;
; CHECK-NEXT:    bfe.u32 %r6, %r3, 0, 8;
; CHECK-NEXT:    setp.ne.u32 %p1, %r6, %r5;
; CHECK-NEXT:    bfe.u32 %r7, %r4, 8, 8;
; CHECK-NEXT:    bfe.u32 %r8, %r3, 8, 8;
; CHECK-NEXT:    setp.ne.u32 %p2, %r8, %r7;
; CHECK-NEXT:    bfe.u32 %r9, %r4, 16, 8;
; CHECK-NEXT:    bfe.u32 %r10, %r3, 16, 8;
; CHECK-NEXT:    setp.ne.u32 %p3, %r10, %r9;
; CHECK-NEXT:    bfe.u32 %r11, %r4, 24, 8;
; CHECK-NEXT:    bfe.u32 %r12, %r3, 24, 8;
; CHECK-NEXT:    setp.ne.u32 %p4, %r12, %r11;
; CHECK-NEXT:    bfe.u32 %r13, %r2, 24, 8;
; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
; CHECK-NEXT:    selp.b32 %r15, %r14, %r13, %p4;
; CHECK-NEXT:    bfe.u32 %r16, %r2, 16, 8;
; CHECK-NEXT:    bfe.u32 %r17, %r1, 16, 8;
; CHECK-NEXT:    selp.b32 %r18, %r17, %r16, %p3;
; CHECK-NEXT:    prmt.b32 %r19, %r18, %r15, 0x3340U;
; CHECK-NEXT:    bfe.u32 %r20, %r2, 8, 8;
; CHECK-NEXT:    bfe.u32 %r21, %r1, 8, 8;
; CHECK-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
; CHECK-NEXT:    bfe.u32 %r23, %r2, 0, 8;
; CHECK-NEXT:    bfe.u32 %r24, %r1, 0, 8;
; CHECK-NEXT:    selp.b32 %r25, %r24, %r23, %p1;
; CHECK-NEXT:    prmt.b32 %r26, %r25, %r22, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r27, %r26, %r19, 0x5410U;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r27;
; CHECK-NEXT:    ret;
  %cc = icmp ne <4 x i8> %c, %d
  %r = select <4 x i1> %cc, <4 x i8> %a, <4 x i8> %b
  ret <4 x i8> %r
}

;; Mixed-width per-lane select: the condition comes from an `icmp ne` on
;; <4 x i8> operands, while the selected values are <4 x i32>.
define <4 x i32> @test_select_cc_i32_i8(<4 x i32> %a, <4 x i32> %b,
; CHECK-LABEL: test_select_cc_i32_i8(
; CHECK:       {
; CHECK-NEXT:    .reg .pred %p<5>;
; CHECK-NEXT:    .reg .b32 %r<23>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [test_select_cc_i32_i8_param_1];
; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0];
; CHECK-NEXT:    ld.param.u32 %r10, [test_select_cc_i32_i8_param_3];
; CHECK-NEXT:    ld.param.u32 %r9, [test_select_cc_i32_i8_param_2];
; CHECK-NEXT:    bfe.u32 %r11, %r10, 0, 8;
; CHECK-NEXT:    bfe.u32 %r12, %r9, 0, 8;
; CHECK-NEXT:    setp.ne.u32 %p1, %r12, %r11;
; CHECK-NEXT:    bfe.u32 %r13, %r10, 8, 8;
; CHECK-NEXT:    bfe.u32 %r14, %r9, 8, 8;
; CHECK-NEXT:    setp.ne.u32 %p2, %r14, %r13;
; CHECK-NEXT:    bfe.u32 %r15, %r10, 16, 8;
; CHECK-NEXT:    bfe.u32 %r16, %r9, 16, 8;
; CHECK-NEXT:    setp.ne.u32 %p3, %r16, %r15;
; CHECK-NEXT:    bfe.u32 %r17, %r10, 24, 8;
; CHECK-NEXT:    bfe.u32 %r18, %r9, 24, 8;
; CHECK-NEXT:    setp.ne.u32 %p4, %r18, %r17;
; CHECK-NEXT:    selp.b32 %r19, %r4, %r8, %p4;
; CHECK-NEXT:    selp.b32 %r20, %r3, %r7, %p3;
; CHECK-NEXT:    selp.b32 %r21, %r2, %r6, %p2;
; CHECK-NEXT:    selp.b32 %r22, %r1, %r5, %p1;
; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r22, %r21, %r20, %r19};
; CHECK-NEXT:    ret;
                                           <4 x i8> %c, <4 x i8> %d) #0 {
  %cc = icmp ne <4 x i8> %c, %d
  %r = select <4 x i1> %cc, <4 x i32> %a, <4 x i32> %b
  ret <4 x i32> %r
}

;; Mixed-width per-lane select, the other way round: the condition comes from
;; an `icmp ne` on <4 x i32> operands, while the selected values are <4 x i8>.
define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b,
; CHECK-LABEL: test_select_cc_i8_i32(
; CHECK:       {
; CHECK-NEXT:    .reg .pred %p<5>;
; CHECK-NEXT:    .reg .b32 %r<26>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3];
; CHECK-NEXT:    ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2];
; CHECK-NEXT:    ld.param.u32 %r2, [test_select_cc_i8_i32_param_1];
; CHECK-NEXT:    ld.param.u32 %r1, [test_select_cc_i8_i32_param_0];
; CHECK-NEXT:    setp.ne.s32 %p1, %r3, %r7;
; CHECK-NEXT:    setp.ne.s32 %p2, %r4, %r8;
; CHECK-NEXT:    setp.ne.s32 %p3, %r5, %r9;
; CHECK-NEXT:    setp.ne.s32 %p4, %r6, %r10;
; CHECK-NEXT:    bfe.u32 %r11, %r2, 24, 8;
; CHECK-NEXT:    bfe.u32 %r12, %r1, 24, 8;
; CHECK-NEXT:    selp.b32 %r13, %r12, %r11, %p4;
; CHECK-NEXT:    bfe.u32 %r14, %r2, 16, 8;
; CHECK-NEXT:    bfe.u32 %r15, %r1, 16, 8;
; CHECK-NEXT:    selp.b32 %r16, %r15, %r14, %p3;
; CHECK-NEXT:    prmt.b32 %r17, %r16, %r13, 0x3340U;
; CHECK-NEXT:    bfe.u32 %r18, %r2, 8, 8;
; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
; CHECK-NEXT:    selp.b32 %r20, %r19, %r18, %p2;
; CHECK-NEXT:    bfe.u32 %r21, %r2, 0, 8;
; CHECK-NEXT:    bfe.u32 %r22, %r1, 0, 8;
; CHECK-NEXT:    selp.b32 %r23, %r22, %r21, %p1;
; CHECK-NEXT:    prmt.b32 %r24, %r23, %r20, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r25, %r24, %r17, 0x5410U;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r25;
; CHECK-NEXT:    ret;
                                          <4 x i32> %c, <4 x i32> %d) #0 {
  %cc = icmp ne <4 x i32> %c, %d
  %r = select <4 x i1> %cc, <4 x i8> %a, <4 x i8> %b
  ret <4 x i8> %r
}


;; Truncates <4 x i32> to <4 x i8>. Note: despite the "2x" in the name, the
;; operand has four lanes.
define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 {
; CHECK-LABEL: test_trunc_2xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<8>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_trunc_2xi32_param_0];
; CHECK-NEXT:    prmt.b32 %r5, %r3, %r4, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r6, %r1, %r2, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r7, %r6, %r5, 0x5410U;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
; CHECK-NEXT:    ret;
  %r = trunc <4 x i32> %a to <4 x i8>
  ret <4 x i8> %r
}

;; Truncates <4 x i64> to <4 x i8> (four lanes, despite the "2x" in the name).
define <4 x i8> @test_trunc_2xi64(<4 x i64> %a) #0 {
; CHECK-LABEL: test_trunc_2xi64(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<8>;
; CHECK-NEXT:    .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.v2.u64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16];
; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0];
; CHECK-NEXT:    cvt.u32.u64 %r1, %rd4;
; CHECK-NEXT:    cvt.u32.u64 %r2, %rd3;
; CHECK-NEXT:    prmt.b32 %r3, %r2, %r1, 0x3340U;
; CHECK-NEXT:    cvt.u32.u64 %r4, %rd2;
; CHECK-NEXT:    cvt.u32.u64 %r5, %rd1;
; CHECK-NEXT:    prmt.b32 %r6, %r5, %r4, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r7, %r6, %r3, 0x5410U;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
; CHECK-NEXT:    ret;
  %r = trunc <4 x i64> %a to <4 x i8>
  ret <4 x i8> %r
}

;; Zero-extends <4 x i8> to <4 x i32> (four lanes, despite the "2x" in the
;; name).
define <4 x i32> @test_zext_2xi32(<4 x i8> %a) #0 {
; CHECK-LABEL: test_zext_2xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.u32 %r1, [test_zext_2xi32_param_0];
; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
; CHECK-NEXT:    bfe.u32 %r3, %r1, 16, 8;
; CHECK-NEXT:    bfe.u32 %r4, %r1, 8, 8;
; CHECK-NEXT:    bfe.u32 %r5, %r1, 0, 8;
; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r5, %r4, %r3, %r2};
; CHECK-NEXT:    ret;
  %r = zext <4 x i8> %a to <4 x i32>
  ret <4 x i32> %r
}

;; Zero-extends <4 x i8> to <4 x i64> (four lanes, despite the "2x" in the
;; name).
define <4 x i64> @test_zext_2xi64(<4 x i8> %a) #0 {
; CHECK-LABEL: test_zext_2xi64(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<6>;
; CHECK-NEXT:    .reg .b64 %rd<9>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.u32 %r1, [test_zext_2xi64_param_0];
; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
; CHECK-NEXT:    cvt.u64.u32 %rd1, %r2;
; CHECK-NEXT:    and.b64 %rd2, %rd1, 255;
; CHECK-NEXT:    bfe.u32 %r3, %r1, 16, 8;
; CHECK-NEXT:    cvt.u64.u32 %rd3, %r3;
; CHECK-NEXT:    and.b64 %rd4, %rd3, 255;
; CHECK-NEXT:    bfe.u32 %r4, %r1, 8, 8;
; CHECK-NEXT:    cvt.u64.u32 %rd5, %r4;
; CHECK-NEXT:    and.b64 %rd6, %rd5, 255;
; CHECK-NEXT:    bfe.u32 %r5, %r1, 0, 8;
; CHECK-NEXT:    cvt.u64.u32 %rd7, %r5;
; CHECK-NEXT:    and.b64 %rd8, %rd7, 255;
; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd8, %rd6};
; CHECK-NEXT:    st.param.v2.b64 [func_retval0+16], {%rd4, %rd2};
; CHECK-NEXT:    ret;
  %r = zext <4 x i8> %a to <4 x i64>
  ret <4 x i64> %r
}

;; Bitcast i32 -> <4 x i8>. The expected PTX forwards the 32-bit value as is.
define <4 x i8> @test_bitcast_i32_to_4xi8(i32 %a) #0 {
; CHECK-LABEL: test_bitcast_i32_to_4xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.u32 %r1, [test_bitcast_i32_to_4xi8_param_0];
; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-NEXT:    ret;
  %r = bitcast i32 %a to <4 x i8>
  ret <4 x i8> %r
}

;; Bitcast float -> <4 x i8>. The expected PTX moves the value from an .f32
;; register to a .b32 register.
define <4 x i8> @test_bitcast_float_to_4xi8(float %a) #0 {
; CHECK-LABEL: test_bitcast_float_to_4xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .f32 %f<2>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.f32 %f1, [test_bitcast_float_to_4xi8_param_0];
; CHECK-NEXT:    mov.b32 %r1, %f1;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-NEXT:    ret;
  %r = bitcast float %a to <4 x i8>
  ret <4 x i8> %r
}

;; Bitcast <4 x i8> -> i32. The expected PTX forwards the 32-bit value as is.
define i32 @test_bitcast_4xi8_to_i32(<4 x i8> %a) #0 {
; CHECK-LABEL: test_bitcast_4xi8_to_i32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.u32 %r1, [test_bitcast_4xi8_to_i32_param_0];
; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-NEXT:    ret;
  %r = bitcast <4 x i8> %a to i32
  ret i32 %r
}

;; Bitcast <4 x i8> -> float. The expected PTX moves the value from a .b32
;; register to an .f32 register.
define float @test_bitcast_4xi8_to_float(<4 x i8> %a) #0 {
; CHECK-LABEL: test_bitcast_4xi8_to_float(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .f32 %f<2>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.u32 %r1, [test_bitcast_4xi8_to_float_param_0];
; CHECK-NEXT:    mov.b32 %f1, %r1;
; CHECK-NEXT:    st.param.f32 [func_retval0], %f1;
; CHECK-NEXT:    ret;
  %r = bitcast <4 x i8> %a to float
  ret float %r
}


;; Builds a <4 x i8> from the scalar argument %a (lane 0) and the constants
;; 5, 6 and 7 (lanes 1-3) with insertelement, then bitcasts it to <2 x half>.
define <2 x half> @test_bitcast_4xi8_to_2xhalf(i8 %a) #0 {
; CHECK-LABEL: test_bitcast_4xi8_to_2xhalf(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<2>;
; CHECK-NEXT:    .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.u8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0];
; CHECK-NEXT:    mov.b32 %r1, 6;
; CHECK-NEXT:    prmt.b32 %r2, %r1, 7, 0x3340U;
; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
; CHECK-NEXT:    prmt.b32 %r4, %r3, 5, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r5, %r4, %r2, 0x5410U;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
; CHECK-NEXT:    ret;
  %ins.0 = insertelement <4 x i8> undef, i8 %a, i32 0
  %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1
  %ins.2 = insertelement <4 x i8> %ins.1, i8 6, i32 2
  %ins.3 = insertelement <4 x i8> %ins.2, i8 7, i32 3
  %r = bitcast <4 x i8> %ins.3 to <2 x half>
  ret <2 x half> %r
}


;; Single-input shuffle that reverses the four lanes of %a (mask 3, 2, 1, 0).
;; The second shuffle operand is undef.
define <4 x i8> @test_shufflevector(<4 x i8> %a) #0 {
; CHECK-LABEL: test_shufflevector(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.u32 %r1, [test_shufflevector_param_0];
; CHECK-NEXT:    // implicit-def: %r3
; CHECK-NEXT:    prmt.b32 %r2, %r1, %r3, 0x123U;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
; CHECK-NEXT:    ret;
  %s = shufflevector <4 x i8> %a, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i8> %s
}

;; Two-input shuffle with mask 7, 3, 5, 2, mixing lanes of %a and %b. The
;; expected PTX is a single prmt.b32.
define <4 x i8> @test_shufflevector_2(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_shufflevector_2(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.u32 %r2, [test_shufflevector_2_param_1];
; CHECK-NEXT:    ld.param.u32 %r1, [test_shufflevector_2_param_0];
; CHECK-NEXT:    prmt.b32 %r3, %r1, %r2, 0x2537U;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
; CHECK-NEXT:    ret;
  %s = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 7, i32 3, i32 5, i32 2>
  ret <4 x i8> %s
}


;; Inserts the scalar %x into lane 1 of %a. The expected PTX uses bfi.b32 at
;; bit offset 8 with width 8.
define <4 x i8> @test_insertelement(<4 x i8> %a, i8 %x) #0 {
; CHECK-LABEL: test_insertelement(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<2>;
; CHECK-NEXT:    .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.u8 %rs1, [test_insertelement_param_1];
; CHECK-NEXT:    ld.param.u32 %r1, [test_insertelement_param_0];
; CHECK-NEXT:    cvt.u32.u16 %r2, %rs1;
; CHECK-NEXT:    bfi.b32 %r3, %r2, %r1, 8, 8;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
; CHECK-NEXT:    ret;
  %i = insertelement <4 x i8> %a, i8 %x, i64 1
  ret <4 x i8> %i
}

;; Signed float-to-int conversion: <4 x half> -> <4 x i8> via fptosi.
define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 {
; CHECK-LABEL: test_fptosi_4xhalf_to_4xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<13>;
; CHECK-NEXT:    .reg .b32 %r<12>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_fptosi_4xhalf_to_4xi8_param_0];
; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
; CHECK-NEXT:    cvt.rzi.s16.f16 %rs3, %rs2;
; CHECK-NEXT:    cvt.rzi.s16.f16 %rs4, %rs1;
; CHECK-NEXT:    mov.b32 %r3, {%rs4, %rs3};
; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
; CHECK-NEXT:    cvt.u32.u16 %r4, %rs6;
; CHECK-NEXT:    cvt.u32.u16 %r5, %rs5;
; CHECK-NEXT:    prmt.b32 %r6, %r5, %r4, 0x3340U;
; CHECK-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
; CHECK-NEXT:    cvt.rzi.s16.f16 %rs9, %rs8;
; CHECK-NEXT:    cvt.rzi.s16.f16 %rs10, %rs7;
; CHECK-NEXT:    mov.b32 %r7, {%rs10, %rs9};
; CHECK-NEXT:    mov.b32 {%rs11, %rs12}, %r7;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs12;
; CHECK-NEXT:    cvt.u32.u16 %r9, %rs11;
; CHECK-NEXT:    prmt.b32 %r10, %r9, %r8, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r11, %r10, %r6, 0x5410U;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r11;
; CHECK-NEXT:    ret;
  %r = fptosi <4 x half> %a to <4 x i8>
  ret <4 x i8> %r
}

;; Unsigned float-to-int conversion: <4 x half> -> <4 x i8> via fptoui.
define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 {
; CHECK-LABEL: test_fptoui_4xhalf_to_4xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<13>;
; CHECK-NEXT:    .reg .b32 %r<12>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0:
; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_fptoui_4xhalf_to_4xi8_param_0];
; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
; CHECK-NEXT:    cvt.rzi.u16.f16 %rs3, %rs2;
; CHECK-NEXT:    cvt.rzi.u16.f16 %rs4, %rs1;
; CHECK-NEXT:    mov.b32 %r3, {%rs4, %rs3};
; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
; CHECK-NEXT:    cvt.u32.u16 %r4, %rs6;
; CHECK-NEXT:    cvt.u32.u16 %r5, %rs5;
; CHECK-NEXT:    prmt.b32 %r6, %r5, %r4, 0x3340U;
; CHECK-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
; CHECK-NEXT:    cvt.rzi.u16.f16 %rs9, %rs8;
; CHECK-NEXT:    cvt.rzi.u16.f16 %rs10, %rs7;
; CHECK-NEXT:    mov.b32 %r7, {%rs10, %rs9};
; CHECK-NEXT:    mov.b32 {%rs11, %rs12}, %r7;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs12;
; CHECK-NEXT:    cvt.u32.u16 %r9, %rs11;
; CHECK-NEXT:    prmt.b32 %r10, %r9, %r8, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r11, %r10, %r6, 0x5410U;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r11;
; CHECK-NEXT:    ret;
  %r = fptoui <4 x half> %a to <4 x i8>
  ret <4 x i8> %r
}

;; Loads <4 x i8> from %a and %b (both align 4), computes the lane-wise signed
;; remainder, and stores the result through %c.
define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: test_srem_v4i8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<13>;
; CHECK-NEXT:    .reg .b32 %r<18>;
; CHECK-NEXT:    .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0: // %entry
; CHECK-NEXT:    ld.param.u64 %rd3, [test_srem_v4i8_param_2];
; CHECK-NEXT:    ld.param.u64 %rd2, [test_srem_v4i8_param_1];
; CHECK-NEXT:    ld.param.u64 %rd1, [test_srem_v4i8_param_0];
; CHECK-NEXT:    ld.u32 %r1, [%rd1];
; CHECK-NEXT:    ld.u32 %r2, [%rd2];
; CHECK-NEXT:    bfe.s32 %r3, %r2, 24, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs1, %r3;
; CHECK-NEXT:    bfe.s32 %r4, %r1, 24, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs2, %r4;
; CHECK-NEXT:    rem.s16 %rs3, %rs2, %rs1;
; CHECK-NEXT:    cvt.u32.u16 %r5, %rs3;
; CHECK-NEXT:    bfe.s32 %r6, %r2, 16, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs4, %r6;
; CHECK-NEXT:    bfe.s32 %r7, %r1, 16, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs5, %r7;
; CHECK-NEXT:    rem.s16 %rs6, %rs5, %rs4;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
; CHECK-NEXT:    prmt.b32 %r9, %r8, %r5, 0x3340U;
; CHECK-NEXT:    bfe.s32 %r10, %r2, 8, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs7, %r10;
; CHECK-NEXT:    bfe.s32 %r11, %r1, 8, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs8, %r11;
; CHECK-NEXT:    rem.s16 %rs9, %rs8, %rs7;
; CHECK-NEXT:    cvt.u32.u16 %r12, %rs9;
; CHECK-NEXT:    bfe.s32 %r13, %r2, 0, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs10, %r13;
; CHECK-NEXT:    bfe.s32 %r14, %r1, 0, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs11, %r14;
; CHECK-NEXT:    rem.s16 %rs12, %rs11, %rs10;
; CHECK-NEXT:    cvt.u32.u16 %r15, %rs12;
; CHECK-NEXT:    prmt.b32 %r16, %r15, %r12, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r17, %r16, %r9, 0x5410U;
; CHECK-NEXT:    st.u32 [%rd3], %r17;
; CHECK-NEXT:    ret;
entry:
  %t57 = load <4 x i8>, ptr %a, align 4
  %t59 = load <4 x i8>, ptr %b, align 4
  %x = srem <4 x i8> %t57, %t59
  store <4 x i8> %x, ptr %c, align 4
  ret void
}

;; v3i8 lowering, especially for unaligned loads, is terrible. We end up doing
;; tons of pointless scalar_to_vector/bitcast/extract_elt on v2i16/v4i8, which
;; is further complicated by LLVM trying to use i16 as an intermediate type,
;; because we don't have i8 registers. It's a mess.
;; Ideally we want to split it into element-wise ops, but the legalizer can't
;; handle odd-sized vectors. TL;DR: don't use odd-sized vectors of i8.
define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: test_srem_v3i8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<20>;
; CHECK-NEXT:    .reg .b32 %r<14>;
; CHECK-NEXT:    .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0: // %entry
; CHECK-NEXT:    ld.param.u64 %rd3, [test_srem_v3i8_param_2];
; CHECK-NEXT:    ld.param.u64 %rd2, [test_srem_v3i8_param_1];
; CHECK-NEXT:    ld.param.u64 %rd1, [test_srem_v3i8_param_0];
; CHECK-NEXT:    ld.u8 %rs1, [%rd1];
; CHECK-NEXT:    ld.u8 %rs2, [%rd1+1];
; CHECK-NEXT:    shl.b16 %rs3, %rs2, 8;
; CHECK-NEXT:    or.b16 %rs4, %rs3, %rs1;
; CHECK-NEXT:    cvt.u32.u16 %r1, %rs4;
; CHECK-NEXT:    ld.s8 %rs5, [%rd1+2];
; CHECK-NEXT:    ld.u8 %rs6, [%rd2];
; CHECK-NEXT:    ld.u8 %rs7, [%rd2+1];
; CHECK-NEXT:    shl.b16 %rs8, %rs7, 8;
; CHECK-NEXT:    or.b16 %rs9, %rs8, %rs6;
; CHECK-NEXT:    cvt.u32.u16 %r2, %rs9;
; CHECK-NEXT:    ld.s8 %rs10, [%rd2+2];
; CHECK-NEXT:    bfe.s32 %r3, %r2, 8, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs11, %r3;
; CHECK-NEXT:    bfe.s32 %r4, %r1, 8, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs12, %r4;
; CHECK-NEXT:    rem.s16 %rs13, %rs12, %rs11;
; CHECK-NEXT:    cvt.u32.u16 %r5, %rs13;
; CHECK-NEXT:    bfe.s32 %r6, %r2, 0, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs14, %r6;
; CHECK-NEXT:    bfe.s32 %r7, %r1, 0, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs15, %r7;
; CHECK-NEXT:    rem.s16 %rs16, %rs15, %rs14;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs16;
; CHECK-NEXT:    prmt.b32 %r9, %r8, %r5, 0x3340U;
; CHECK-NEXT:    // implicit-def: %r11
; CHECK-NEXT:    // implicit-def: %r12
; CHECK-NEXT:    prmt.b32 %r10, %r11, %r12, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r13, %r9, %r10, 0x5410U;
; CHECK-NEXT:    rem.s16 %rs17, %rs5, %rs10;
; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs18, tmp}, %r13; }
; CHECK-NEXT:    st.u8 [%rd3], %rs18;
; CHECK-NEXT:    shr.u16 %rs19, %rs18, 8;
; CHECK-NEXT:    st.u8 [%rd3+1], %rs19;
; CHECK-NEXT:    st.u8 [%rd3+2], %rs17;
; CHECK-NEXT:    ret;
entry:
  %t57 = load <3 x i8>, ptr %a, align 1
  %t59 = load <3 x i8>, ptr %b, align 1
  %x = srem <3 x i8> %t57, %t59
  store <3 x i8> %x, ptr %c, align 1
  ret void
}

;; Loads <4 x i8> from %a and %b, compares them lane-wise with `icmp ugt`,
;; sign-extends the <4 x i1> result to <4 x i8>, and stores it through %c.
define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: test_sext_v4i1_to_v4i8(
; CHECK:       {
; CHECK-NEXT:    .reg .pred %p<5>;
; CHECK-NEXT:    .reg .b32 %r<18>;
; CHECK-NEXT:    .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT:    // %bb.0: // %entry
; CHECK-NEXT:    ld.param.u64 %rd3, [test_sext_v4i1_to_v4i8_param_2];
; CHECK-NEXT:    ld.param.u64 %rd2, [test_sext_v4i1_to_v4i8_param_1];
; CHECK-NEXT:    ld.param.u64 %rd1, [test_sext_v4i1_to_v4i8_param_0];
; CHECK-NEXT:    ld.u32 %r1, [%rd1];
; CHECK-NEXT:    ld.u32 %r2, [%rd2];
; CHECK-NEXT:    bfe.u32 %r3, %r2, 0, 8;
; CHECK-NEXT:    bfe.u32 %r4, %r1, 0, 8;
; CHECK-NEXT:    setp.hi.u32 %p1, %r4, %r3;
; CHECK-NEXT:    bfe.u32 %r5, %r2, 8, 8;
; CHECK-NEXT:    bfe.u32 %r6, %r1, 8, 8;
; CHECK-NEXT:    setp.hi.u32 %p2, %r6, %r5;
; CHECK-NEXT:    bfe.u32 %r7, %r2, 16, 8;
; CHECK-NEXT:    bfe.u32 %r8, %r1, 16, 8;
; CHECK-NEXT:    setp.hi.u32 %p3, %r8, %r7;
; CHECK-NEXT:    bfe.u32 %r9, %r2, 24, 8;
; CHECK-NEXT:    bfe.u32 %r10, %r1, 24, 8;
; CHECK-NEXT:    setp.hi.u32 %p4, %r10, %r9;
; CHECK-NEXT:    selp.s32 %r11, -1, 0, %p4;
; CHECK-NEXT:    selp.s32 %r12, -1, 0, %p3;
; CHECK-NEXT:    prmt.b32 %r13, %r12, %r11, 0x3340U;
; CHECK-NEXT:    selp.s32 %r14, -1, 0, %p2;
; CHECK-NEXT:    selp.s32 %r15, -1, 0, %p1;
; CHECK-NEXT:    prmt.b32 %r16, %r15, %r14, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r17, %r16, %r13, 0x5410U;
; CHECK-NEXT:    st.u32 [%rd3], %r17;
; CHECK-NEXT:    ret;
entry:
  %t1 = load <4 x i8>, ptr %a, align 4
  %t2 = load <4 x i8>, ptr %b, align 4
  %t5 = icmp ugt <4 x i8> %t1, %t2
  %t6 = sext <4 x i1> %t5 to <4 x i8>
  store <4 x i8> %t6, ptr %c, align 4
  ret void
}

;; Attribute group referenced as #0 by the functions above that carry it.
attributes #0 = { nounwind }