; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; ## Support i16x2 instructions
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 \
; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes COMMON,I16x2 %s
; RUN: %if ptxas %{ \
; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 \
; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN:   | %ptxas-verify -arch=sm_90 \
; RUN: %}
; ## No support for i16x2 instructions
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes COMMON,NO-I16x2 %s
; RUN: %if ptxas %{ \
; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN:   | %ptxas-verify -arch=sm_53 \
; RUN: %}

; Overview: PTX codegen tests for <2 x i16> values. The file is compiled twice
; (see the RUN lines above): once for sm_90 with +ptx80 and once for sm_53.
; Functions whose expected output is the same in both runs use the shared
; COMMON check prefix; functions whose output differs have one set of
; assertions per configuration.

target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"

; Returning a constant vector: both elements are packed into one 32-bit
; immediate (131073 == 0x00020001).
define <2 x i16> @test_ret_const() #0 {
; COMMON-LABEL: test_ret_const(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<2>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    mov.b32 %r1, 131073;
; COMMON-NEXT:    st.param.b32 [func_retval0], %r1;
; COMMON-NEXT:    ret;
  ret <2 x i16> <i16 1, i16 2>
}

; Element extraction with constant indices (0 and 1) and a variable index.
define i16 @test_extract_0(<2 x i16> %a) #0 {
; COMMON-LABEL: test_extract_0(
; COMMON:       {
; COMMON-NEXT:    .reg .b16 %rs<2>;
; COMMON-NEXT:    .reg .b32 %r<3>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r1, [test_extract_0_param_0];
; COMMON-NEXT:    { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
; COMMON-NEXT:    cvt.u32.u16 %r2, %rs1;
; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
; COMMON-NEXT:    ret;
  %e = extractelement <2 x i16> %a, i32 0
  ret i16 %e
}

define i16 @test_extract_1(<2 x i16> %a) #0 {
; COMMON-LABEL: test_extract_1(
; COMMON:       {
; COMMON-NEXT:    .reg .b16 %rs<2>;
; COMMON-NEXT:    .reg .b32 %r<3>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r1, [test_extract_1_param_0];
; COMMON-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; }
; COMMON-NEXT:    cvt.u32.u16 %r2, %rs1;
; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
; COMMON-NEXT:    ret;
  %e = extractelement <2 x i16> %a, i32 1
  ret i16 %e
}

define i16 @test_extract_i(<2 x i16> %a, i64 %idx) #0 {
; COMMON-LABEL: test_extract_i(
; COMMON:       {
; COMMON-NEXT:    .reg .pred %p<2>;
; COMMON-NEXT:    .reg .b16 %rs<4>;
; COMMON-NEXT:    .reg .b32 %r<3>;
; COMMON-NEXT:    .reg .b64 %rd<2>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u64 %rd1, [test_extract_i_param_1];
; COMMON-NEXT:    ld.param.u32 %r1, [test_extract_i_param_0];
; COMMON-NEXT:    setp.eq.s64 %p1, %rd1, 0;
; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; COMMON-NEXT:    selp.b16 %rs3, %rs1, %rs2, %p1;
; COMMON-NEXT:    cvt.u32.u16 %r2, %rs3;
; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
; COMMON-NEXT:    ret;
  %e = extractelement <2 x i16> %a, i64 %idx
  ret i16 %e
}

; Vector add: a single add.s16x2 in the sm_90/+ptx80 run, two scalar add.s16
; in the sm_53 run.
define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 {
; I16x2-LABEL: test_add(
; I16x2:       {
; I16x2-NEXT:    .reg .b32 %r<4>;
; I16x2-EMPTY:
; I16x2-NEXT:  // %bb.0:
; I16x2-NEXT:    ld.param.u32 %r2, [test_add_param_1];
; I16x2-NEXT:    ld.param.u32 %r1, [test_add_param_0];
; I16x2-NEXT:    add.s16x2 %r3, %r1, %r2;
; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
; I16x2-NEXT:    ret;
;
; NO-I16x2-LABEL: test_add(
; NO-I16x2:       {
; NO-I16x2-NEXT:    .reg .b16 %rs<7>;
; NO-I16x2-NEXT:    .reg .b32 %r<4>;
; NO-I16x2-EMPTY:
; NO-I16x2-NEXT:  // %bb.0:
; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_add_param_1];
; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_add_param_0];
; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
; NO-I16x2-NEXT:    add.s16 %rs5, %rs4, %rs2;
; NO-I16x2-NEXT:    add.s16 %rs6, %rs3, %rs1;
; NO-I16x2-NEXT:    mov.b32 %r3, {%rs6, %rs5};
; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
; NO-I16x2-NEXT:    ret;
  %r = add <2 x i16> %a, %b
  ret <2 x i16> %r
}

; Check that we can lower add with immediate arguments.
define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 {
; I16x2-LABEL: test_add_imm_0(
; I16x2:       {
; I16x2-NEXT:    .reg .b32 %r<4>;
; I16x2-EMPTY:
; I16x2-NEXT:  // %bb.0:
; I16x2-NEXT:    ld.param.u32 %r1, [test_add_imm_0_param_0];
; I16x2-NEXT:    mov.b32 %r2, 131073;
; I16x2-NEXT:    add.s16x2 %r3, %r1, %r2;
; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
; I16x2-NEXT:    ret;
;
; NO-I16x2-LABEL: test_add_imm_0(
; NO-I16x2:       {
; NO-I16x2-NEXT:    .reg .b16 %rs<5>;
; NO-I16x2-NEXT:    .reg .b32 %r<3>;
; NO-I16x2-EMPTY:
; NO-I16x2-NEXT:  // %bb.0:
; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_add_imm_0_param_0];
; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; NO-I16x2-NEXT:    add.s16 %rs3, %rs2, 2;
; NO-I16x2-NEXT:    add.s16 %rs4, %rs1, 1;
; NO-I16x2-NEXT:    mov.b32 %r2, {%rs4, %rs3};
; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r2;
; NO-I16x2-NEXT:    ret;
  %r = add <2 x i16> <i16 1, i16 2>, %a
  ret <2 x i16> %r
}

define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 {
; I16x2-LABEL: test_add_imm_1(
; I16x2:       {
; I16x2-NEXT:    .reg .b32 %r<4>;
; I16x2-EMPTY:
; I16x2-NEXT:  // %bb.0:
; I16x2-NEXT:    ld.param.u32 %r1, [test_add_imm_1_param_0];
; I16x2-NEXT:    mov.b32 %r2, 131073;
; I16x2-NEXT:    add.s16x2 %r3, %r1, %r2;
; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
; I16x2-NEXT:    ret;
;
; NO-I16x2-LABEL: test_add_imm_1(
; NO-I16x2:       {
; NO-I16x2-NEXT:    .reg .b16 %rs<5>;
; NO-I16x2-NEXT:    .reg .b32 %r<3>;
; NO-I16x2-EMPTY:
; NO-I16x2-NEXT:  // %bb.0:
; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_add_imm_1_param_0];
; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; NO-I16x2-NEXT:    add.s16 %rs3, %rs2, 2;
; NO-I16x2-NEXT:    add.s16 %rs4, %rs1, 1;
; NO-I16x2-NEXT:    mov.b32 %r2, {%rs4, %rs3};
; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r2;
; NO-I16x2-NEXT:    ret;
  %r = add <2 x i16> %a, <i16 1, i16 2>
  ret <2 x i16> %r
}

; Vector sub: the expected output is two scalar sub.s16 in both runs.
define <2 x i16> @test_sub(<2 x i16> %a, <2 x i16> %b) #0 {
; COMMON-LABEL: test_sub(
; COMMON:       {
; COMMON-NEXT:    .reg .b16 %rs<7>;
; COMMON-NEXT:    .reg .b32 %r<4>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r2, [test_sub_param_1];
; COMMON-NEXT:    ld.param.u32 %r1, [test_sub_param_0];
; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
; COMMON-NEXT:    sub.s16 %rs5, %rs4, %rs2;
; COMMON-NEXT:    sub.s16 %rs6, %rs3, %rs1;
; COMMON-NEXT:    mov.b32 %r3, {%rs6, %rs5};
; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
; COMMON-NEXT:    ret;
  %r = sub <2 x i16> %a, %b
  ret <2 x i16> %r
}

; Signed/unsigned min and max, written as icmp + select.
define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 {
; I16x2-LABEL: test_smax(
; I16x2:       {
; I16x2-NEXT:    .reg .b32 %r<4>;
; I16x2-EMPTY:
; I16x2-NEXT:  // %bb.0:
; I16x2-NEXT:    ld.param.u32 %r2, [test_smax_param_1];
; I16x2-NEXT:    ld.param.u32 %r1, [test_smax_param_0];
; I16x2-NEXT:    max.s16x2 %r3, %r1, %r2;
; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
; I16x2-NEXT:    ret;
;
; NO-I16x2-LABEL: test_smax(
; NO-I16x2:       {
; NO-I16x2-NEXT:    .reg .b16 %rs<7>;
; NO-I16x2-NEXT:    .reg .b32 %r<4>;
; NO-I16x2-EMPTY:
; NO-I16x2-NEXT:  // %bb.0:
; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_smax_param_1];
; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_smax_param_0];
; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
; NO-I16x2-NEXT:    max.s16 %rs5, %rs4, %rs2;
; NO-I16x2-NEXT:    max.s16 %rs6, %rs3, %rs1;
; NO-I16x2-NEXT:    mov.b32 %r3, {%rs6, %rs5};
; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
; NO-I16x2-NEXT:    ret;
  %cmp = icmp sgt <2 x i16> %a, %b
  %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
  ret <2 x i16> %r
}

define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 {
; I16x2-LABEL: test_umax(
; I16x2:       {
; I16x2-NEXT:    .reg .b32 %r<4>;
; I16x2-EMPTY:
; I16x2-NEXT:  // %bb.0:
; I16x2-NEXT:    ld.param.u32 %r2, [test_umax_param_1];
; I16x2-NEXT:    ld.param.u32 %r1, [test_umax_param_0];
; I16x2-NEXT:    max.u16x2 %r3, %r1, %r2;
; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
; I16x2-NEXT:    ret;
;
; NO-I16x2-LABEL: test_umax(
; NO-I16x2:       {
; NO-I16x2-NEXT:    .reg .b16 %rs<7>;
; NO-I16x2-NEXT:    .reg .b32 %r<4>;
; NO-I16x2-EMPTY:
; NO-I16x2-NEXT:  // %bb.0:
; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_umax_param_1];
; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_umax_param_0];
; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
; NO-I16x2-NEXT:    max.u16 %rs5, %rs4, %rs2;
; NO-I16x2-NEXT:    max.u16 %rs6, %rs3, %rs1;
; NO-I16x2-NEXT:    mov.b32 %r3, {%rs6, %rs5};
; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
; NO-I16x2-NEXT:    ret;
  %cmp = icmp ugt <2 x i16> %a, %b
  %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
  ret <2 x i16> %r
}

define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 {
; I16x2-LABEL: test_smin(
; I16x2:       {
; I16x2-NEXT:    .reg .b32 %r<4>;
; I16x2-EMPTY:
; I16x2-NEXT:  // %bb.0:
; I16x2-NEXT:    ld.param.u32 %r2, [test_smin_param_1];
; I16x2-NEXT:    ld.param.u32 %r1, [test_smin_param_0];
; I16x2-NEXT:    min.s16x2 %r3, %r1, %r2;
; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
; I16x2-NEXT:    ret;
;
; NO-I16x2-LABEL: test_smin(
; NO-I16x2:       {
; NO-I16x2-NEXT:    .reg .b16 %rs<7>;
; NO-I16x2-NEXT:    .reg .b32 %r<4>;
; NO-I16x2-EMPTY:
; NO-I16x2-NEXT:  // %bb.0:
; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_smin_param_1];
; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_smin_param_0];
; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
; NO-I16x2-NEXT:    min.s16 %rs5, %rs4, %rs2;
; NO-I16x2-NEXT:    min.s16 %rs6, %rs3, %rs1;
; NO-I16x2-NEXT:    mov.b32 %r3, {%rs6, %rs5};
; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
; NO-I16x2-NEXT:    ret;
  %cmp = icmp sle <2 x i16> %a, %b
  %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
  ret <2 x i16> %r
}

define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 {
; I16x2-LABEL: test_umin(
; I16x2:       {
; I16x2-NEXT:    .reg .b32 %r<4>;
; I16x2-EMPTY:
; I16x2-NEXT:  // %bb.0:
; I16x2-NEXT:    ld.param.u32 %r2, [test_umin_param_1];
; I16x2-NEXT:    ld.param.u32 %r1, [test_umin_param_0];
; I16x2-NEXT:    min.u16x2 %r3, %r1, %r2;
; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
; I16x2-NEXT:    ret;
;
; NO-I16x2-LABEL: test_umin(
; NO-I16x2:       {
; NO-I16x2-NEXT:    .reg .b16 %rs<7>;
; NO-I16x2-NEXT:    .reg .b32 %r<4>;
; NO-I16x2-EMPTY:
; NO-I16x2-NEXT:  // %bb.0:
; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_umin_param_1];
; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_umin_param_0];
; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
; NO-I16x2-NEXT:    min.u16 %rs5, %rs4, %rs2;
; NO-I16x2-NEXT:    min.u16 %rs6, %rs3, %rs1;
; NO-I16x2-NEXT:    mov.b32 %r3, {%rs6, %rs5};
; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
; NO-I16x2-NEXT:    ret;
  %cmp = icmp ule <2 x i16> %a, %b
  %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
  ret <2 x i16> %r
}

; Vector mul: the expected output is two scalar mul.lo.s16 in both runs.
define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 {
; COMMON-LABEL: test_mul(
; COMMON:       {
; COMMON-NEXT:    .reg .b16 %rs<7>;
; COMMON-NEXT:    .reg .b32 %r<4>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r2, [test_mul_param_1];
; COMMON-NEXT:    ld.param.u32 %r1, [test_mul_param_0];
; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
; COMMON-NEXT:    mul.lo.s16 %rs5, %rs4, %rs2;
; COMMON-NEXT:    mul.lo.s16 %rs6, %rs3, %rs1;
; COMMON-NEXT:    mov.b32 %r3, {%rs6, %rs5};
; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
; COMMON-NEXT:    ret;
  %r = mul <2 x i16> %a, %b
  ret <2 x i16> %r
}

;; Logical ops are available on all GPUs as regular 32-bit logical ops
define <2 x i16> @test_or(<2 x i16> %a, <2 x i16> %b) #0 {
; COMMON-LABEL: test_or(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<4>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r2, [test_or_param_1];
; COMMON-NEXT:    ld.param.u32 %r1, [test_or_param_0];
; COMMON-NEXT:    or.b32 %r3, %r1, %r2;
; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
; COMMON-NEXT:    ret;
  %r = or <2 x i16> %a, %b
  ret <2 x i16> %r
}

; Ops that operate on computed arguments go through a different lowering path
; compared to the ones that operate on loaded data, so we test them separately.
define <2 x i16> @test_or_computed(i16 %a) {
; COMMON-LABEL: test_or_computed(
; COMMON:       {
; COMMON-NEXT:    .reg .b16 %rs<4>;
; COMMON-NEXT:    .reg .b32 %r<4>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u16 %rs1, [test_or_computed_param_0];
; COMMON-NEXT:    mov.b16 %rs2, 0;
; COMMON-NEXT:    mov.b32 %r1, {%rs1, %rs2};
; COMMON-NEXT:    mov.b16 %rs3, 5;
; COMMON-NEXT:    mov.b32 %r2, {%rs1, %rs3};
; COMMON-NEXT:    or.b32 %r3, %r2, %r1;
; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
; COMMON-NEXT:    ret;
  %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
  %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
  %r = or <2 x i16> %ins.1, %ins.0
  ret <2 x i16> %r
}

; Check that we can lower or with immediate arguments.
define <2 x i16> @test_or_imm_0(<2 x i16> %a) #0 {
; COMMON-LABEL: test_or_imm_0(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<3>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r1, [test_or_imm_0_param_0];
; COMMON-NEXT:    or.b32 %r2, %r1, 131073;
; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
; COMMON-NEXT:    ret;
  %r = or <2 x i16> <i16 1, i16 2>, %a
  ret <2 x i16> %r
}

define <2 x i16> @test_or_imm_1(<2 x i16> %a) #0 {
; COMMON-LABEL: test_or_imm_1(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<3>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r1, [test_or_imm_1_param_0];
; COMMON-NEXT:    or.b32 %r2, %r1, 131073;
; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
; COMMON-NEXT:    ret;
  %r = or <2 x i16> %a, <i16 1, i16 2>
  ret <2 x i16> %r
}

define <2 x i16> @test_xor(<2 x i16> %a, <2 x i16> %b) #0 {
; COMMON-LABEL: test_xor(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<4>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r2, [test_xor_param_1];
; COMMON-NEXT:    ld.param.u32 %r1, [test_xor_param_0];
; COMMON-NEXT:    xor.b32 %r3, %r1, %r2;
; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
; COMMON-NEXT:    ret;
  %r = xor <2 x i16> %a, %b
  ret <2 x i16> %r
}

define <2 x i16> @test_xor_computed(i16 %a) {
; COMMON-LABEL: test_xor_computed(
; COMMON:       {
; COMMON-NEXT:    .reg .b16 %rs<4>;
; COMMON-NEXT:    .reg .b32 %r<4>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u16 %rs1, [test_xor_computed_param_0];
; COMMON-NEXT:    mov.b16 %rs2, 0;
; COMMON-NEXT:    mov.b32 %r1, {%rs1, %rs2};
; COMMON-NEXT:    mov.b16 %rs3, 5;
; COMMON-NEXT:    mov.b32 %r2, {%rs1, %rs3};
; COMMON-NEXT:    xor.b32 %r3, %r2, %r1;
; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
; COMMON-NEXT:    ret;
  %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
  %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
  %r = xor <2 x i16> %ins.1, %ins.0
  ret <2 x i16> %r
}

; Check that we can lower xor with immediate arguments.
define <2 x i16> @test_xor_imm_0(<2 x i16> %a) #0 {
; COMMON-LABEL: test_xor_imm_0(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<3>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r1, [test_xor_imm_0_param_0];
; COMMON-NEXT:    xor.b32 %r2, %r1, 131073;
; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
; COMMON-NEXT:    ret;
  %r = xor <2 x i16> <i16 1, i16 2>, %a
  ret <2 x i16> %r
}

define <2 x i16> @test_xor_imm_1(<2 x i16> %a) #0 {
; COMMON-LABEL: test_xor_imm_1(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<3>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r1, [test_xor_imm_1_param_0];
; COMMON-NEXT:    xor.b32 %r2, %r1, 131073;
; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
; COMMON-NEXT:    ret;
  %r = xor <2 x i16> %a, <i16 1, i16 2>
  ret <2 x i16> %r
}

define <2 x i16> @test_and(<2 x i16> %a, <2 x i16> %b) #0 {
; COMMON-LABEL: test_and(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<4>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r2, [test_and_param_1];
; COMMON-NEXT:    ld.param.u32 %r1, [test_and_param_0];
; COMMON-NEXT:    and.b32 %r3, %r1, %r2;
; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
; COMMON-NEXT:    ret;
  %r = and <2 x i16> %a, %b
  ret <2 x i16> %r
}

; Ops that operate on computed arguments go through a different lowering path
; compared to the ones that operate on loaded data, so we test them separately.
define <2 x i16> @test_and_computed(i16 %a) {
; COMMON-LABEL: test_and_computed(
; COMMON:       {
; COMMON-NEXT:    .reg .b16 %rs<4>;
; COMMON-NEXT:    .reg .b32 %r<4>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u16 %rs1, [test_and_computed_param_0];
; COMMON-NEXT:    mov.b16 %rs2, 0;
; COMMON-NEXT:    mov.b32 %r1, {%rs1, %rs2};
; COMMON-NEXT:    mov.b16 %rs3, 5;
; COMMON-NEXT:    mov.b32 %r2, {%rs1, %rs3};
; COMMON-NEXT:    and.b32 %r3, %r2, %r1;
; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
; COMMON-NEXT:    ret;
  %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
  %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
  %r = and <2 x i16> %ins.1, %ins.0
  ret <2 x i16> %r
}

; Check that we can lower and with immediate arguments.
define <2 x i16> @test_and_imm_0(<2 x i16> %a) #0 {
; COMMON-LABEL: test_and_imm_0(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<3>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r1, [test_and_imm_0_param_0];
; COMMON-NEXT:    and.b32 %r2, %r1, 131073;
; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
; COMMON-NEXT:    ret;
  %r = and <2 x i16> <i16 1, i16 2>, %a
  ret <2 x i16> %r
}

define <2 x i16> @test_and_imm_1(<2 x i16> %a) #0 {
; COMMON-LABEL: test_and_imm_1(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<3>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r1, [test_and_imm_1_param_0];
; COMMON-NEXT:    and.b32 %r2, %r1, 131073;
; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
; COMMON-NEXT:    ret;
  %r = and <2 x i16> %a, <i16 1, i16 2>
  ret <2 x i16> %r
}

; Loads and stores of i16 vectors with 2, 3, 4 and 8 elements.
define void @test_ldst_v2i16(ptr %a, ptr %b) {
; COMMON-LABEL: test_ldst_v2i16(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<2>;
; COMMON-NEXT:    .reg .b64 %rd<3>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u64 %rd2, [test_ldst_v2i16_param_1];
; COMMON-NEXT:    ld.param.u64 %rd1, [test_ldst_v2i16_param_0];
; COMMON-NEXT:    ld.u32 %r1, [%rd1];
; COMMON-NEXT:    st.u32 [%rd2], %r1;
; COMMON-NEXT:    ret;
  %t1 = load <2 x i16>, ptr %a
  store <2 x i16> %t1, ptr %b, align 16
  ret void
}

; -- v3 is inconvenient to capture as it's lowered as ld.b64 plus a fair
;    number of bit-shifting instructions that may change at llvm's whim.
;    The original intent was to verify only that the correct number of
;    writes is issued at the correct offsets, not the values written.
;    NOTE(review): the autogenerated assertions below pin the full
;    instruction sequence, so this test has to be regenerated whenever the
;    lowering changes.
define void @test_ldst_v3i16(ptr %a, ptr %b) {
; COMMON-LABEL: test_ldst_v3i16(
; COMMON:       {
; COMMON-NEXT:    .reg .b64 %rd<5>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u64 %rd2, [test_ldst_v3i16_param_1];
; COMMON-NEXT:    ld.param.u64 %rd1, [test_ldst_v3i16_param_0];
; COMMON-NEXT:    ld.u64 %rd3, [%rd1];
; COMMON-NEXT:    shr.u64 %rd4, %rd3, 32;
; COMMON-NEXT:    st.u32 [%rd2], %rd3;
; COMMON-NEXT:    st.u16 [%rd2+4], %rd4;
; COMMON-NEXT:    ret;
  %t1 = load <3 x i16>, ptr %a
  store <3 x i16> %t1, ptr %b, align 16
  ret void
}

define void @test_ldst_v4i16(ptr %a, ptr %b) {
; COMMON-LABEL: test_ldst_v4i16(
; COMMON:       {
; COMMON-NEXT:    .reg .b16 %rs<5>;
; COMMON-NEXT:    .reg .b64 %rd<3>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u64 %rd2, [test_ldst_v4i16_param_1];
; COMMON-NEXT:    ld.param.u64 %rd1, [test_ldst_v4i16_param_0];
; COMMON-NEXT:    ld.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; COMMON-NEXT:    st.v4.u16 [%rd2], {%rs1, %rs2, %rs3, %rs4};
; COMMON-NEXT:    ret;
  %t1 = load <4 x i16>, ptr %a
  store <4 x i16> %t1, ptr %b, align 16
  ret void
}

define void @test_ldst_v8i16(ptr %a, ptr %b) {
; COMMON-LABEL: test_ldst_v8i16(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<5>;
; COMMON-NEXT:    .reg .b64 %rd<3>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u64 %rd2, [test_ldst_v8i16_param_1];
; COMMON-NEXT:    ld.param.u64 %rd1, [test_ldst_v8i16_param_0];
; COMMON-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; COMMON-NEXT:    st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; COMMON-NEXT:    ret;
  %t1 = load <8 x i16>, ptr %a
  store <8 x i16> %t1, ptr %b, align 16
  ret void
}

; Passing <2 x i16> arguments to, and returning them from, a call.
declare <2 x i16> @test_callee(<2 x i16> %a, <2 x i16> %b) #0

define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 {
; COMMON-LABEL: test_call(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<5>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r2, [test_call_param_1];
; COMMON-NEXT:    ld.param.u32 %r1, [test_call_param_0];
; COMMON-NEXT:    { // callseq 0, 0
; COMMON-NEXT:    .param .align 4 .b8 param0[4];
; COMMON-NEXT:    st.param.b32 [param0], %r1;
; COMMON-NEXT:    .param .align 4 .b8 param1[4];
; COMMON-NEXT:    st.param.b32 [param1], %r2;
; COMMON-NEXT:    .param .align 4 .b8 retval0[4];
; COMMON-NEXT:    call.uni (retval0),
; COMMON-NEXT:    test_callee,
; COMMON-NEXT:    (
; COMMON-NEXT:    param0,
; COMMON-NEXT:    param1
; COMMON-NEXT:    );
; COMMON-NEXT:    ld.param.b32 %r3, [retval0];
; COMMON-NEXT:    } // callseq 0
; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
; COMMON-NEXT:    ret;
  %r = call <2 x i16> @test_callee(<2 x i16> %a, <2 x i16> %b)
  ret <2 x i16> %r
}

define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 {
; COMMON-LABEL: test_call_flipped(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<5>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r2, [test_call_flipped_param_1];
; COMMON-NEXT:    ld.param.u32 %r1, [test_call_flipped_param_0];
; COMMON-NEXT:    { // callseq 1, 0
; COMMON-NEXT:    .param .align 4 .b8 param0[4];
; COMMON-NEXT:    st.param.b32 [param0], %r2;
; COMMON-NEXT:    .param .align 4 .b8 param1[4];
; COMMON-NEXT:    st.param.b32 [param1], %r1;
; COMMON-NEXT:    .param .align 4 .b8 retval0[4];
; COMMON-NEXT:    call.uni (retval0),
; COMMON-NEXT:    test_callee,
; COMMON-NEXT:    (
; COMMON-NEXT:    param0,
; COMMON-NEXT:    param1
; COMMON-NEXT:    );
; COMMON-NEXT:    ld.param.b32 %r3, [retval0];
; COMMON-NEXT:    } // callseq 1
; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
; COMMON-NEXT:    ret;
  %r = call <2 x i16> @test_callee(<2 x i16> %b, <2 x i16> %a)
  ret <2 x i16> %r
}

define <2 x i16> @test_tailcall_flipped(<2 x i16> %a, <2 x i16> %b) #0 {
; COMMON-LABEL: test_tailcall_flipped(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<5>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r2, [test_tailcall_flipped_param_1];
; COMMON-NEXT:    ld.param.u32 %r1, [test_tailcall_flipped_param_0];
; COMMON-NEXT:    { // callseq 2, 0
; COMMON-NEXT:    .param .align 4 .b8 param0[4];
; COMMON-NEXT:    st.param.b32 [param0], %r2;
; COMMON-NEXT:    .param .align 4 .b8 param1[4];
; COMMON-NEXT:    st.param.b32 [param1], %r1;
; COMMON-NEXT:    .param .align 4 .b8 retval0[4];
; COMMON-NEXT:    call.uni (retval0),
; COMMON-NEXT:    test_callee,
; COMMON-NEXT:    (
; COMMON-NEXT:    param0,
; COMMON-NEXT:    param1
; COMMON-NEXT:    );
; COMMON-NEXT:    ld.param.b32 %r3, [retval0];
; COMMON-NEXT:    } // callseq 2
; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
; COMMON-NEXT:    ret;
  %r = tail call <2 x i16> @test_callee(<2 x i16> %b, <2 x i16> %a)
  ret <2 x i16> %r
}

; Selects: scalar condition, vector condition, and mixed i16/i32 element
; widths between the compared and the selected vectors.
define <2 x i16> @test_select(<2 x i16> %a, <2 x i16> %b, i1 zeroext %c) #0 {
; COMMON-LABEL: test_select(
; COMMON:       {
; COMMON-NEXT:    .reg .pred %p<2>;
; COMMON-NEXT:    .reg .b16 %rs<3>;
; COMMON-NEXT:    .reg .b32 %r<4>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u8 %rs1, [test_select_param_2];
; COMMON-NEXT:    and.b16 %rs2, %rs1, 1;
; COMMON-NEXT:    setp.eq.b16 %p1, %rs2, 1;
; COMMON-NEXT:    ld.param.u32 %r2, [test_select_param_1];
; COMMON-NEXT:    ld.param.u32 %r1, [test_select_param_0];
; COMMON-NEXT:    selp.b32 %r3, %r1, %r2, %p1;
; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
; COMMON-NEXT:    ret;
  %r = select i1 %c, <2 x i16> %a, <2 x i16> %b
  ret <2 x i16> %r
}

define <2 x i16> @test_select_cc(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) #0 {
; COMMON-LABEL: test_select_cc(
; COMMON:       {
; COMMON-NEXT:    .reg .pred %p<3>;
; COMMON-NEXT:    .reg .b16 %rs<11>;
; COMMON-NEXT:    .reg .b32 %r<6>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r4, [test_select_cc_param_3];
; COMMON-NEXT:    ld.param.u32 %r3, [test_select_cc_param_2];
; COMMON-NEXT:    ld.param.u32 %r2, [test_select_cc_param_1];
; COMMON-NEXT:    ld.param.u32 %r1, [test_select_cc_param_0];
; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
; COMMON-NEXT:    setp.ne.s16 %p1, %rs3, %rs1;
; COMMON-NEXT:    setp.ne.s16 %p2, %rs4, %rs2;
; COMMON-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
; COMMON-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
; COMMON-NEXT:    selp.b16 %rs9, %rs8, %rs6, %p2;
; COMMON-NEXT:    selp.b16 %rs10, %rs7, %rs5, %p1;
; COMMON-NEXT:    mov.b32 %r5, {%rs10, %rs9};
; COMMON-NEXT:    st.param.b32 [func_retval0], %r5;
; COMMON-NEXT:    ret;
  %cc = icmp ne <2 x i16> %c, %d
  %r = select <2 x i1> %cc, <2 x i16> %a, <2 x i16> %b
  ret <2 x i16> %r
}

define <2 x i32> @test_select_cc_i32_i16(<2 x i32> %a, <2 x i32> %b,
; COMMON-LABEL: test_select_cc_i32_i16(
; COMMON:       {
; COMMON-NEXT:    .reg .pred %p<3>;
; COMMON-NEXT:    .reg .b16 %rs<5>;
; COMMON-NEXT:    .reg .b32 %r<9>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.v2.u32 {%r3, %r4}, [test_select_cc_i32_i16_param_1];
; COMMON-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_select_cc_i32_i16_param_0];
; COMMON-NEXT:    ld.param.u32 %r6, [test_select_cc_i32_i16_param_3];
; COMMON-NEXT:    ld.param.u32 %r5, [test_select_cc_i32_i16_param_2];
; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r6;
; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r5;
; COMMON-NEXT:    setp.ne.s16 %p1, %rs3, %rs1;
; COMMON-NEXT:    setp.ne.s16 %p2, %rs4, %rs2;
; COMMON-NEXT:    selp.b32 %r7, %r2, %r4, %p2;
; COMMON-NEXT:    selp.b32 %r8, %r1, %r3, %p1;
; COMMON-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r7};
; COMMON-NEXT:    ret;
                                           <2 x i16> %c, <2 x i16> %d) #0 {
  %cc = icmp ne <2 x i16> %c, %d
  %r = select <2 x i1> %cc, <2 x i32> %a, <2 x i32> %b
  ret <2 x i32> %r
}

define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b,
; COMMON-LABEL: test_select_cc_i16_i32(
; COMMON:       {
; COMMON-NEXT:    .reg .pred %p<3>;
; COMMON-NEXT:    .reg .b16 %rs<7>;
; COMMON-NEXT:    .reg .b32 %r<8>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.v2.u32 {%r5, %r6}, [test_select_cc_i16_i32_param_3];
; COMMON-NEXT:    ld.param.v2.u32 {%r3, %r4}, [test_select_cc_i16_i32_param_2];
; COMMON-NEXT:    ld.param.u32 %r2, [test_select_cc_i16_i32_param_1];
; COMMON-NEXT:    ld.param.u32 %r1, [test_select_cc_i16_i32_param_0];
; COMMON-NEXT:    setp.ne.s32 %p1, %r3, %r5;
; COMMON-NEXT:    setp.ne.s32 %p2, %r4, %r6;
; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
; COMMON-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p2;
; COMMON-NEXT:    selp.b16 %rs6, %rs3, %rs1, %p1;
; COMMON-NEXT:    mov.b32 %r7, {%rs6, %rs5};
; COMMON-NEXT:    st.param.b32 [func_retval0], %r7;
; COMMON-NEXT:    ret;
                                           <2 x i32> %c, <2 x i32> %d) #0 {
  %cc = icmp ne <2 x i32> %c, %d
  %r = select <2 x i1> %cc, <2 x i16> %a, <2 x i16> %b
  ret <2 x i16> %r
}


; Truncation and zero-extension between <2 x i16> and wider element types.
define <2 x i16> @test_trunc_2xi32(<2 x i32> %a) #0 {
; COMMON-LABEL: test_trunc_2xi32(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<4>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_param_0];
; COMMON-NEXT:    prmt.b32 %r3, %r1, %r2, 0x5410U;
; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
; COMMON-NEXT:    ret;
  %r = trunc <2 x i32> %a to <2 x i16>
  ret <2 x i16> %r
}

define <2 x i16> @test_trunc_2xi32_muliple_use0(<2 x i32> %a, ptr %p) #0 {
; I16x2-LABEL: test_trunc_2xi32_muliple_use0(
; I16x2:       {
; I16x2-NEXT:    .reg .b32 %r<6>;
; I16x2-NEXT:    .reg .b64 %rd<2>;
; I16x2-EMPTY:
; I16x2-NEXT:  // %bb.0:
; I16x2-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_muliple_use0_param_0];
; I16x2-NEXT:    ld.param.u64 %rd1, [test_trunc_2xi32_muliple_use0_param_1];
; I16x2-NEXT:    prmt.b32 %r3, %r1, %r2, 0x5410U;
; I16x2-NEXT:    mov.b32 %r4, 65537;
; I16x2-NEXT:    add.s16x2 %r5, %r3, %r4;
; I16x2-NEXT:    st.u32 [%rd1], %r5;
; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
; I16x2-NEXT:    ret;
;
; NO-I16x2-LABEL: test_trunc_2xi32_muliple_use0(
; NO-I16x2:       {
; NO-I16x2-NEXT:    .reg .b16 %rs<5>;
; NO-I16x2-NEXT:    .reg .b32 %r<5>;
; NO-I16x2-NEXT:    .reg .b64 %rd<2>;
; NO-I16x2-EMPTY:
; NO-I16x2-NEXT:  // %bb.0:
; NO-I16x2-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_muliple_use0_param_0];
; NO-I16x2-NEXT:    ld.param.u64 %rd1, [test_trunc_2xi32_muliple_use0_param_1];
; NO-I16x2-NEXT:    cvt.u16.u32 %rs1, %r2;
; NO-I16x2-NEXT:    cvt.u16.u32 %rs2, %r1;
; NO-I16x2-NEXT:    mov.b32 %r3, {%rs2, %rs1};
; NO-I16x2-NEXT:    add.s16 %rs3, %rs1, 1;
; NO-I16x2-NEXT:    add.s16 %rs4, %rs2, 1;
; NO-I16x2-NEXT:    mov.b32 %r4, {%rs4, %rs3};
; NO-I16x2-NEXT:    st.u32 [%rd1], %r4;
; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
; NO-I16x2-NEXT:    ret;
  %r = trunc <2 x i32> %a to <2 x i16>
  ; Reuse the truncate - optimizing to PRMT when we don't have i16x2 vectors
  ; would increase register pressure
  %s = add <2 x i16> %r, splat (i16 1)
  store <2 x i16> %s, ptr %p
  ret <2 x i16> %r
}

define <2 x i16> @test_trunc_2xi32_muliple_use1(<2 x i32> %a, ptr %p) #0 {
; COMMON-LABEL: test_trunc_2xi32_muliple_use1(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<6>;
; COMMON-NEXT:    .reg .b64 %rd<2>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_muliple_use1_param_0];
; COMMON-NEXT:    ld.param.u64 %rd1, [test_trunc_2xi32_muliple_use1_param_1];
; COMMON-NEXT:    prmt.b32 %r3, %r1, %r2, 0x5410U;
; COMMON-NEXT:    add.s32 %r4, %r2, 1;
; COMMON-NEXT:    add.s32 %r5, %r1, 1;
; COMMON-NEXT:    st.v2.u32 [%rd1], {%r5, %r4};
; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
; COMMON-NEXT:    ret;
  %r = trunc <2 x i32> %a to <2 x i16>
  ; Reuse the original value - optimizing to PRMT does not increase register
  ; pressure
  %s = add <2 x i32> %a, splat (i32 1)
  store <2 x i32> %s, ptr %p
  ret <2 x i16> %r
}

define <2 x i16> @test_trunc_2xi64(<2 x i64> %a) #0 {
; COMMON-LABEL: test_trunc_2xi64(
; COMMON:       {
; COMMON-NEXT:    .reg .b16 %rs<3>;
; COMMON-NEXT:    .reg .b32 %r<2>;
; COMMON-NEXT:    .reg .b64 %rd<3>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0];
; COMMON-NEXT:    cvt.u16.u64 %rs1, %rd2;
; COMMON-NEXT:    cvt.u16.u64 %rs2, %rd1;
; COMMON-NEXT:    mov.b32 %r1, {%rs2, %rs1};
; COMMON-NEXT:    st.param.b32 [func_retval0], %r1;
; COMMON-NEXT:    ret;
  %r = trunc <2 x i64> %a to <2 x i16>
  ret <2 x i16> %r
}

define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 {
; COMMON-LABEL: test_zext_2xi32(
; COMMON:       {
; COMMON-NEXT:    .reg .b16 %rs<3>;
; COMMON-NEXT:    .reg .b32 %r<4>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r1, [test_zext_2xi32_param_0];
; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; COMMON-NEXT:    cvt.u32.u16 %r2, %rs1;
; COMMON-NEXT:    cvt.u32.u16 %r3, %rs2;
; COMMON-NEXT:    st.param.v2.b32 [func_retval0], {%r2, %r3};
; COMMON-NEXT:    ret;
  %r = zext <2 x i16> %a to <2 x i32>
  ret <2 x i32> %r
}

define <2 x i64> @test_zext_2xi64(<2 x i16> %a) #0 {
; COMMON-LABEL: test_zext_2xi64(
; COMMON:       {
; COMMON-NEXT:    .reg .b16 %rs<3>;
; COMMON-NEXT:    .reg .b32 %r<2>;
; COMMON-NEXT:    .reg .b64 %rd<3>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r1, [test_zext_2xi64_param_0];
; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; COMMON-NEXT:    cvt.u64.u16 %rd1, %rs2;
; COMMON-NEXT:    cvt.u64.u16 %rd2, %rs1;
; COMMON-NEXT:    st.param.v2.b64 [func_retval0], {%rd2, %rd1};
; COMMON-NEXT:    ret;
  %r = zext <2 x i16> %a to <2 x i64>
  ret <2 x i64> %r
}

; Bitcasts between <2 x i16> and other 32-bit types.
define <2 x i16> @test_bitcast_i32_to_2xi16(i32 %a) #0 {
; COMMON-LABEL: test_bitcast_i32_to_2xi16(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<2>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r1, [test_bitcast_i32_to_2xi16_param_0];
; COMMON-NEXT:    st.param.b32 [func_retval0], %r1;
; COMMON-NEXT:    ret;
  %r = bitcast i32 %a to <2 x i16>
  ret <2 x i16> %r
}

define i32 @test_bitcast_2xi16_to_i32(<2 x i16> %a) #0 {
; COMMON-LABEL: test_bitcast_2xi16_to_i32(
; COMMON:       {
; COMMON-NEXT:    .reg .b32 %r<2>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r1, [test_bitcast_2xi16_to_i32_param_0];
; COMMON-NEXT:    st.param.b32 [func_retval0], %r1;
; COMMON-NEXT:    ret;
  %r = bitcast <2 x i16> %a to i32
  ret i32 %r
}

define <2 x half> @test_bitcast_2xi16_to_2xhalf(i16 %a) #0 {
; COMMON-LABEL: test_bitcast_2xi16_to_2xhalf(
; COMMON:       {
; COMMON-NEXT:    .reg .b16 %rs<3>;
; COMMON-NEXT:    .reg .b32 %r<2>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u16 %rs1, [test_bitcast_2xi16_to_2xhalf_param_0];
; COMMON-NEXT:    mov.b16 %rs2, 5;
; COMMON-NEXT:    mov.b32 %r1, {%rs1, %rs2};
; COMMON-NEXT:    st.param.b32 [func_retval0], %r1;
; COMMON-NEXT:    ret;
  %ins.0 = insertelement <2 x i16> undef, i16 %a, i32 0
  %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
  %r = bitcast <2 x i16> %ins.1 to <2 x half>
  ret <2 x half> %r
}


; Element shuffling and insertion.
define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 {
; COMMON-LABEL: test_shufflevector(
; COMMON:       {
; COMMON-NEXT:    .reg .b16 %rs<3>;
; COMMON-NEXT:    .reg .b32 %r<3>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u32 %r1, [test_shufflevector_param_0];
; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; COMMON-NEXT:    mov.b32 %r2, {%rs2, %rs1};
; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
; COMMON-NEXT:    ret;
  %s = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i16> %s
}

define <2 x i16> @test_insertelement(<2 x i16> %a, i16 %x) #0 {
; COMMON-LABEL: test_insertelement(
; COMMON:       {
; COMMON-NEXT:    .reg .b16 %rs<3>;
; COMMON-NEXT:    .reg .b32 %r<3>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.u16 %rs1, [test_insertelement_param_1];
; COMMON-NEXT:    ld.param.u32 %r1, [test_insertelement_param_0];
; COMMON-NEXT:    { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; }
; COMMON-NEXT:    mov.b32 %r2, {%rs2, %rs1};
; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
; COMMON-NEXT:    ret;
  %i = insertelement <2 x i16> %a, i16 %x, i64 1
  ret <2 x i16> %i
}

; Conversion from <2 x half> to <2 x i16>.
define <2 x i16> @test_fptosi_2xhalf_to_2xi16(<2 x half> %a) #0 {
; COMMON-LABEL: test_fptosi_2xhalf_to_2xi16(
; COMMON:       {
; COMMON-NEXT:    .reg .b16 %rs<5>;
; COMMON-NEXT:    .reg .b32 %r<3>;
; COMMON-EMPTY:
; COMMON-NEXT:  // %bb.0:
; COMMON-NEXT:    ld.param.b32 %r1, [test_fptosi_2xhalf_to_2xi16_param_0];
; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; COMMON-NEXT:    cvt.rzi.s16.f16 %rs3, %rs2;
; COMMON-NEXT:    cvt.rzi.s16.f16 %rs4, %rs1;
; COMMON-NEXT:    mov.b32 %r2, {%rs4, %rs3};
; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
; COMMON-NEXT:    ret;
  %r = fptosi <2 x half> %a to <2 x i16>
  ret <2 x i16> %r
}

define <2 x i16> @test_fptoui_2xhalf_to_2xi16(<2 x half> %a) #0 {
; COMMON-LABEL:
test_fptoui_2xhalf_to_2xi16( 1037; COMMON: { 1038; COMMON-NEXT: .reg .b16 %rs<5>; 1039; COMMON-NEXT: .reg .b32 %r<3>; 1040; COMMON-EMPTY: 1041; COMMON-NEXT: // %bb.0: 1042; COMMON-NEXT: ld.param.b32 %r1, [test_fptoui_2xhalf_to_2xi16_param_0]; 1043; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; 1044; COMMON-NEXT: cvt.rzi.u16.f16 %rs3, %rs2; 1045; COMMON-NEXT: cvt.rzi.u16.f16 %rs4, %rs1; 1046; COMMON-NEXT: mov.b32 %r2, {%rs4, %rs3}; 1047; COMMON-NEXT: st.param.b32 [func_retval0], %r2; 1048; COMMON-NEXT: ret; 1049 %r = fptoui <2 x half> %a to <2 x i16> 1050 ret <2 x i16> %r 1051} 1052 1053attributes #0 = { nounwind } 1054