; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX908 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s

; Code generation checks for the llvm.amdgcn.udot2 intrinsic when selected
; through GlobalISel. Each function below feeds the intrinsic a different
; operand shape (plain arguments, SGPR inputs, splat constants, negated or
; shuffled values) and pins the resulting instruction sequence.

; Baseline: all three operands are ordinary arguments, clamp flag is false.
define i32 @v_udot2(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %dot = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
  ret i32 %dot
}

; Same operands as the baseline, but with the clamp flag set to true; the
; expected instruction carries the "clamp" modifier.
define i32 @v_udot2_clamp(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2_clamp:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 clamp
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_clamp:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 clamp
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_clamp:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 clamp
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %dot = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 true)
  ret i32 %dot
}

; All three operands arrive as inreg arguments of an amdgpu_ps function. The
; expected output copies s1 and s2 into VGPRs for the gfx906/gfx908 runs and
; only s2 for the gfx10 runs.
define amdgpu_ps float @v_udot2_sgpr_sgpr_sgpr(<2 x i16> inreg %a, <2 x i16> inreg %b, i32 inreg %c) {
; GFX906-LABEL: v_udot2_sgpr_sgpr_sgpr:
; GFX906: ; %bb.0:
; GFX906-NEXT: v_mov_b32_e32 v0, s1
; GFX906-NEXT: v_mov_b32_e32 v1, s2
; GFX906-NEXT: v_dot2_u32_u16 v0, s0, v0, v1
; GFX906-NEXT: ; return to shader part epilog
;
; GFX908-LABEL: v_udot2_sgpr_sgpr_sgpr:
; GFX908: ; %bb.0:
; GFX908-NEXT: v_mov_b32_e32 v0, s1
; GFX908-NEXT: v_mov_b32_e32 v1, s2
; GFX908-NEXT: v_dot2_u32_u16 v0, s0, v0, v1
; GFX908-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_udot2_sgpr_sgpr_sgpr:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_dot2_u32_u16 v0, s0, s1, v0
; GFX10-NEXT: ; return to shader part epilog
  %dot = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
  %dot.as.float = bitcast i32 %dot to float
  ret float %dot.as.float
}

; First operand is the constant splat <4, 4>; the expected instruction uses the
; immediate 4 with op_sel_hi cleared for that source.
define i32 @v_udot2_inline_literal_a(<2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2_inline_literal_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_inline_literal_a:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_inline_literal_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %dot = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false)
  ret i32 %dot
}

; Second operand is the constant splat <4, 4>.
define i32 @v_udot2_inline_literal_b(<2 x i16> %a, i32 %c) {
; GFX906-LABEL: v_udot2_inline_literal_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_inline_literal_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_inline_literal_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %dot = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
  ret i32 %dot
}

; Both vector operands are constant splats (<8, 8> and <4, 4>); the %a
; argument is declared but not used by the body.
define i32 @v_udot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
; GFX906-LABEL: v_udot2_inline_literal_a_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_inline_literal_a_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_inline_literal_a_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %dot = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
  ret i32 %dot
}

; Every operand is a constant: splats <8, 8> and <4, 4> plus accumulator 8.
define i32 @v_udot2_inline_literal_a_b_c() {
; GFX906-LABEL: v_udot2_inline_literal_a_b_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_inline_literal_a_b_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_inline_literal_a_b_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %dot = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false)
  ret i32 %dot
}

; Only the i32 accumulator operand is a constant (7).
define i32 @v_udot2_inline_literal_c(<2 x i16> %a, <2 x i16> %b) {
; GFX906-LABEL: v_udot2_inline_literal_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, 7
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_inline_literal_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, 7
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_inline_literal_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, 7
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %dot = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 7, i1 false)
  ret i32 %dot
}

; First operand is a <2 x half> that is negated with fneg and then bitcast to
; <2 x i16> before reaching the intrinsic.
; NOTE(review): the expected output folds the fneg into neg_lo/neg_hi on an
; integer dot instruction rather than emitting a sign-bit xor (compare the
; accumulator tests further down). Confirm these modifiers are meaningful for
; v_dot2_u32_u16; if not, these checks pin a miscompile.
define i32 @v_udot2_fneg_a(<2 x half> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2_fneg_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_fneg_a:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_fneg_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %a.neg = fneg <2 x half> %a
  %a.neg.bits = bitcast <2 x half> %a.neg to <2 x i16>
  %dot = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.neg.bits, <2 x i16> %b, i32 %c, i1 false)
  ret i32 %dot
}

; Second operand is a <2 x half> that is negated with fneg and then bitcast to
; <2 x i16> before reaching the intrinsic.
; NOTE(review): as with the first-operand variant, the expected output uses
; neg_lo/neg_hi on an integer dot instruction; confirm this is intended.
define i32 @v_udot2_fneg_b(<2 x i16> %a, <2 x half> %b, i32 %c) {
; GFX906-LABEL: v_udot2_fneg_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_fneg_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_fneg_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %b.neg = fneg <2 x half> %b
  %b.neg.bits = bitcast <2 x half> %b.neg to <2 x i16>
  %dot = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b.neg.bits, i32 %c, i1 false)
  ret i32 %dot
}

; Accumulator is a float that is negated and bitcast to i32. The expected
; output keeps the negate as a separate xor with 0x80000000.
define i32 @v_udot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
; GFX906-LABEL: v_udot2_fnegf32_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_fnegf32_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_fnegf32_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %c.neg = fneg float %c
  %c.neg.bits = bitcast float %c.neg to i32
  %dot = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c.neg.bits, i1 false)
  ret i32 %dot
}

; Accumulator is a <2 x half> that is negated and bitcast to i32. The expected
; output keeps the negate as a separate xor with 0x80008000.
define i32 @v_udot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
; GFX906-LABEL: v_udot2_fnegv2f16_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_fnegv2f16_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_fnegv2f16_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %c.neg = fneg <2 x half> %c
  %c.neg.bits = bitcast <2 x half> %c.neg to i32
  %dot = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c.neg.bits, i1 false)
  ret i32 %dot
}

; First operand has its two lanes swapped by a shufflevector with mask <1, 0>;
; the expected output performs the swap with v_alignbit_b32 by 16.
define i32 @v_udot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2_shuffle10_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_shuffle10_a:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_shuffle10_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %a.swapped = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
  %dot = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.swapped, <2 x i16> %b, i32 %c, i1 false)
  ret i32 %dot
}

; Second operand has its two lanes swapped by a shufflevector with mask <1, 0>.
define i32 @v_udot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2_shuffle10_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_shuffle10_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_shuffle10_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %b.swapped = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
  %dot = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b.swapped, i32 %c, i1 false)
  ret i32 %dot
}

; Intrinsic under test; its last parameter (the clamp flag) is an immarg.
declare i32 @llvm.amdgcn.udot2(<2 x i16>, <2 x i16>, i32, i1 immarg) #0

attributes #0 = { nounwind readnone speculatable }