; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64

declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16>, <16 x i16>, <4 x float>)
; Fixed intrinsic mangling: was ".b8f16.v16f16", which did not match the
; ".v8f16.v16f16" name used by every call site below (<8 x half> result).
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)

; @llvm.amdgcn.wmma.f32.16x16x16.f16

define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_f32_16x16x16_f16:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out, align 16
  ret void
}

; @llvm.amdgcn.wmma.f32.16x16x16.bf16

define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_f32_16x16x16_bf16:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out, align 16
  ret void
}

; @llvm.amdgcn.wmma.f16.16x16x16.f16

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
  store <8 x half> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_untied:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[32:35]
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
; W64-NEXT:    s_endpgm
bb:
  %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
  %res.1 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
  store <8 x half> %res.0, ptr addrspace(1) %out.0, align 32
  store <8 x half> %res.1, ptr addrspace(1) %out.1, align 32
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_tied:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_mov_b32_e32 v43, v35
; W64-NEXT:    v_mov_b32_e32 v42, v34
; W64-NEXT:    v_mov_b32_e32 v41, v33
; W64-NEXT:    v_mov_b32_e32 v40, v32
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
; W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[40:43]
; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
; W64-NEXT:    s_endpgm
bb:
  %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
  %res.1 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
  store <8 x half> %res.0, ptr addrspace(1) %out.0, align 32
  store <8 x half> %res.1, ptr addrspace(1) %out.1, align 32
  ret void
}

; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
  store <8 x i16> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
  store <8 x i16> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_untied:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[32:35]
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
; W64-NEXT:    s_endpgm
bb:
  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
  store <8 x i16> %res.0, ptr addrspace(1) %out.0, align 32
  store <8 x i16> %res.1, ptr addrspace(1) %out.1, align 32
  ret void
}

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_tied:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_mov_b32_e32 v43, v35
; W64-NEXT:    v_mov_b32_e32 v42, v34
; W64-NEXT:    v_mov_b32_e32 v41, v33
; W64-NEXT:    v_mov_b32_e32 v40, v32
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
; W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[40:43]
; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
; W64-NEXT:    s_endpgm
bb:
  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
  store <8 x i16> %res.0, ptr addrspace(1) %out.0, align 32
  store <8 x i16> %res.1, ptr addrspace(1) %out.1, align 32
  ret void
}

; @llvm.amdgcn.wmma.i32.16x16x16.iu8

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11]
; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}


define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp
; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

; @llvm.amdgcn.wmma.i32.16x16x16.iu4

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7]
; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp
; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}