; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64

declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16>, <16 x i16>, <4 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)

; The tests demonstrate that the following WMMA register constraints are satisfied.
;
; v_wmma D, A, B, C
; A and B cannot overlap with D. C cannot partially overlap with D, but it is OK for them to be the same (which is the typical case).
;
; In each test,
; - first wmma instruction: the dest register D is different from all the sources
; - second wmma instruction: the dest register D and src2 (C) are the same
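;
; For example, in the first test below the result D lands in v[24:27], away from
; A in v[0:7], B in v[8:15], and C in v[16:19], while the second wmma accumulates
; in place, writing D back into C's registers v[16:19].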


; @llvm.amdgcn.wmma.f32.16x16x16.f16

define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_f32_16x16x16_f16:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f32_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
  %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %B, <16 x half> %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out, align 16
  store <4 x float> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

; @llvm.amdgcn.wmma.f32.16x16x16.bf16

define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_f32_16x16x16_bf16:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
  %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %B, <16 x i16> %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out, align 16
  store <4 x float> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

; @llvm.amdgcn.wmma.f16.16x16x16.f16
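; The trailing i1 immarg selects which half of each 32-bit C/D register holds
; the 16-bit values: 0 the low halves, 1 the high halves. With i1 1 it is
; printed as op_sel:[0,0,1] in the _hi tests below; the bf16 tests use the
; same convention.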

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_endpgm
bb:
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
  %res2 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %B, <16 x half> %B, <8 x half> %C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out, align 16
  store <8 x half> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_endpgm
bb:
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
  %res2 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %B, <16 x half> %B, <8 x half> %C, i1 1)
  store <8 x half> %res, ptr addrspace(1) %out, align 16
  store <8 x half> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_endpgm
bb:
  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
  %res2 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %B, <16 x i16> %B, <8 x i16> %C, i1 0)
  store <8 x i16> %res, ptr addrspace(1) %out, align 16
  store <8 x i16> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_endpgm
bb:
  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
  %res2 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %B, <16 x i16> %B, <8 x i16> %C, i1 1)
  store <8 x i16> %res, ptr addrspace(1) %out, align 16
  store <8 x i16> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

; @llvm.amdgcn.wmma.i32.16x16x16.iu8
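; The first and third i1 immargs mark A and B respectively as signed (1) or
; unsigned (0), which the disassembly carries in the neg_lo modifier; the final
; i1 enables saturation of the result and is printed as the clamp modifier.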

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11]
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11]
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}


define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0]
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0]
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0]
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] clamp
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] clamp
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

; @llvm.amdgcn.wmma.i32.16x16x16.iu4
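; Same flag scheme as the iu8 tests above, but A and B are <2 x i32>, i.e.
; sixteen packed 4-bit values each.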

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7]
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7]
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0]
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0]
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0]
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] clamp
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] clamp
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}