; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

; Codegen checks for unsigned division and remainder on operands that are
; either narrow types (i8, i16) or i32 values whose high bits were cleared
; with a shl/lshr pair.
;
; Two kinds of expectation appear below:
;   * kernels whose checks list two int-to-float conversions, a reciprocal and
;     a float-to-int conversion expect the float-reciprocal based expansion;
;   * kernels whose checks forbid v_rcp_f32 (amdgcn) and RECIP_IEEE /
;     UINT_TO_FLT (r600) expect that expansion NOT to be used.
;
; Local value names carry the number of significant bits that remain after
; masking: a shift amount of 9 leaves 23 bits, 8 leaves 24, 7 leaves 25 and
; 16 leaves 16.
;
; NOTE(review): the positive amdgcn checks match v_rcp_iflag_f32, which also
; satisfies the "v_rcp_iflag present, v_rcp_f32 absent" pair used by the
; negative tests, so on amdgcn that pair alone may not tell the two expansions
; apart; the r600 checks appear to be the discriminating ones. Confirm before
; relying on the amdgcn negative checks.
;
; Maintenance: do not write a check prefix directly followed by a colon inside
; an explanatory comment; FileCheck would treat it as a directive.

; i8 udiv: both operands are loaded as adjacent bytes from %in.
; FUNC-LABEL: {{^}}udiv24_i8:
; SI: v_cvt_f32_ubyte
; SI-DAG: v_cvt_f32_ubyte
; SI-DAG: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @udiv24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
  %num = load i8, ptr addrspace(1) %in
  %den = load i8, ptr addrspace(1) %den_ptr
  %result = udiv i8 %num, %den
  store i8 %result, ptr addrspace(1) %out
  ret void
}

; Same body as @udiv24_i8, built with attribute group #0; the expected
; instruction sequence is the same as for the default-mode kernel.
; FUNC-LABEL: {{^}}udiv24_i8_denorm_flush_in_out:
; SI: v_cvt_f32_ubyte
; SI-DAG: v_cvt_f32_ubyte
; SI-DAG: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @udiv24_i8_denorm_flush_in_out(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
  %num = load i8, ptr addrspace(1) %in
  %den = load i8, ptr addrspace(1) %den_ptr
  %result = udiv i8 %num, %den
  store i8 %result, ptr addrspace(1) %out
  ret void
}

; Same body as @udiv24_i8, built with attribute group #1.
; FUNC-LABEL: {{^}}udiv24_i8_denorm_flush_in:
; SI: v_cvt_f32_ubyte
; SI-DAG: v_cvt_f32_ubyte
; SI-DAG: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @udiv24_i8_denorm_flush_in(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
  %num = load i8, ptr addrspace(1) %in
  %den = load i8, ptr addrspace(1) %den_ptr
  %result = udiv i8 %num, %den
  store i8 %result, ptr addrspace(1) %out
  ret void
}

; Same body as @udiv24_i8, built with attribute group #2.
; FUNC-LABEL: {{^}}udiv24_i8_denorm_flush_out:
; SI: v_cvt_f32_ubyte
; SI-DAG: v_cvt_f32_ubyte
; SI-DAG: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @udiv24_i8_denorm_flush_out(ptr addrspace(1) %out, ptr addrspace(1) %in) #2 {
  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
  %num = load i8, ptr addrspace(1) %in
  %den = load i8, ptr addrspace(1) %den_ptr
  %result = udiv i8 %num, %den
  store i8 %result, ptr addrspace(1) %out
  ret void
}

; i16 udiv: operands are two consecutive i16 elements of %in.
; FUNC-LABEL: {{^}}udiv24_i16:
; SI: v_cvt_f32_u32
; SI: v_cvt_f32_u32
; SI: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @udiv24_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %den_ptr = getelementptr i16, ptr addrspace(1) %in, i16 1
  %num = load i16, ptr addrspace(1) %in, align 2
  %den = load i16, ptr addrspace(1) %den_ptr, align 2
  %result = udiv i16 %num, %den
  store i16 %result, ptr addrspace(1) %out, align 2
  ret void
}

; i32 udiv with both operands reduced to 23 significant bits; the checks
; expect the float-reciprocal expansion.
; FUNC-LABEL: {{^}}udiv23_i32:
; SI: v_cvt_f32_u32
; SI-DAG: v_cvt_f32_u32
; SI-DAG: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @udiv23_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
  %num = load i32, ptr addrspace(1) %in, align 4
  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  ; shl 9 followed by lshr 9 clears the top 9 bits of each operand.
  %num.i23.0 = shl i32 %num, 9
  %den.i23.0 = shl i32 %den, 9
  %num.i23 = lshr i32 %num.i23.0, 9
  %den.i23 = lshr i32 %den.i23.0, 9
  %result = udiv i32 %num.i23, %den.i23
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}

; i32 udiv with both operands reduced to 24 significant bits; the checks
; require v_rcp_iflag, forbid v_rcp_f32, and forbid RECIP_IEEE on r600.
; FUNC-LABEL: {{^}}udiv24_i32:
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @udiv24_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
  %num = load i32, ptr addrspace(1) %in, align 4
  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  ; shl 8 followed by lshr 8 clears the top 8 bits of each operand.
  %num.i24.0 = shl i32 %num, 8
  %den.i24.0 = shl i32 %den, 8
  %num.i24 = lshr i32 %num.i24.0, 8
  %den.i24 = lshr i32 %den.i24.0, 8
  %result = udiv i32 %num.i24, %den.i24
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}

; Mixed widths: 23-bit numerator, 24-bit denominator. Same expectations as
; @udiv24_i32.
; FUNC-LABEL: {{^}}no_udiv24_u23_u24_i32:
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @no_udiv24_u23_u24_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
  %num = load i32, ptr addrspace(1) %in, align 4
  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  %num.i23.0 = shl i32 %num, 9
  %den.i24.0 = shl i32 %den, 8
  %num.i23 = lshr i32 %num.i23.0, 9
  %den.i24 = lshr i32 %den.i24.0, 8
  %result = udiv i32 %num.i23, %den.i24
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}

; Mixed widths: 24-bit numerator, 23-bit denominator. Same expectations as
; @udiv24_i32.
; FUNC-LABEL: {{^}}no_udiv24_u24_u23_i32:
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @no_udiv24_u24_u23_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
  %num = load i32, ptr addrspace(1) %in, align 4
  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  %num.i24.0 = shl i32 %num, 8
  %den.i23.0 = shl i32 %den, 9
  %num.i24 = lshr i32 %num.i24.0, 8
  %den.i23 = lshr i32 %den.i23.0, 9
  %result = udiv i32 %num.i24, %den.i23
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}

; i32 udiv with both operands reduced to 25 significant bits; r600 must emit
; neither UINT_TO_FLT nor RECIP_IEEE.
; FUNC-LABEL: {{^}}udiv25_i32:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32

; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @udiv25_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
  %num = load i32, ptr addrspace(1) %in, align 4
  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  ; shl 7 followed by lshr 7 clears only the top 7 bits of each operand.
  %num.i25.0 = shl i32 %num, 7
  %den.i25.0 = shl i32 %den, 7
  %num.i25 = lshr i32 %num.i25.0, 7
  %den.i25 = lshr i32 %den.i25.0, 7
  %result = udiv i32 %num.i25, %den.i25
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}

; udiv with a 24-bit numerator and a 25-bit denominator; same expectations as
; @udiv25_i32.
; FUNC-LABEL: {{^}}test_no_udiv24_i32_1:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32

; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @test_no_udiv24_i32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
  %num = load i32, ptr addrspace(1) %in, align 4
  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  %num.i24.0 = shl i32 %num, 8
  %den.i25.0 = shl i32 %den, 7
  %num.i24 = lshr i32 %num.i24.0, 8
  %den.i25 = lshr i32 %den.i25.0, 7
  %result = udiv i32 %num.i24, %den.i25
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}

; udiv with a 25-bit numerator and a 24-bit denominator; same expectations as
; @udiv25_i32.
; FUNC-LABEL: {{^}}test_no_udiv24_i32_2:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32

; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @test_no_udiv24_i32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
  %num = load i32, ptr addrspace(1) %in, align 4
  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  %num.i25.0 = shl i32 %num, 7
  %den.i24.0 = shl i32 %den, 8
  %num.i25 = lshr i32 %num.i25.0, 7
  %den.i24 = lshr i32 %den.i24.0, 8
  %result = udiv i32 %num.i25, %den.i24
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}

; i8 urem: same operand setup and expected sequence as @udiv24_i8.
; FUNC-LABEL: {{^}}urem24_i8:
; SI: v_cvt_f32_ubyte
; SI-DAG: v_cvt_f32_ubyte
; SI-DAG: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @urem24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
  %num = load i8, ptr addrspace(1) %in
  %den = load i8, ptr addrspace(1) %den_ptr
  %result = urem i8 %num, %den
  store i8 %result, ptr addrspace(1) %out
  ret void
}

; i16 urem: same operand setup and expected sequence as @udiv24_i16.
; FUNC-LABEL: {{^}}urem24_i16:
; SI: v_cvt_f32_u32
; SI: v_cvt_f32_u32
; SI: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @urem24_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %den_ptr = getelementptr i16, ptr addrspace(1) %in, i16 1
  %num = load i16, ptr addrspace(1) %in, align 2
  %den = load i16, ptr addrspace(1) %den_ptr, align 2
  %result = urem i16 %num, %den
  store i16 %result, ptr addrspace(1) %out, align 2
  ret void
}

; i32 urem with both operands reduced to 24 significant bits; only negative
; checks are present (no v_rcp_f32 on amdgcn, no RECIP_IEEE on r600).
; FUNC-LABEL: {{^}}urem24_i32:
; SI-NOT: v_rcp_f32
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @urem24_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
  %num = load i32, ptr addrspace(1) %in, align 4
  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  %num.i24.0 = shl i32 %num, 8
  %den.i24.0 = shl i32 %den, 8
  %num.i24 = lshr i32 %num.i24.0, 8
  %den.i24 = lshr i32 %den.i24.0, 8
  %result = urem i32 %num.i24, %den.i24
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}

; i32 urem with both operands reduced to 25 significant bits; r600 must emit
; neither UINT_TO_FLT nor RECIP_IEEE.
; FUNC-LABEL: {{^}}urem25_i32:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32

; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @urem25_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
  %num = load i32, ptr addrspace(1) %in, align 4
  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  ; shl 7 followed by lshr 7 clears only the top 7 bits of each operand.
  %num.i25.0 = shl i32 %num, 7
  %den.i25.0 = shl i32 %den, 7
  %num.i25 = lshr i32 %num.i25.0, 7
  %den.i25 = lshr i32 %den.i25.0, 7
  %result = urem i32 %num.i25, %den.i25
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}

; urem with a 24-bit numerator and a 25-bit denominator; same expectations as
; @urem25_i32.
; FUNC-LABEL: {{^}}test_no_urem24_i32_1:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32

; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @test_no_urem24_i32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
  %num = load i32, ptr addrspace(1) %in, align 4
  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  %num.i24.0 = shl i32 %num, 8
  %den.i25.0 = shl i32 %den, 7
  %num.i24 = lshr i32 %num.i24.0, 8
  %den.i25 = lshr i32 %den.i25.0, 7
  %result = urem i32 %num.i24, %den.i25
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}

; urem with a 25-bit numerator and a 24-bit denominator; same expectations as
; @urem25_i32.
; FUNC-LABEL: {{^}}test_no_urem24_i32_2:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32

; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
define amdgpu_kernel void @test_no_urem24_i32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
  %num = load i32, ptr addrspace(1) %in, align 4
  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  %num.i25.0 = shl i32 %num, 7
  %den.i24.0 = shl i32 %den, 8
  %num.i25 = lshr i32 %num.i25.0, 7
  %den.i24 = lshr i32 %den.i24.0, 8
  %result = urem i32 %num.i25, %den.i24
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}

; udiv with a 16-bit numerator and a 23-bit denominator; the checks expect a
; float reciprocal and, on amdgcn, an AND with the 23-bit mask 0x7fffff.
; FUNC-LABEL: {{^}}test_udiv24_u16_u23_i32:
; SI: v_rcp_iflag_f32
; SI: v_and_b32_e32 v{{[0-9]+}}, 0x7fffff,

; EG: RECIP_IEEE
define amdgpu_kernel void @test_udiv24_u16_u23_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
  %num = load i32, ptr addrspace(1) %in, align 4
  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  %num.i16.0 = shl i32 %num, 16
  %den.i23.0 = shl i32 %den, 9
  %num.i16 = lshr i32 %num.i16.0, 16
  %den.i23 = lshr i32 %den.i23.0, 9
  %result = udiv i32 %num.i16, %den.i23
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}

; udiv with a 23-bit numerator and a 16-bit denominator; same expectations as
; @test_udiv24_u16_u23_i32.
; FUNC-LABEL: {{^}}test_udiv24_u23_u16_i32:
; SI: v_rcp_iflag_f32
; SI: v_and_b32_e32 v{{[0-9]+}}, 0x7fffff,

; EG: RECIP_IEEE
define amdgpu_kernel void @test_udiv24_u23_u16_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
  %num = load i32, ptr addrspace(1) %in, align 4
  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  %num.i23.0 = shl i32 %num, 9
  %den.i16.0 = shl i32 %den, 16
  %num.i23 = lshr i32 %num.i23.0, 9
  %den.i16 = lshr i32 %den.i16.0, 16
  %result = udiv i32 %num.i23, %den.i16
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}

; f32 denormal-mode attribute groups used by the udiv24_i8_denorm_* kernels.
; The value is an "output-mode,input-mode" pair (see the LLVM LangRef), which
; matches the kernel names: #0 flushes both, #1 flushes inputs only, #2
; flushes outputs only.
attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { "denormal-fp-math-f32"="ieee,preserve-sign" }
attributes #2 = { "denormal-fp-math-f32"="preserve-sign,ieee" }