; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s

; Exercises GlobalISel selection of the signed saturating add intrinsics
; (llvm.sadd.sat.*) for AMDGPU. Each test comes in a v_ form (default calling
; convention; the expected code takes its operands from v0/v1) and an s_ form
; (amdgpu_ps with inreg arguments; the expected code takes them from s0/s1).
; The check lines below are generated: after a codegen change, rerun
; utils/update_llc_test_checks.py on this file instead of editing them by hand.

; i7 operands. The tahiti and fiji expectations expand the operation into
; min/max/sub/add on a left-shifted value; gfx900 and newer use a clamped
; 16-bit add followed by an arithmetic shift right.
define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) {
; GFX6-LABEL: v_saddsat_i7:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 25, v1
; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
; GFX6-NEXT:    v_max_i32_e32 v1, v3, v1
; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 25, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i7:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
; GFX8-NEXT:    v_sub_u16_e32 v3, 0x8000, v3
; GFX8-NEXT:    v_sub_u16_e32 v2, 0x7fff, v2
; GFX8-NEXT:    v_max_i16_e32 v1, v3, v1
; GFX8-NEXT:    v_min_i16_e32 v1, v1, v2
; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i7:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_i7:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 9, v0
; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 9, v1
; GFX10PLUS-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 9, v0
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs)
  ret i7 %result
}

; Same i7 operation as an amdgpu_ps function with inreg operands; on gfx900
; and newer the expected code moves the result back to s0 with
; v_readfirstlane_b32.
define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX6-LABEL: s_saddsat_i7:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_lshl_b32 s0, s0, 25
; GFX6-NEXT:    s_min_i32 s3, s0, 0
; GFX6-NEXT:    s_lshl_b32 s1, s1, 25
; GFX6-NEXT:    s_max_i32 s2, s0, 0
; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
; GFX6-NEXT:    s_max_i32 s1, s3, s1
; GFX6-NEXT:    s_min_i32 s1, s1, s2
; GFX6-NEXT:    s_add_i32 s0, s0, s1
; GFX6-NEXT:    s_ashr_i32 s0, s0, 25
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_saddsat_i7:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_lshl_b32 s0, s0, 9
; GFX8-NEXT:    s_sext_i32_i16 s2, s0
; GFX8-NEXT:    s_sext_i32_i16 s3, 0
; GFX8-NEXT:    s_max_i32 s4, s2, s3
; GFX8-NEXT:    s_min_i32 s2, s2, s3
; GFX8-NEXT:    s_lshl_b32 s1, s1, 9
; GFX8-NEXT:    s_sub_i32 s2, 0xffff8000, s2
; GFX8-NEXT:    s_sext_i32_i16 s2, s2
; GFX8-NEXT:    s_sext_i32_i16 s1, s1
; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
; GFX8-NEXT:    s_max_i32 s1, s2, s1
; GFX8-NEXT:    s_sext_i32_i16 s1, s1
; GFX8-NEXT:    s_sext_i32_i16 s2, s4
; GFX8-NEXT:    s_min_i32 s1, s1, s2
; GFX8-NEXT:    s_add_i32 s0, s0, s1
; GFX8-NEXT:    s_sext_i32_i16 s0, s0
; GFX8-NEXT:    s_ashr_i32 s0, s0, 9
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_saddsat_i7:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_lshl_b32 s1, s1, 9
; GFX9-NEXT:    s_lshl_b32 s0, s0, 9
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    v_add_i16 v0, s0, v0 clamp
; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_saddsat_i7:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 9
; GFX10PLUS-NEXT:    s_lshl_b32 s1, s1, 9
; GFX10PLUS-NEXT:    v_add_nc_i16 v0, s0, s1 clamp
; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 9, v0
; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs)
  ret i7 %result
}

; i8 operands, default calling convention. Same expected shape as the i7 test
; with shift amounts of 24 (tahiti) and 8 (16-bit paths).
define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
; GFX6-LABEL: v_saddsat_i8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
; GFX6-NEXT:    v_max_i32_e32 v1, v3, v1
; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i8:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
; GFX8-NEXT:    v_sub_u16_e32 v3, 0x8000, v3
; GFX8-NEXT:    v_sub_u16_e32 v2, 0x7fff, v2
; GFX8-NEXT:    v_max_i16_e32 v1, v3, v1
; GFX8-NEXT:    v_min_i16_e32 v1, v1, v2
; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_i8:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 8, v0
; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10PLUS-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 8, v0
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
  ret i8 %result
}

; i8 operands as an amdgpu_ps function with inreg operands.
define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX6-LABEL: s_saddsat_i8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
; GFX6-NEXT:    s_min_i32 s3, s0, 0
; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
; GFX6-NEXT:    s_max_i32 s2, s0, 0
; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
; GFX6-NEXT:    s_max_i32 s1, s3, s1
; GFX6-NEXT:    s_min_i32 s1, s1, s2
; GFX6-NEXT:    s_add_i32 s0, s0, s1
; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_saddsat_i8:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
; GFX8-NEXT:    s_sext_i32_i16 s2, s0
; GFX8-NEXT:    s_sext_i32_i16 s3, 0
; GFX8-NEXT:    s_max_i32 s4, s2, s3
; GFX8-NEXT:    s_min_i32 s2, s2, s3
; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
; GFX8-NEXT:    s_sub_i32 s2, 0xffff8000, s2
; GFX8-NEXT:    s_sext_i32_i16 s2, s2
; GFX8-NEXT:    s_sext_i32_i16 s1, s1
; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
; GFX8-NEXT:    s_max_i32 s1, s2, s1
; GFX8-NEXT:    s_sext_i32_i16 s1, s1
; GFX8-NEXT:    s_sext_i32_i16 s2, s4
; GFX8-NEXT:    s_min_i32 s1, s1, s2
; GFX8-NEXT:    s_add_i32 s0, s0, s1
; GFX8-NEXT:    s_sext_i32_i16 s0, s0
; GFX8-NEXT:    s_ashr_i32 s0, s0, 8
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_saddsat_i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
; GFX9-NEXT:    s_lshl_b32 s0, s0, 8
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    v_add_i16 v0, s0, v0 clamp
; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_saddsat_i8:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 8
; GFX10PLUS-NEXT:    s_lshl_b32 s1, s1, 8
; GFX10PLUS-NEXT:    v_add_nc_i16 v0, s0, s1 clamp
; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 8, v0
; GFX10PLUS-NEXT:    v_readfirstlane_b32 s0, v0
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
  ret i8 %result
}

; <2 x i8> operands. The vectors are passed and returned as i16 and bitcast
; inside the function; gfx900 and newer are expected to use a single packed
; clamped add (v_pk_add_i16).
define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX6-LABEL: v_saddsat_v2i8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
; GFX6-NEXT:    v_max_i32_e32 v4, 0, v0
; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x80000000, v5
; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
; GFX6-NEXT:    v_max_i32_e32 v1, v5, v1
; GFX6-NEXT:    v_min_i32_e32 v1, v1, v4
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
; GFX6-NEXT:    v_min_i32_e32 v4, 0, v1
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
; GFX6-NEXT:    v_max_i32_e32 v3, 0, v1
; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x80000000, v4
; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x7fffffff, v3
; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v2i8:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v2, 8
; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; GFX8-NEXT:    v_min_i16_e32 v5, 0, v0
; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; GFX8-NEXT:    v_max_i16_e32 v4, 0, v0
; GFX8-NEXT:    v_sub_u16_e32 v5, 0x8000, v5
; GFX8-NEXT:    v_sub_u16_e32 v4, 0x7fff, v4
; GFX8-NEXT:    v_max_i16_e32 v1, v5, v1
; GFX8-NEXT:    v_min_i16_e32 v1, v1, v4
; GFX8-NEXT:    v_min_i16_e32 v4, 0, v3
; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
; GFX8-NEXT:    v_max_i16_e32 v1, 0, v3
; GFX8-NEXT:    v_sub_u16_e32 v4, 0x8000, v4
; GFX8-NEXT:    v_sub_u16_e32 v1, 0x7fff, v1
; GFX8-NEXT:    v_max_i16_e32 v2, v4, v2
; GFX8-NEXT:    v_min_i16_e32 v1, v2, v1
; GFX8-NEXT:    v_add_u16_e32 v1, v3, v1
; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v2i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_saddsat_v2i8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
; GFX10-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_saddsat_v2i8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
; GFX11-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %lhs = bitcast i16 %lhs.arg to <2 x i8>
  %rhs = bitcast i16 %rhs.arg to <2 x i8>
  %result = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
  %cast.result = bitcast <2 x i8> %result to i16
  ret i16 %cast.result
}

; <2 x i8> operands (bitcast from i16) as an amdgpu_ps function with inreg
; operands.
define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX6-LABEL: s_saddsat_v2i8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
; GFX6-NEXT:    s_min_i32 s5, s0, 0
; GFX6-NEXT:    s_lshr_b32 s3, s1, 8
; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
; GFX6-NEXT:    s_max_i32 s4, s0, 0
; GFX6-NEXT:    s_sub_i32 s5, 0x80000000, s5
; GFX6-NEXT:    s_sub_i32 s4, 0x7fffffff, s4
; GFX6-NEXT:    s_max_i32 s1, s5, s1
; GFX6-NEXT:    s_min_i32 s1, s1, s4
; GFX6-NEXT:    s_add_i32 s0, s0, s1
; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
; GFX6-NEXT:    s_min_i32 s4, s1, 0
; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
; GFX6-NEXT:    s_max_i32 s3, s1, 0
; GFX6-NEXT:    s_sub_i32 s4, 0x80000000, s4
; GFX6-NEXT:    s_sub_i32 s3, 0x7fffffff, s3
; GFX6-NEXT:    s_max_i32 s2, s4, s2
; GFX6-NEXT:    s_min_i32 s2, s2, s3
; GFX6-NEXT:    s_add_i32 s1, s1, s2
; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
; GFX6-NEXT:    s_and_b32 s0, s0, 0xff
; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_saddsat_v2i8:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
; GFX8-NEXT:    s_sext_i32_i16 s4, s0
; GFX8-NEXT:    s_sext_i32_i16 s5, 0
; GFX8-NEXT:    s_max_i32 s6, s4, s5
; GFX8-NEXT:    s_min_i32 s4, s4, s5
; GFX8-NEXT:    s_lshr_b32 s3, s1, 8
; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
; GFX8-NEXT:    s_sub_i32 s4, 0xffff8000, s4
; GFX8-NEXT:    s_sext_i32_i16 s4, s4
; GFX8-NEXT:    s_sext_i32_i16 s1, s1
; GFX8-NEXT:    s_sub_i32 s6, 0x7fff, s6
; GFX8-NEXT:    s_max_i32 s1, s4, s1
; GFX8-NEXT:    s_sext_i32_i16 s1, s1
; GFX8-NEXT:    s_sext_i32_i16 s4, s6
; GFX8-NEXT:    s_min_i32 s1, s1, s4
; GFX8-NEXT:    s_add_i32 s0, s0, s1
; GFX8-NEXT:    s_lshl_b32 s1, s2, 8
; GFX8-NEXT:    s_lshl_b32 s2, s3, 8
; GFX8-NEXT:    s_sext_i32_i16 s3, s1
; GFX8-NEXT:    s_max_i32 s4, s3, s5
; GFX8-NEXT:    s_min_i32 s3, s3, s5
; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
; GFX8-NEXT:    s_sext_i32_i16 s3, s3
; GFX8-NEXT:    s_sext_i32_i16 s2, s2
; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
; GFX8-NEXT:    s_max_i32 s2, s3, s2
; GFX8-NEXT:    s_sext_i32_i16 s2, s2
; GFX8-NEXT:    s_sext_i32_i16 s3, s4
; GFX8-NEXT:    s_min_i32 s2, s2, s3
; GFX8-NEXT:    s_add_i32 s1, s1, s2
; GFX8-NEXT:    s_sext_i32_i16 s1, s1
; GFX8-NEXT:    s_sext_i32_i16 s0, s0
; GFX8-NEXT:    s_ashr_i32 s1, s1, 8
; GFX8-NEXT:    s_ashr_i32 s0, s0, 8
; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
; GFX8-NEXT:    s_or_b32 s0, s0, s1
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_saddsat_v2i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT:    s_lshr_b32 s2, s1, 16
; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_saddsat_v2i8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_lshr_b32 s2, s0, 8
; GFX10-NEXT:    s_lshr_b32 s3, s1, 8
; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
; GFX10-NEXT:    s_lshl_b32 s0, s0, 0x80008
; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
; GFX10-NEXT:    s_lshl_b32 s1, s1, 0x80008
; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
; GFX10-NEXT:    v_pk_add_i16 v0, s0, s1 clamp
; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_saddsat_v2i8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_lshr_b32 s2, s0, 8
; GFX11-NEXT:    s_lshr_b32 s3, s1, 8
; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
; GFX11-NEXT:    s_lshr_b32 s3, s1, 16
; GFX11-NEXT:    s_lshl_b32 s0, s0, 0x80008
; GFX11-NEXT:    s_lshl_b32 s2, s2, 8
; GFX11-NEXT:    s_lshl_b32 s1, s1, 0x80008
; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
; GFX11-NEXT:    v_pk_add_i16 v0, s0, s1 clamp
; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
; GFX11-NEXT:    ; return to shader part epilog
  %lhs = bitcast i16 %lhs.arg to <2 x i8>
  %rhs = bitcast i16 %rhs.arg to <2 x i8>
  %result = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
  %cast.result = bitcast <2 x i8> %result to i16
  ret i16 %cast.result
}

; <4 x i8> operands. The vectors are passed and returned as i32 and bitcast
; inside the function; gfx900 and newer are expected to use two packed clamped
; adds (v_pk_add_i16), one per pair of bytes.
define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
; GFX6-LABEL: v_saddsat_v4i8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
; GFX6-NEXT:    v_min_i32_e32 v10, 0, v0
; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
; GFX6-NEXT:    v_max_i32_e32 v8, 0, v0
; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v11, v10
; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 0x7fffffff, v8
; GFX6-NEXT:    v_max_i32_e32 v1, v10, v1
; GFX6-NEXT:    v_min_i32_e32 v1, v1, v8
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
; GFX6-NEXT:    v_min_i32_e32 v8, 0, v1
; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
; GFX6-NEXT:    v_max_i32_e32 v5, 0, v1
; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v11, v8
; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
; GFX6-NEXT:    v_max_i32_e32 v2, v8, v2
; GFX6-NEXT:    v_min_i32_e32 v2, v2, v5
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
; GFX6-NEXT:    v_min_i32_e32 v6, 0, v2
; GFX6-NEXT:    v_max_i32_e32 v5, 0, v2
; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v11, v6
; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
; GFX6-NEXT:    v_max_i32_e32 v3, v6, v3
; GFX6-NEXT:    v_min_i32_e32 v3, v3, v5
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
; GFX6-NEXT:    v_min_i32_e32 v6, 0, v3
; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v11, v6
; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v2
; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v3
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v4i8:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v2, 8
; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; GFX8-NEXT:    v_min_i16_e32 v9, 0, v0
; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; GFX8-NEXT:    v_max_i16_e32 v8, 0, v0
; GFX8-NEXT:    v_sub_u16_e32 v9, 0x8000, v9
; GFX8-NEXT:    v_sub_u16_e32 v8, 0x7fff, v8
; GFX8-NEXT:    v_max_i16_e32 v1, v9, v1
; GFX8-NEXT:    v_min_i16_e32 v1, v1, v8
; GFX8-NEXT:    v_min_i16_e32 v8, 0, v3
; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
; GFX8-NEXT:    v_max_i16_e32 v1, 0, v3
; GFX8-NEXT:    v_sub_u16_e32 v8, 0x8000, v8
; GFX8-NEXT:    v_sub_u16_e32 v1, 0x7fff, v1
; GFX8-NEXT:    v_max_i16_e32 v2, v8, v2
; GFX8-NEXT:    v_min_i16_e32 v1, v2, v1
; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
; GFX8-NEXT:    v_add_u16_e32 v1, v3, v1
; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v6
; GFX8-NEXT:    v_min_i16_e32 v6, 0, v2
; GFX8-NEXT:    v_max_i16_e32 v4, 0, v2
; GFX8-NEXT:    v_sub_u16_e32 v6, 0x8000, v6
; GFX8-NEXT:    v_sub_u16_e32 v4, 0x7fff, v4
; GFX8-NEXT:    v_max_i16_e32 v3, v6, v3
; GFX8-NEXT:    v_min_i16_e32 v3, v3, v4
; GFX8-NEXT:    v_add_u16_e32 v2, v2, v3
; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v5
; GFX8-NEXT:    v_min_i16_e32 v6, 0, v3
; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 8, v7
; GFX8-NEXT:    v_max_i16_e32 v5, 0, v3
; GFX8-NEXT:    v_sub_u16_e32 v6, 0x8000, v6
; GFX8-NEXT:    v_sub_u16_e32 v5, 0x7fff, v5
; GFX8-NEXT:    v_max_i16_e32 v4, v6, v4
; GFX8-NEXT:    v_min_i16_e32 v4, v4, v5
; GFX8-NEXT:    v_add_u16_e32 v3, v3, v4
; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v4i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff, v0
; GFX9-NEXT:    v_alignbit_b32 v0, v3, v0, 16
; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
; GFX9-NEXT:    v_lshl_or_b32 v2, v2, 16, v6
; GFX9-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
; GFX9-NEXT:    v_alignbit_b32 v1, v5, v1, 16
; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
; GFX9-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT:    v_pk_add_i16 v2, v2, v3 clamp
; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1]
; GFX9-NEXT:    v_mov_b32_e32 v3, 8
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX9-NEXT:    v_and_or_b32 v1, v1, v2, v3
; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v0
; GFX9-NEXT:    v_mov_b32_e32 v3, 24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_saddsat_v4i8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX10-NEXT:    v_alignbit_b32 v0, v3, v0, 16
; GFX10-NEXT:    v_lshl_or_b32 v2, v2, 16, v4
; GFX10-NEXT:    v_mov_b32_e32 v4, 24
; GFX10-NEXT:    v_lshl_or_b32 v3, v5, 16, v6
; GFX10-NEXT:    v_alignbit_b32 v1, v7, v1, 16
; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_add_i16 v2, v2, v3 clamp
; GFX10-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
; GFX10-NEXT:    v_mov_b32_e32 v1, 8
; GFX10-NEXT:    v_pk_ashrrev_i16 v2, 8, v2 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v0
; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v2, v1
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_saddsat_v4i8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v0
; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX11-NEXT:    v_lshl_or_b32 v2, v2, 16, v4
; GFX11-NEXT:    v_lshl_or_b32 v3, v3, 16, v5
; GFX11-NEXT:    v_alignbit_b32 v0, v6, v0, 16
; GFX11-NEXT:    v_alignbit_b32 v1, v7, v1, 16
; GFX11-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_add_i16 v2, v2, v3 clamp
; GFX11-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 8
; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v0
; GFX11-NEXT:    v_bfe_u32 v0, v0, 16, 8
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
; GFX11-NEXT:    v_or3_b32 v0, v1, v3, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %lhs = bitcast i32 %lhs.arg to <4 x i8>
  %rhs = bitcast i32 %rhs.arg to <4 x i8>
  %result = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
  %cast.result = bitcast <4 x i8> %result to i32
  ret i32 %cast.result
}

define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX6-LABEL: s_saddsat_v4i8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
; GFX6-NEXT:    s_lshr_b32 s3, s0, 16
; GFX6-NEXT:    s_lshr_b32 s4, s0, 24
; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
; GFX6-NEXT:    s_min_i32 s9, s0, 0
; GFX6-NEXT:    s_lshr_b32 s5, s1, 8
; GFX6-NEXT:    s_lshr_b32 s6, s1, 16
; GFX6-NEXT:    s_lshr_b32 s7, s1, 24
; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
; GFX6-NEXT:    s_max_i32 s8, s0, 0
; GFX6-NEXT:    s_sub_i32 s9, 0x80000000, s9
; GFX6-NEXT:    s_sub_i32 s8, 0x7fffffff, s8
; GFX6-NEXT:    s_max_i32 s1, s9, s1
; GFX6-NEXT:    s_min_i32 s1, s1, s8
; GFX6-NEXT:    s_add_i32 s0, s0, s1
; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
; GFX6-NEXT:    s_min_i32 s8, s1, 0
; GFX6-NEXT:    s_lshl_b32 s2, s5, 24
; GFX6-NEXT:    s_max_i32 s5, s1, 0
; GFX6-NEXT:    s_sub_i32 s8, 0x80000000, s8
; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
; GFX6-NEXT:    s_max_i32 s2, s8, s2
; GFX6-NEXT:    s_min_i32 s2, s2, s5
;
GFX6-NEXT: s_add_i32 s1, s1, s2 750; GFX6-NEXT: s_lshl_b32 s2, s3, 24 751; GFX6-NEXT: s_lshl_b32 s3, s6, 24 752; GFX6-NEXT: s_min_i32 s6, s2, 0 753; GFX6-NEXT: s_max_i32 s5, s2, 0 754; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 755; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 756; GFX6-NEXT: s_max_i32 s3, s6, s3 757; GFX6-NEXT: s_min_i32 s3, s3, s5 758; GFX6-NEXT: s_add_i32 s2, s2, s3 759; GFX6-NEXT: s_lshl_b32 s3, s4, 24 760; GFX6-NEXT: s_min_i32 s6, s3, 0 761; GFX6-NEXT: s_ashr_i32 s1, s1, 24 762; GFX6-NEXT: s_lshl_b32 s4, s7, 24 763; GFX6-NEXT: s_max_i32 s5, s3, 0 764; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 765; GFX6-NEXT: s_ashr_i32 s0, s0, 24 766; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 767; GFX6-NEXT: s_max_i32 s4, s6, s4 768; GFX6-NEXT: s_and_b32 s1, s1, 0xff 769; GFX6-NEXT: s_ashr_i32 s2, s2, 24 770; GFX6-NEXT: s_min_i32 s4, s4, s5 771; GFX6-NEXT: s_and_b32 s0, s0, 0xff 772; GFX6-NEXT: s_lshl_b32 s1, s1, 8 773; GFX6-NEXT: s_add_i32 s3, s3, s4 774; GFX6-NEXT: s_or_b32 s0, s0, s1 775; GFX6-NEXT: s_and_b32 s1, s2, 0xff 776; GFX6-NEXT: s_ashr_i32 s3, s3, 24 777; GFX6-NEXT: s_lshl_b32 s1, s1, 16 778; GFX6-NEXT: s_or_b32 s0, s0, s1 779; GFX6-NEXT: s_and_b32 s1, s3, 0xff 780; GFX6-NEXT: s_lshl_b32 s1, s1, 24 781; GFX6-NEXT: s_or_b32 s0, s0, s1 782; GFX6-NEXT: ; return to shader part epilog 783; 784; GFX8-LABEL: s_saddsat_v4i8: 785; GFX8: ; %bb.0: 786; GFX8-NEXT: s_lshr_b32 s2, s0, 8 787; GFX8-NEXT: s_lshr_b32 s3, s0, 16 788; GFX8-NEXT: s_lshr_b32 s4, s0, 24 789; GFX8-NEXT: s_lshl_b32 s0, s0, 8 790; GFX8-NEXT: s_sext_i32_i16 s8, s0 791; GFX8-NEXT: s_sext_i32_i16 s9, 0 792; GFX8-NEXT: s_max_i32 s10, s8, s9 793; GFX8-NEXT: s_min_i32 s8, s8, s9 794; GFX8-NEXT: s_lshr_b32 s5, s1, 8 795; GFX8-NEXT: s_lshr_b32 s6, s1, 16 796; GFX8-NEXT: s_lshr_b32 s7, s1, 24 797; GFX8-NEXT: s_lshl_b32 s1, s1, 8 798; GFX8-NEXT: s_sub_i32 s8, 0xffff8000, s8 799; GFX8-NEXT: s_sext_i32_i16 s8, s8 800; GFX8-NEXT: s_sext_i32_i16 s1, s1 801; GFX8-NEXT: s_sub_i32 s10, 0x7fff, s10 802; GFX8-NEXT: 
s_max_i32 s1, s8, s1 803; GFX8-NEXT: s_sext_i32_i16 s1, s1 804; GFX8-NEXT: s_sext_i32_i16 s8, s10 805; GFX8-NEXT: s_min_i32 s1, s1, s8 806; GFX8-NEXT: s_add_i32 s0, s0, s1 807; GFX8-NEXT: s_lshl_b32 s1, s2, 8 808; GFX8-NEXT: s_lshl_b32 s2, s5, 8 809; GFX8-NEXT: s_sext_i32_i16 s5, s1 810; GFX8-NEXT: s_max_i32 s8, s5, s9 811; GFX8-NEXT: s_min_i32 s5, s5, s9 812; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 813; GFX8-NEXT: s_sext_i32_i16 s5, s5 814; GFX8-NEXT: s_sext_i32_i16 s2, s2 815; GFX8-NEXT: s_sub_i32 s8, 0x7fff, s8 816; GFX8-NEXT: s_max_i32 s2, s5, s2 817; GFX8-NEXT: s_sext_i32_i16 s2, s2 818; GFX8-NEXT: s_sext_i32_i16 s5, s8 819; GFX8-NEXT: s_min_i32 s2, s2, s5 820; GFX8-NEXT: s_add_i32 s1, s1, s2 821; GFX8-NEXT: s_lshl_b32 s2, s3, 8 822; GFX8-NEXT: s_sext_i32_i16 s5, s2 823; GFX8-NEXT: s_lshl_b32 s3, s6, 8 824; GFX8-NEXT: s_max_i32 s6, s5, s9 825; GFX8-NEXT: s_min_i32 s5, s5, s9 826; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 827; GFX8-NEXT: s_sext_i32_i16 s5, s5 828; GFX8-NEXT: s_sext_i32_i16 s3, s3 829; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 830; GFX8-NEXT: s_max_i32 s3, s5, s3 831; GFX8-NEXT: s_sext_i32_i16 s3, s3 832; GFX8-NEXT: s_sext_i32_i16 s5, s6 833; GFX8-NEXT: s_min_i32 s3, s3, s5 834; GFX8-NEXT: s_add_i32 s2, s2, s3 835; GFX8-NEXT: s_lshl_b32 s3, s4, 8 836; GFX8-NEXT: s_sext_i32_i16 s5, s3 837; GFX8-NEXT: s_max_i32 s6, s5, s9 838; GFX8-NEXT: s_min_i32 s5, s5, s9 839; GFX8-NEXT: s_lshl_b32 s4, s7, 8 840; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 841; GFX8-NEXT: s_sext_i32_i16 s5, s5 842; GFX8-NEXT: s_sext_i32_i16 s4, s4 843; GFX8-NEXT: s_sext_i32_i16 s1, s1 844; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 845; GFX8-NEXT: s_max_i32 s4, s5, s4 846; GFX8-NEXT: s_sext_i32_i16 s0, s0 847; GFX8-NEXT: s_ashr_i32 s1, s1, 8 848; GFX8-NEXT: s_sext_i32_i16 s4, s4 849; GFX8-NEXT: s_sext_i32_i16 s5, s6 850; GFX8-NEXT: s_ashr_i32 s0, s0, 8 851; GFX8-NEXT: s_sext_i32_i16 s2, s2 852; GFX8-NEXT: s_min_i32 s4, s4, s5 853; GFX8-NEXT: s_and_b32 s1, s1, 0xff 854; GFX8-NEXT: s_ashr_i32 s2, 
s2, 8 855; GFX8-NEXT: s_add_i32 s3, s3, s4 856; GFX8-NEXT: s_and_b32 s0, s0, 0xff 857; GFX8-NEXT: s_lshl_b32 s1, s1, 8 858; GFX8-NEXT: s_sext_i32_i16 s3, s3 859; GFX8-NEXT: s_or_b32 s0, s0, s1 860; GFX8-NEXT: s_and_b32 s1, s2, 0xff 861; GFX8-NEXT: s_ashr_i32 s3, s3, 8 862; GFX8-NEXT: s_lshl_b32 s1, s1, 16 863; GFX8-NEXT: s_or_b32 s0, s0, s1 864; GFX8-NEXT: s_and_b32 s1, s3, 0xff 865; GFX8-NEXT: s_lshl_b32 s1, s1, 24 866; GFX8-NEXT: s_or_b32 s0, s0, s1 867; GFX8-NEXT: ; return to shader part epilog 868; 869; GFX9-LABEL: s_saddsat_v4i8: 870; GFX9: ; %bb.0: 871; GFX9-NEXT: s_lshr_b32 s2, s0, 8 872; GFX9-NEXT: s_lshr_b32 s3, s0, 16 873; GFX9-NEXT: s_lshr_b32 s4, s0, 24 874; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 875; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 876; GFX9-NEXT: s_lshr_b32 s4, s0, 16 877; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 878; GFX9-NEXT: s_lshl_b32 s4, s4, 8 879; GFX9-NEXT: s_lshr_b32 s5, s1, 8 880; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 881; GFX9-NEXT: s_lshr_b32 s4, s2, 16 882; GFX9-NEXT: s_lshr_b32 s6, s1, 16 883; GFX9-NEXT: s_lshr_b32 s7, s1, 24 884; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 885; GFX9-NEXT: s_lshl_b32 s2, s2, 0x80008 886; GFX9-NEXT: s_lshl_b32 s4, s4, 8 887; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 888; GFX9-NEXT: s_lshr_b32 s4, s1, 16 889; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s7 890; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 891; GFX9-NEXT: s_lshl_b32 s4, s4, 8 892; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 893; GFX9-NEXT: s_lshr_b32 s4, s3, 16 894; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 895; GFX9-NEXT: s_lshl_b32 s4, s4, 8 896; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 897; GFX9-NEXT: v_mov_b32_e32 v0, s1 898; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp 899; GFX9-NEXT: v_mov_b32_e32 v1, s3 900; GFX9-NEXT: v_pk_add_i16 v1, s2, v1 clamp 901; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 902; GFX9-NEXT: v_mov_b32_e32 v3, 8 903; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] 904; GFX9-NEXT: v_mov_b32_e32 v2, 0xff 905; GFX9-NEXT: 
v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 906; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 907; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 908; GFX9-NEXT: v_mov_b32_e32 v3, 24 909; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 910; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 911; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 912; GFX9-NEXT: v_readfirstlane_b32 s0, v0 913; GFX9-NEXT: ; return to shader part epilog 914; 915; GFX10-LABEL: s_saddsat_v4i8: 916; GFX10: ; %bb.0: 917; GFX10-NEXT: s_lshr_b32 s2, s0, 8 918; GFX10-NEXT: s_lshr_b32 s3, s0, 16 919; GFX10-NEXT: s_lshr_b32 s4, s0, 24 920; GFX10-NEXT: s_lshr_b32 s5, s1, 8 921; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 922; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 923; GFX10-NEXT: s_lshr_b32 s6, s1, 16 924; GFX10-NEXT: s_lshr_b32 s7, s1, 24 925; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 926; GFX10-NEXT: s_lshr_b32 s4, s0, 16 927; GFX10-NEXT: s_lshr_b32 s5, s2, 16 928; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 929; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 930; GFX10-NEXT: s_lshl_b32 s4, s4, 8 931; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 932; GFX10-NEXT: s_lshl_b32 s5, s5, 8 933; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 934; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5 935; GFX10-NEXT: s_lshr_b32 s4, s1, 16 936; GFX10-NEXT: s_lshr_b32 s5, s3, 16 937; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 938; GFX10-NEXT: s_lshl_b32 s4, s4, 8 939; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008 940; GFX10-NEXT: s_lshl_b32 s5, s5, 8 941; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 942; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 943; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp 944; GFX10-NEXT: v_pk_add_i16 v1, s2, s3 clamp 945; GFX10-NEXT: v_mov_b32_e32 v2, 8 946; GFX10-NEXT: v_mov_b32_e32 v4, 24 947; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 948; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] 949; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 950; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 951; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 952; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2 953; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 954; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 955; GFX10-NEXT: v_readfirstlane_b32 s0, v0 956; GFX10-NEXT: ; return to shader part epilog 957; 958; GFX11-LABEL: s_saddsat_v4i8: 959; GFX11: ; %bb.0: 960; GFX11-NEXT: s_lshr_b32 s2, s0, 8 961; GFX11-NEXT: s_lshr_b32 s3, s0, 24 962; GFX11-NEXT: s_lshr_b32 s4, s1, 8 963; GFX11-NEXT: s_lshr_b32 s5, s1, 24 964; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2 965; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s3 966; GFX11-NEXT: s_pack_ll_b32_b16 s3, s1, s4 967; GFX11-NEXT: s_lshr_b32 s4, s2, 16 968; GFX11-NEXT: s_pack_hl_b32_b16 s1, s1, s5 969; GFX11-NEXT: s_lshr_b32 s5, s3, 16 970; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008 971; GFX11-NEXT: s_lshl_b32 s4, s4, 8 972; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008 973; GFX11-NEXT: s_lshl_b32 s5, s5, 8 974; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4 975; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5 976; GFX11-NEXT: s_lshr_b32 s4, s0, 16 977; GFX11-NEXT: s_lshr_b32 s5, s1, 16 978; GFX11-NEXT: v_pk_add_i16 v0, s2, s3 clamp 979; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008 980; GFX11-NEXT: s_lshl_b32 s4, s4, 8 981; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008 982; GFX11-NEXT: s_lshl_b32 s2, s5, 8 983; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4 984; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2 985; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 986; GFX11-NEXT: v_pk_add_i16 v1, s0, s1 clamp 987; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 988; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] 989; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 990; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1 991; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 992; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2 993; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 
994; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 995; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 996; GFX11-NEXT: v_readfirstlane_b32 s0, v0 997; GFX11-NEXT: ; return to shader part epilog 998 %lhs = bitcast i32 %lhs.arg to <4 x i8> 999 %rhs = bitcast i32 %rhs.arg to <4 x i8> 1000 %result = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs) 1001 %cast.result = bitcast <4 x i8> %result to i32 1002 ret i32 %cast.result 1003} 1004 1005define i24 @v_saddsat_i24(i24 %lhs, i24 %rhs) { 1006; GFX6-LABEL: v_saddsat_i24: 1007; GFX6: ; %bb.0: 1008; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1009; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1010; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 1011; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1012; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 1013; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 1014; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 1015; GFX6-NEXT: v_max_i32_e32 v1, v3, v1 1016; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 1017; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1018; GFX6-NEXT: v_ashrrev_i32_e32 v0, 8, v0 1019; GFX6-NEXT: s_setpc_b64 s[30:31] 1020; 1021; GFX8-LABEL: v_saddsat_i24: 1022; GFX8: ; %bb.0: 1023; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1024; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 1025; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 24 1026; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 24 1027; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 1028; GFX8-NEXT: v_bfe_i32 v0, v1, 0, 24 1029; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v0 1030; GFX8-NEXT: v_ashrrev_i32_e32 v0, 23, v3 1031; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xff800000, v0 1032; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 1033; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 1034; GFX8-NEXT: s_setpc_b64 s[30:31] 1035; 1036; GFX9-LABEL: v_saddsat_i24: 1037; GFX9: ; %bb.0: 1038; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1039; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1040; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1041; GFX9-NEXT: v_add_i32 v0, v0, v1 clamp 1042; GFX9-NEXT: 
v_ashrrev_i32_e32 v0, 8, v0 1043; GFX9-NEXT: s_setpc_b64 s[30:31] 1044; 1045; GFX10PLUS-LABEL: v_saddsat_i24: 1046; GFX10PLUS: ; %bb.0: 1047; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1048; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1049; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1050; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, v1 clamp 1051; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, 8, v0 1052; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1053 %result = call i24 @llvm.sadd.sat.i24(i24 %lhs, i24 %rhs) 1054 ret i24 %result 1055} 1056 1057define amdgpu_ps i24 @s_saddsat_i24(i24 inreg %lhs, i24 inreg %rhs) { 1058; GFX6-LABEL: s_saddsat_i24: 1059; GFX6: ; %bb.0: 1060; GFX6-NEXT: s_lshl_b32 s0, s0, 8 1061; GFX6-NEXT: s_min_i32 s3, s0, 0 1062; GFX6-NEXT: s_lshl_b32 s1, s1, 8 1063; GFX6-NEXT: s_max_i32 s2, s0, 0 1064; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 1065; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 1066; GFX6-NEXT: s_max_i32 s1, s3, s1 1067; GFX6-NEXT: s_min_i32 s1, s1, s2 1068; GFX6-NEXT: s_add_i32 s0, s0, s1 1069; GFX6-NEXT: s_ashr_i32 s0, s0, 8 1070; GFX6-NEXT: ; return to shader part epilog 1071; 1072; GFX8-LABEL: s_saddsat_i24: 1073; GFX8: ; %bb.0: 1074; GFX8-NEXT: s_add_i32 s2, s0, s1 1075; GFX8-NEXT: s_bfe_i32 s3, s2, 0x180000 1076; GFX8-NEXT: s_bfe_i32 s0, s0, 0x180000 1077; GFX8-NEXT: s_cmp_lt_i32 s3, s0 1078; GFX8-NEXT: s_cselect_b32 s0, 1, 0 1079; GFX8-NEXT: s_bfe_i32 s1, s1, 0x180000 1080; GFX8-NEXT: s_cmp_lt_i32 s1, 0 1081; GFX8-NEXT: s_cselect_b32 s1, 1, 0 1082; GFX8-NEXT: s_xor_b32 s0, s1, s0 1083; GFX8-NEXT: s_ashr_i32 s1, s3, 23 1084; GFX8-NEXT: s_add_i32 s1, s1, 0xff800000 1085; GFX8-NEXT: s_and_b32 s0, s0, 1 1086; GFX8-NEXT: s_cmp_lg_u32 s0, 0 1087; GFX8-NEXT: s_cselect_b32 s0, s1, s2 1088; GFX8-NEXT: ; return to shader part epilog 1089; 1090; GFX9-LABEL: s_saddsat_i24: 1091; GFX9: ; %bb.0: 1092; GFX9-NEXT: s_lshl_b32 s1, s1, 8 1093; GFX9-NEXT: s_lshl_b32 s0, s0, 8 1094; GFX9-NEXT: v_mov_b32_e32 v0, s1 1095; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp 
1096; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0 1097; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1098; GFX9-NEXT: ; return to shader part epilog 1099; 1100; GFX10PLUS-LABEL: s_saddsat_i24: 1101; GFX10PLUS: ; %bb.0: 1102; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 1103; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 1104; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s1 clamp 1105; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, 8, v0 1106; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1107; GFX10PLUS-NEXT: ; return to shader part epilog 1108 %result = call i24 @llvm.sadd.sat.i24(i24 %lhs, i24 %rhs) 1109 ret i24 %result 1110} 1111 1112define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) { 1113; GFX6-LABEL: v_saddsat_i32: 1114; GFX6: ; %bb.0: 1115; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1116; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 1117; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 1118; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 1119; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 1120; GFX6-NEXT: v_max_i32_e32 v1, v3, v1 1121; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 1122; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1123; GFX6-NEXT: s_setpc_b64 s[30:31] 1124; 1125; GFX8-LABEL: v_saddsat_i32: 1126; GFX8: ; %bb.0: 1127; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1128; GFX8-NEXT: v_min_i32_e32 v3, 0, v0 1129; GFX8-NEXT: v_max_i32_e32 v2, 0, v0 1130; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 0x80000000, v3 1131; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 0x7fffffff, v2 1132; GFX8-NEXT: v_max_i32_e32 v1, v3, v1 1133; GFX8-NEXT: v_min_i32_e32 v1, v1, v2 1134; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 1135; GFX8-NEXT: s_setpc_b64 s[30:31] 1136; 1137; GFX9-LABEL: v_saddsat_i32: 1138; GFX9: ; %bb.0: 1139; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1140; GFX9-NEXT: v_add_i32 v0, v0, v1 clamp 1141; GFX9-NEXT: s_setpc_b64 s[30:31] 1142; 1143; GFX10PLUS-LABEL: v_saddsat_i32: 1144; GFX10PLUS: ; %bb.0: 1145; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1146; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, v1 clamp 1147; GFX10PLUS-NEXT: 
s_setpc_b64 s[30:31] 1148 %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs) 1149 ret i32 %result 1150} 1151 1152define amdgpu_ps i32 @s_saddsat_i32(i32 inreg %lhs, i32 inreg %rhs) { 1153; GFX6-LABEL: s_saddsat_i32: 1154; GFX6: ; %bb.0: 1155; GFX6-NEXT: s_min_i32 s3, s0, 0 1156; GFX6-NEXT: s_max_i32 s2, s0, 0 1157; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 1158; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 1159; GFX6-NEXT: s_max_i32 s1, s3, s1 1160; GFX6-NEXT: s_min_i32 s1, s1, s2 1161; GFX6-NEXT: s_add_i32 s0, s0, s1 1162; GFX6-NEXT: ; return to shader part epilog 1163; 1164; GFX8-LABEL: s_saddsat_i32: 1165; GFX8: ; %bb.0: 1166; GFX8-NEXT: s_min_i32 s3, s0, 0 1167; GFX8-NEXT: s_max_i32 s2, s0, 0 1168; GFX8-NEXT: s_sub_i32 s3, 0x80000000, s3 1169; GFX8-NEXT: s_sub_i32 s2, 0x7fffffff, s2 1170; GFX8-NEXT: s_max_i32 s1, s3, s1 1171; GFX8-NEXT: s_min_i32 s1, s1, s2 1172; GFX8-NEXT: s_add_i32 s0, s0, s1 1173; GFX8-NEXT: ; return to shader part epilog 1174; 1175; GFX9-LABEL: s_saddsat_i32: 1176; GFX9: ; %bb.0: 1177; GFX9-NEXT: v_mov_b32_e32 v0, s1 1178; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp 1179; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1180; GFX9-NEXT: ; return to shader part epilog 1181; 1182; GFX10PLUS-LABEL: s_saddsat_i32: 1183; GFX10PLUS: ; %bb.0: 1184; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s1 clamp 1185; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1186; GFX10PLUS-NEXT: ; return to shader part epilog 1187 %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs) 1188 ret i32 %result 1189} 1190 1191define amdgpu_ps float @saddsat_i32_sv(i32 inreg %lhs, i32 %rhs) { 1192; GFX6-LABEL: saddsat_i32_sv: 1193; GFX6: ; %bb.0: 1194; GFX6-NEXT: s_min_i32 s2, s0, 0 1195; GFX6-NEXT: s_max_i32 s1, s0, 0 1196; GFX6-NEXT: s_sub_i32 s2, 0x80000000, s2 1197; GFX6-NEXT: s_sub_i32 s1, 0x7fffffff, s1 1198; GFX6-NEXT: v_max_i32_e32 v0, s2, v0 1199; GFX6-NEXT: v_min_i32_e32 v0, s1, v0 1200; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 1201; GFX6-NEXT: ; return to shader part epilog 1202; 1203; 
GFX8-LABEL: saddsat_i32_sv: 1204; GFX8: ; %bb.0: 1205; GFX8-NEXT: s_min_i32 s2, s0, 0 1206; GFX8-NEXT: s_max_i32 s1, s0, 0 1207; GFX8-NEXT: s_sub_i32 s2, 0x80000000, s2 1208; GFX8-NEXT: s_sub_i32 s1, 0x7fffffff, s1 1209; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 1210; GFX8-NEXT: v_min_i32_e32 v0, s1, v0 1211; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1212; GFX8-NEXT: ; return to shader part epilog 1213; 1214; GFX9-LABEL: saddsat_i32_sv: 1215; GFX9: ; %bb.0: 1216; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp 1217; GFX9-NEXT: ; return to shader part epilog 1218; 1219; GFX10PLUS-LABEL: saddsat_i32_sv: 1220; GFX10PLUS: ; %bb.0: 1221; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, v0 clamp 1222; GFX10PLUS-NEXT: ; return to shader part epilog 1223 %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs) 1224 %cast = bitcast i32 %result to float 1225 ret float %cast 1226} 1227 1228define amdgpu_ps float @saddsat_i32_vs(i32 %lhs, i32 inreg %rhs) { 1229; GFX6-LABEL: saddsat_i32_vs: 1230; GFX6: ; %bb.0: 1231; GFX6-NEXT: v_min_i32_e32 v2, 0, v0 1232; GFX6-NEXT: v_max_i32_e32 v1, 0, v0 1233; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x80000000, v2 1234; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0x7fffffff, v1 1235; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 1236; GFX6-NEXT: v_min_i32_e32 v1, v2, v1 1237; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1238; GFX6-NEXT: ; return to shader part epilog 1239; 1240; GFX8-LABEL: saddsat_i32_vs: 1241; GFX8: ; %bb.0: 1242; GFX8-NEXT: v_min_i32_e32 v2, 0, v0 1243; GFX8-NEXT: v_max_i32_e32 v1, 0, v0 1244; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 0x80000000, v2 1245; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 0x7fffffff, v1 1246; GFX8-NEXT: v_max_i32_e32 v2, s0, v2 1247; GFX8-NEXT: v_min_i32_e32 v1, v2, v1 1248; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 1249; GFX8-NEXT: ; return to shader part epilog 1250; 1251; GFX9-LABEL: saddsat_i32_vs: 1252; GFX9: ; %bb.0: 1253; GFX9-NEXT: v_add_i32 v0, v0, s0 clamp 1254; GFX9-NEXT: ; return to shader part epilog 1255; 1256; GFX10PLUS-LABEL: saddsat_i32_vs: 1257; 
GFX10PLUS: ; %bb.0: 1258; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, s0 clamp 1259; GFX10PLUS-NEXT: ; return to shader part epilog 1260 %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs) 1261 %cast = bitcast i32 %result to float 1262 ret float %cast 1263} 1264 1265define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { 1266; GFX6-LABEL: v_saddsat_v2i32: 1267; GFX6: ; %bb.0: 1268; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1269; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 1270; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 1271; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5 1272; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4 1273; GFX6-NEXT: v_max_i32_e32 v2, v5, v2 1274; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 1275; GFX6-NEXT: v_min_i32_e32 v4, 0, v1 1276; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1277; GFX6-NEXT: v_max_i32_e32 v2, 0, v1 1278; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x80000000, v4 1279; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 1280; GFX6-NEXT: v_max_i32_e32 v3, v4, v3 1281; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 1282; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 1283; GFX6-NEXT: s_setpc_b64 s[30:31] 1284; 1285; GFX8-LABEL: v_saddsat_v2i32: 1286; GFX8: ; %bb.0: 1287; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1288; GFX8-NEXT: v_min_i32_e32 v5, 0, v0 1289; GFX8-NEXT: v_max_i32_e32 v4, 0, v0 1290; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x80000000, v5 1291; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0x7fffffff, v4 1292; GFX8-NEXT: v_max_i32_e32 v2, v5, v2 1293; GFX8-NEXT: v_min_i32_e32 v2, v2, v4 1294; GFX8-NEXT: v_min_i32_e32 v4, 0, v1 1295; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1296; GFX8-NEXT: v_max_i32_e32 v2, 0, v1 1297; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0x80000000, v4 1298; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 0x7fffffff, v2 1299; GFX8-NEXT: v_max_i32_e32 v3, v4, v3 1300; GFX8-NEXT: v_min_i32_e32 v2, v3, v2 1301; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 1302; GFX8-NEXT: s_setpc_b64 s[30:31] 1303; 1304; GFX9-LABEL: v_saddsat_v2i32: 1305; GFX9: ; %bb.0: 
1306; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1307; GFX9-NEXT: v_add_i32 v0, v0, v2 clamp 1308; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp 1309; GFX9-NEXT: s_setpc_b64 s[30:31] 1310; 1311; GFX10PLUS-LABEL: v_saddsat_v2i32: 1312; GFX10PLUS: ; %bb.0: 1313; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1314; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, v2 clamp 1315; GFX10PLUS-NEXT: v_add_nc_i32 v1, v1, v3 clamp 1316; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1317 %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) 1318 ret <2 x i32> %result 1319} 1320 1321define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) { 1322; GFX6-LABEL: s_saddsat_v2i32: 1323; GFX6: ; %bb.0: 1324; GFX6-NEXT: s_min_i32 s5, s0, 0 1325; GFX6-NEXT: s_max_i32 s4, s0, 0 1326; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 1327; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 1328; GFX6-NEXT: s_max_i32 s2, s5, s2 1329; GFX6-NEXT: s_min_i32 s2, s2, s4 1330; GFX6-NEXT: s_min_i32 s4, s1, 0 1331; GFX6-NEXT: s_add_i32 s0, s0, s2 1332; GFX6-NEXT: s_max_i32 s2, s1, 0 1333; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4 1334; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 1335; GFX6-NEXT: s_max_i32 s3, s4, s3 1336; GFX6-NEXT: s_min_i32 s2, s3, s2 1337; GFX6-NEXT: s_add_i32 s1, s1, s2 1338; GFX6-NEXT: ; return to shader part epilog 1339; 1340; GFX8-LABEL: s_saddsat_v2i32: 1341; GFX8: ; %bb.0: 1342; GFX8-NEXT: s_min_i32 s5, s0, 0 1343; GFX8-NEXT: s_max_i32 s4, s0, 0 1344; GFX8-NEXT: s_sub_i32 s5, 0x80000000, s5 1345; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4 1346; GFX8-NEXT: s_max_i32 s2, s5, s2 1347; GFX8-NEXT: s_min_i32 s2, s2, s4 1348; GFX8-NEXT: s_min_i32 s4, s1, 0 1349; GFX8-NEXT: s_add_i32 s0, s0, s2 1350; GFX8-NEXT: s_max_i32 s2, s1, 0 1351; GFX8-NEXT: s_sub_i32 s4, 0x80000000, s4 1352; GFX8-NEXT: s_sub_i32 s2, 0x7fffffff, s2 1353; GFX8-NEXT: s_max_i32 s3, s4, s3 1354; GFX8-NEXT: s_min_i32 s2, s3, s2 1355; GFX8-NEXT: s_add_i32 s1, s1, s2 1356; GFX8-NEXT: ; return to 
shader part epilog 1357; 1358; GFX9-LABEL: s_saddsat_v2i32: 1359; GFX9: ; %bb.0: 1360; GFX9-NEXT: v_mov_b32_e32 v0, s2 1361; GFX9-NEXT: v_mov_b32_e32 v1, s3 1362; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp 1363; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp 1364; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1365; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1366; GFX9-NEXT: ; return to shader part epilog 1367; 1368; GFX10PLUS-LABEL: s_saddsat_v2i32: 1369; GFX10PLUS: ; %bb.0: 1370; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s2 clamp 1371; GFX10PLUS-NEXT: v_add_nc_i32 v1, s1, s3 clamp 1372; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1373; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 1374; GFX10PLUS-NEXT: ; return to shader part epilog 1375 %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) 1376 ret <2 x i32> %result 1377} 1378 1379define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { 1380; GFX6-LABEL: v_saddsat_v3i32: 1381; GFX6: ; %bb.0: 1382; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1383; GFX6-NEXT: v_min_i32_e32 v8, 0, v0 1384; GFX6-NEXT: v_max_i32_e32 v6, 0, v0 1385; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0x80000000, v8 1386; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x7fffffff, v6 1387; GFX6-NEXT: v_max_i32_e32 v3, v8, v3 1388; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 1389; GFX6-NEXT: v_min_i32_e32 v6, 0, v1 1390; GFX6-NEXT: v_bfrev_b32_e32 v7, -2 1391; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 1392; GFX6-NEXT: v_max_i32_e32 v3, 0, v1 1393; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v6 1394; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v7, v3 1395; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 1396; GFX6-NEXT: v_min_i32_e32 v3, v4, v3 1397; GFX6-NEXT: v_min_i32_e32 v4, 0, v2 1398; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 1399; GFX6-NEXT: v_max_i32_e32 v3, 0, v2 1400; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x80000000, v4 1401; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x7fffffff, v3 1402; GFX6-NEXT: v_max_i32_e32 v4, v4, v5 1403; GFX6-NEXT: v_min_i32_e32 v3, v4, v3 1404; GFX6-NEXT: v_add_i32_e32 
v2, vcc, v2, v3 1405; GFX6-NEXT: s_setpc_b64 s[30:31] 1406; 1407; GFX8-LABEL: v_saddsat_v3i32: 1408; GFX8: ; %bb.0: 1409; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1410; GFX8-NEXT: v_min_i32_e32 v8, 0, v0 1411; GFX8-NEXT: v_max_i32_e32 v6, 0, v0 1412; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 0x80000000, v8 1413; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x7fffffff, v6 1414; GFX8-NEXT: v_max_i32_e32 v3, v8, v3 1415; GFX8-NEXT: v_min_i32_e32 v3, v3, v6 1416; GFX8-NEXT: v_min_i32_e32 v6, 0, v1 1417; GFX8-NEXT: v_bfrev_b32_e32 v7, -2 1418; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 1419; GFX8-NEXT: v_max_i32_e32 v3, 0, v1 1420; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v6 1421; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v7, v3 1422; GFX8-NEXT: v_max_i32_e32 v4, v6, v4 1423; GFX8-NEXT: v_min_i32_e32 v3, v4, v3 1424; GFX8-NEXT: v_min_i32_e32 v4, 0, v2 1425; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 1426; GFX8-NEXT: v_max_i32_e32 v3, 0, v2 1427; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0x80000000, v4 1428; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 0x7fffffff, v3 1429; GFX8-NEXT: v_max_i32_e32 v4, v4, v5 1430; GFX8-NEXT: v_min_i32_e32 v3, v4, v3 1431; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 1432; GFX8-NEXT: s_setpc_b64 s[30:31] 1433; 1434; GFX9-LABEL: v_saddsat_v3i32: 1435; GFX9: ; %bb.0: 1436; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1437; GFX9-NEXT: v_add_i32 v0, v0, v3 clamp 1438; GFX9-NEXT: v_add_i32 v1, v1, v4 clamp 1439; GFX9-NEXT: v_add_i32 v2, v2, v5 clamp 1440; GFX9-NEXT: s_setpc_b64 s[30:31] 1441; 1442; GFX10PLUS-LABEL: v_saddsat_v3i32: 1443; GFX10PLUS: ; %bb.0: 1444; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1445; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, v3 clamp 1446; GFX10PLUS-NEXT: v_add_nc_i32 v1, v1, v4 clamp 1447; GFX10PLUS-NEXT: v_add_nc_i32 v2, v2, v5 clamp 1448; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1449 %result = call <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) 1450 ret <3 x i32> %result 1451} 1452 1453define amdgpu_ps <3 x i32> 
@s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) { 1454; GFX6-LABEL: s_saddsat_v3i32: 1455; GFX6: ; %bb.0: 1456; GFX6-NEXT: s_min_i32 s7, s0, 0 1457; GFX6-NEXT: s_max_i32 s6, s0, 0 1458; GFX6-NEXT: s_sub_i32 s7, 0x80000000, s7 1459; GFX6-NEXT: s_sub_i32 s6, 0x7fffffff, s6 1460; GFX6-NEXT: s_max_i32 s3, s7, s3 1461; GFX6-NEXT: s_min_i32 s3, s3, s6 1462; GFX6-NEXT: s_min_i32 s6, s1, 0 1463; GFX6-NEXT: s_add_i32 s0, s0, s3 1464; GFX6-NEXT: s_max_i32 s3, s1, 0 1465; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 1466; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3 1467; GFX6-NEXT: s_max_i32 s4, s6, s4 1468; GFX6-NEXT: s_min_i32 s3, s4, s3 1469; GFX6-NEXT: s_min_i32 s4, s2, 0 1470; GFX6-NEXT: s_add_i32 s1, s1, s3 1471; GFX6-NEXT: s_max_i32 s3, s2, 0 1472; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4 1473; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3 1474; GFX6-NEXT: s_max_i32 s4, s4, s5 1475; GFX6-NEXT: s_min_i32 s3, s4, s3 1476; GFX6-NEXT: s_add_i32 s2, s2, s3 1477; GFX6-NEXT: ; return to shader part epilog 1478; 1479; GFX8-LABEL: s_saddsat_v3i32: 1480; GFX8: ; %bb.0: 1481; GFX8-NEXT: s_min_i32 s7, s0, 0 1482; GFX8-NEXT: s_max_i32 s6, s0, 0 1483; GFX8-NEXT: s_sub_i32 s7, 0x80000000, s7 1484; GFX8-NEXT: s_sub_i32 s6, 0x7fffffff, s6 1485; GFX8-NEXT: s_max_i32 s3, s7, s3 1486; GFX8-NEXT: s_min_i32 s3, s3, s6 1487; GFX8-NEXT: s_min_i32 s6, s1, 0 1488; GFX8-NEXT: s_add_i32 s0, s0, s3 1489; GFX8-NEXT: s_max_i32 s3, s1, 0 1490; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6 1491; GFX8-NEXT: s_sub_i32 s3, 0x7fffffff, s3 1492; GFX8-NEXT: s_max_i32 s4, s6, s4 1493; GFX8-NEXT: s_min_i32 s3, s4, s3 1494; GFX8-NEXT: s_min_i32 s4, s2, 0 1495; GFX8-NEXT: s_add_i32 s1, s1, s3 1496; GFX8-NEXT: s_max_i32 s3, s2, 0 1497; GFX8-NEXT: s_sub_i32 s4, 0x80000000, s4 1498; GFX8-NEXT: s_sub_i32 s3, 0x7fffffff, s3 1499; GFX8-NEXT: s_max_i32 s4, s4, s5 1500; GFX8-NEXT: s_min_i32 s3, s4, s3 1501; GFX8-NEXT: s_add_i32 s2, s2, s3 1502; GFX8-NEXT: ; return to shader part epilog 1503; 1504; GFX9-LABEL: s_saddsat_v3i32: 
1505; GFX9: ; %bb.0: 1506; GFX9-NEXT: v_mov_b32_e32 v0, s3 1507; GFX9-NEXT: v_mov_b32_e32 v1, s4 1508; GFX9-NEXT: v_mov_b32_e32 v2, s5 1509; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp 1510; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp 1511; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp 1512; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1513; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1514; GFX9-NEXT: v_readfirstlane_b32 s2, v2 1515; GFX9-NEXT: ; return to shader part epilog 1516; 1517; GFX10PLUS-LABEL: s_saddsat_v3i32: 1518; GFX10PLUS: ; %bb.0: 1519; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s3 clamp 1520; GFX10PLUS-NEXT: v_add_nc_i32 v1, s1, s4 clamp 1521; GFX10PLUS-NEXT: v_add_nc_i32 v2, s2, s5 clamp 1522; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1523; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 1524; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 1525; GFX10PLUS-NEXT: ; return to shader part epilog 1526 %result = call <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) 1527 ret <3 x i32> %result 1528} 1529 1530define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { 1531; GFX6-LABEL: v_saddsat_v4i32: 1532; GFX6: ; %bb.0: 1533; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1534; GFX6-NEXT: v_min_i32_e32 v10, 0, v0 1535; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 1536; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 1537; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v11, v10 1538; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0x7fffffff, v8 1539; GFX6-NEXT: v_max_i32_e32 v4, v10, v4 1540; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 1541; GFX6-NEXT: v_min_i32_e32 v8, 0, v1 1542; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 1543; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 1544; GFX6-NEXT: v_max_i32_e32 v4, 0, v1 1545; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0x80000000, v8 1546; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v9, v4 1547; GFX6-NEXT: v_max_i32_e32 v5, v8, v5 1548; GFX6-NEXT: v_min_i32_e32 v4, v5, v4 1549; GFX6-NEXT: v_min_i32_e32 v5, 0, v2 1550; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 1551; GFX6-NEXT: v_max_i32_e32 v4, 0, v2 1552; GFX6-NEXT: 
v_sub_i32_e32 v5, vcc, 0x80000000, v5 1553; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v9, v4 1554; GFX6-NEXT: v_max_i32_e32 v5, v5, v6 1555; GFX6-NEXT: v_min_i32_e32 v4, v5, v4 1556; GFX6-NEXT: v_min_i32_e32 v5, 0, v3 1557; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 1558; GFX6-NEXT: v_max_i32_e32 v4, 0, v3 1559; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5 1560; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4 1561; GFX6-NEXT: v_max_i32_e32 v5, v5, v7 1562; GFX6-NEXT: v_min_i32_e32 v4, v5, v4 1563; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 1564; GFX6-NEXT: s_setpc_b64 s[30:31] 1565; 1566; GFX8-LABEL: v_saddsat_v4i32: 1567; GFX8: ; %bb.0: 1568; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1569; GFX8-NEXT: v_min_i32_e32 v10, 0, v0 1570; GFX8-NEXT: v_bfrev_b32_e32 v11, 1 1571; GFX8-NEXT: v_max_i32_e32 v8, 0, v0 1572; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v11, v10 1573; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 0x7fffffff, v8 1574; GFX8-NEXT: v_max_i32_e32 v4, v10, v4 1575; GFX8-NEXT: v_min_i32_e32 v4, v4, v8 1576; GFX8-NEXT: v_min_i32_e32 v8, 0, v1 1577; GFX8-NEXT: v_bfrev_b32_e32 v9, -2 1578; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 1579; GFX8-NEXT: v_max_i32_e32 v4, 0, v1 1580; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 0x80000000, v8 1581; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v9, v4 1582; GFX8-NEXT: v_max_i32_e32 v5, v8, v5 1583; GFX8-NEXT: v_min_i32_e32 v4, v5, v4 1584; GFX8-NEXT: v_min_i32_e32 v5, 0, v2 1585; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 1586; GFX8-NEXT: v_max_i32_e32 v4, 0, v2 1587; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x80000000, v5 1588; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v9, v4 1589; GFX8-NEXT: v_max_i32_e32 v5, v5, v6 1590; GFX8-NEXT: v_min_i32_e32 v4, v5, v4 1591; GFX8-NEXT: v_min_i32_e32 v5, 0, v3 1592; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 1593; GFX8-NEXT: v_max_i32_e32 v4, 0, v3 1594; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x80000000, v5 1595; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0x7fffffff, v4 1596; GFX8-NEXT: v_max_i32_e32 v5, v5, v7 1597; GFX8-NEXT: v_min_i32_e32 v4, v5, 
v4 1598; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 1599; GFX8-NEXT: s_setpc_b64 s[30:31] 1600; 1601; GFX9-LABEL: v_saddsat_v4i32: 1602; GFX9: ; %bb.0: 1603; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1604; GFX9-NEXT: v_add_i32 v0, v0, v4 clamp 1605; GFX9-NEXT: v_add_i32 v1, v1, v5 clamp 1606; GFX9-NEXT: v_add_i32 v2, v2, v6 clamp 1607; GFX9-NEXT: v_add_i32 v3, v3, v7 clamp 1608; GFX9-NEXT: s_setpc_b64 s[30:31] 1609; 1610; GFX10PLUS-LABEL: v_saddsat_v4i32: 1611; GFX10PLUS: ; %bb.0: 1612; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1613; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, v4 clamp 1614; GFX10PLUS-NEXT: v_add_nc_i32 v1, v1, v5 clamp 1615; GFX10PLUS-NEXT: v_add_nc_i32 v2, v2, v6 clamp 1616; GFX10PLUS-NEXT: v_add_nc_i32 v3, v3, v7 clamp 1617; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1618 %result = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) 1619 ret <4 x i32> %result 1620} 1621 1622define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) { 1623; GFX6-LABEL: s_saddsat_v4i32: 1624; GFX6: ; %bb.0: 1625; GFX6-NEXT: s_min_i32 s9, s0, 0 1626; GFX6-NEXT: s_max_i32 s8, s0, 0 1627; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9 1628; GFX6-NEXT: s_sub_i32 s8, 0x7fffffff, s8 1629; GFX6-NEXT: s_max_i32 s4, s9, s4 1630; GFX6-NEXT: s_min_i32 s4, s4, s8 1631; GFX6-NEXT: s_min_i32 s8, s1, 0 1632; GFX6-NEXT: s_add_i32 s0, s0, s4 1633; GFX6-NEXT: s_max_i32 s4, s1, 0 1634; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 1635; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 1636; GFX6-NEXT: s_max_i32 s5, s8, s5 1637; GFX6-NEXT: s_min_i32 s4, s5, s4 1638; GFX6-NEXT: s_min_i32 s5, s2, 0 1639; GFX6-NEXT: s_add_i32 s1, s1, s4 1640; GFX6-NEXT: s_max_i32 s4, s2, 0 1641; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 1642; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 1643; GFX6-NEXT: s_max_i32 s5, s5, s6 1644; GFX6-NEXT: s_min_i32 s4, s5, s4 1645; GFX6-NEXT: s_min_i32 s5, s3, 0 1646; GFX6-NEXT: s_add_i32 s2, s2, s4 1647; GFX6-NEXT: s_max_i32 s4, s3, 0 1648; 
GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 1649; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 1650; GFX6-NEXT: s_max_i32 s5, s5, s7 1651; GFX6-NEXT: s_min_i32 s4, s5, s4 1652; GFX6-NEXT: s_add_i32 s3, s3, s4 1653; GFX6-NEXT: ; return to shader part epilog 1654; 1655; GFX8-LABEL: s_saddsat_v4i32: 1656; GFX8: ; %bb.0: 1657; GFX8-NEXT: s_min_i32 s9, s0, 0 1658; GFX8-NEXT: s_max_i32 s8, s0, 0 1659; GFX8-NEXT: s_sub_i32 s9, 0x80000000, s9 1660; GFX8-NEXT: s_sub_i32 s8, 0x7fffffff, s8 1661; GFX8-NEXT: s_max_i32 s4, s9, s4 1662; GFX8-NEXT: s_min_i32 s4, s4, s8 1663; GFX8-NEXT: s_min_i32 s8, s1, 0 1664; GFX8-NEXT: s_add_i32 s0, s0, s4 1665; GFX8-NEXT: s_max_i32 s4, s1, 0 1666; GFX8-NEXT: s_sub_i32 s8, 0x80000000, s8 1667; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4 1668; GFX8-NEXT: s_max_i32 s5, s8, s5 1669; GFX8-NEXT: s_min_i32 s4, s5, s4 1670; GFX8-NEXT: s_min_i32 s5, s2, 0 1671; GFX8-NEXT: s_add_i32 s1, s1, s4 1672; GFX8-NEXT: s_max_i32 s4, s2, 0 1673; GFX8-NEXT: s_sub_i32 s5, 0x80000000, s5 1674; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4 1675; GFX8-NEXT: s_max_i32 s5, s5, s6 1676; GFX8-NEXT: s_min_i32 s4, s5, s4 1677; GFX8-NEXT: s_min_i32 s5, s3, 0 1678; GFX8-NEXT: s_add_i32 s2, s2, s4 1679; GFX8-NEXT: s_max_i32 s4, s3, 0 1680; GFX8-NEXT: s_sub_i32 s5, 0x80000000, s5 1681; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4 1682; GFX8-NEXT: s_max_i32 s5, s5, s7 1683; GFX8-NEXT: s_min_i32 s4, s5, s4 1684; GFX8-NEXT: s_add_i32 s3, s3, s4 1685; GFX8-NEXT: ; return to shader part epilog 1686; 1687; GFX9-LABEL: s_saddsat_v4i32: 1688; GFX9: ; %bb.0: 1689; GFX9-NEXT: v_mov_b32_e32 v0, s4 1690; GFX9-NEXT: v_mov_b32_e32 v1, s5 1691; GFX9-NEXT: v_mov_b32_e32 v2, s6 1692; GFX9-NEXT: v_mov_b32_e32 v3, s7 1693; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp 1694; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp 1695; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp 1696; GFX9-NEXT: v_add_i32 v3, s3, v3 clamp 1697; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1698; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1699; GFX9-NEXT: v_readfirstlane_b32 s2, v2 
1700; GFX9-NEXT: v_readfirstlane_b32 s3, v3 1701; GFX9-NEXT: ; return to shader part epilog 1702; 1703; GFX10PLUS-LABEL: s_saddsat_v4i32: 1704; GFX10PLUS: ; %bb.0: 1705; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s4 clamp 1706; GFX10PLUS-NEXT: v_add_nc_i32 v1, s1, s5 clamp 1707; GFX10PLUS-NEXT: v_add_nc_i32 v2, s2, s6 clamp 1708; GFX10PLUS-NEXT: v_add_nc_i32 v3, s3, s7 clamp 1709; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1710; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 1711; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 1712; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 1713; GFX10PLUS-NEXT: ; return to shader part epilog 1714 %result = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) 1715 ret <4 x i32> %result 1716} 1717 1718define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { 1719; GFX6-LABEL: v_saddsat_v5i32: 1720; GFX6: ; %bb.0: 1721; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1722; GFX6-NEXT: v_min_i32_e32 v12, 0, v0 1723; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 1724; GFX6-NEXT: v_max_i32_e32 v10, 0, v0 1725; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v13, v12 1726; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 0x7fffffff, v10 1727; GFX6-NEXT: v_max_i32_e32 v5, v12, v5 1728; GFX6-NEXT: v_min_i32_e32 v5, v5, v10 1729; GFX6-NEXT: v_min_i32_e32 v10, 0, v1 1730; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 1731; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v5 1732; GFX6-NEXT: v_max_i32_e32 v5, 0, v1 1733; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v13, v10 1734; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 1735; GFX6-NEXT: v_max_i32_e32 v6, v10, v6 1736; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 1737; GFX6-NEXT: v_min_i32_e32 v6, 0, v2 1738; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 1739; GFX6-NEXT: v_max_i32_e32 v5, 0, v2 1740; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v6 1741; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 1742; GFX6-NEXT: v_max_i32_e32 v6, v6, v7 1743; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 1744; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 1745; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, 
v5 1746; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 1747; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v6 1748; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 1749; GFX6-NEXT: v_max_i32_e32 v6, v6, v8 1750; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 1751; GFX6-NEXT: v_min_i32_e32 v6, 0, v4 1752; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 1753; GFX6-NEXT: v_max_i32_e32 v5, 0, v4 1754; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v6 1755; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x7fffffff, v5 1756; GFX6-NEXT: v_max_i32_e32 v6, v6, v9 1757; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 1758; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 1759; GFX6-NEXT: s_setpc_b64 s[30:31] 1760; 1761; GFX8-LABEL: v_saddsat_v5i32: 1762; GFX8: ; %bb.0: 1763; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1764; GFX8-NEXT: v_min_i32_e32 v12, 0, v0 1765; GFX8-NEXT: v_bfrev_b32_e32 v13, 1 1766; GFX8-NEXT: v_max_i32_e32 v10, 0, v0 1767; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v13, v12 1768; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 0x7fffffff, v10 1769; GFX8-NEXT: v_max_i32_e32 v5, v12, v5 1770; GFX8-NEXT: v_min_i32_e32 v5, v5, v10 1771; GFX8-NEXT: v_min_i32_e32 v10, 0, v1 1772; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 1773; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 1774; GFX8-NEXT: v_max_i32_e32 v5, 0, v1 1775; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v13, v10 1776; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v11, v5 1777; GFX8-NEXT: v_max_i32_e32 v6, v10, v6 1778; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 1779; GFX8-NEXT: v_min_i32_e32 v6, 0, v2 1780; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5 1781; GFX8-NEXT: v_max_i32_e32 v5, 0, v2 1782; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v6 1783; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v11, v5 1784; GFX8-NEXT: v_max_i32_e32 v6, v6, v7 1785; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 1786; GFX8-NEXT: v_min_i32_e32 v6, 0, v3 1787; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 1788; GFX8-NEXT: v_max_i32_e32 v5, 0, v3 1789; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v6 1790; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v11, v5 1791; GFX8-NEXT: 
v_max_i32_e32 v6, v6, v8 1792; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 1793; GFX8-NEXT: v_min_i32_e32 v6, 0, v4 1794; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 1795; GFX8-NEXT: v_max_i32_e32 v5, 0, v4 1796; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v6 1797; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x7fffffff, v5 1798; GFX8-NEXT: v_max_i32_e32 v6, v6, v9 1799; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 1800; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 1801; GFX8-NEXT: s_setpc_b64 s[30:31] 1802; 1803; GFX9-LABEL: v_saddsat_v5i32: 1804; GFX9: ; %bb.0: 1805; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1806; GFX9-NEXT: v_add_i32 v0, v0, v5 clamp 1807; GFX9-NEXT: v_add_i32 v1, v1, v6 clamp 1808; GFX9-NEXT: v_add_i32 v2, v2, v7 clamp 1809; GFX9-NEXT: v_add_i32 v3, v3, v8 clamp 1810; GFX9-NEXT: v_add_i32 v4, v4, v9 clamp 1811; GFX9-NEXT: s_setpc_b64 s[30:31] 1812; 1813; GFX10PLUS-LABEL: v_saddsat_v5i32: 1814; GFX10PLUS: ; %bb.0: 1815; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1816; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, v5 clamp 1817; GFX10PLUS-NEXT: v_add_nc_i32 v1, v1, v6 clamp 1818; GFX10PLUS-NEXT: v_add_nc_i32 v2, v2, v7 clamp 1819; GFX10PLUS-NEXT: v_add_nc_i32 v3, v3, v8 clamp 1820; GFX10PLUS-NEXT: v_add_nc_i32 v4, v4, v9 clamp 1821; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1822 %result = call <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) 1823 ret <5 x i32> %result 1824} 1825 1826define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) { 1827; GFX6-LABEL: s_saddsat_v5i32: 1828; GFX6: ; %bb.0: 1829; GFX6-NEXT: s_min_i32 s11, s0, 0 1830; GFX6-NEXT: s_max_i32 s10, s0, 0 1831; GFX6-NEXT: s_sub_i32 s11, 0x80000000, s11 1832; GFX6-NEXT: s_sub_i32 s10, 0x7fffffff, s10 1833; GFX6-NEXT: s_max_i32 s5, s11, s5 1834; GFX6-NEXT: s_min_i32 s5, s5, s10 1835; GFX6-NEXT: s_min_i32 s10, s1, 0 1836; GFX6-NEXT: s_add_i32 s0, s0, s5 1837; GFX6-NEXT: s_max_i32 s5, s1, 0 1838; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 1839; GFX6-NEXT: s_sub_i32 
s5, 0x7fffffff, s5 1840; GFX6-NEXT: s_max_i32 s6, s10, s6 1841; GFX6-NEXT: s_min_i32 s5, s6, s5 1842; GFX6-NEXT: s_min_i32 s6, s2, 0 1843; GFX6-NEXT: s_add_i32 s1, s1, s5 1844; GFX6-NEXT: s_max_i32 s5, s2, 0 1845; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 1846; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 1847; GFX6-NEXT: s_max_i32 s6, s6, s7 1848; GFX6-NEXT: s_min_i32 s5, s6, s5 1849; GFX6-NEXT: s_min_i32 s6, s3, 0 1850; GFX6-NEXT: s_add_i32 s2, s2, s5 1851; GFX6-NEXT: s_max_i32 s5, s3, 0 1852; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 1853; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 1854; GFX6-NEXT: s_max_i32 s6, s6, s8 1855; GFX6-NEXT: s_min_i32 s5, s6, s5 1856; GFX6-NEXT: s_min_i32 s6, s4, 0 1857; GFX6-NEXT: s_add_i32 s3, s3, s5 1858; GFX6-NEXT: s_max_i32 s5, s4, 0 1859; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 1860; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 1861; GFX6-NEXT: s_max_i32 s6, s6, s9 1862; GFX6-NEXT: s_min_i32 s5, s6, s5 1863; GFX6-NEXT: s_add_i32 s4, s4, s5 1864; GFX6-NEXT: ; return to shader part epilog 1865; 1866; GFX8-LABEL: s_saddsat_v5i32: 1867; GFX8: ; %bb.0: 1868; GFX8-NEXT: s_min_i32 s11, s0, 0 1869; GFX8-NEXT: s_max_i32 s10, s0, 0 1870; GFX8-NEXT: s_sub_i32 s11, 0x80000000, s11 1871; GFX8-NEXT: s_sub_i32 s10, 0x7fffffff, s10 1872; GFX8-NEXT: s_max_i32 s5, s11, s5 1873; GFX8-NEXT: s_min_i32 s5, s5, s10 1874; GFX8-NEXT: s_min_i32 s10, s1, 0 1875; GFX8-NEXT: s_add_i32 s0, s0, s5 1876; GFX8-NEXT: s_max_i32 s5, s1, 0 1877; GFX8-NEXT: s_sub_i32 s10, 0x80000000, s10 1878; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5 1879; GFX8-NEXT: s_max_i32 s6, s10, s6 1880; GFX8-NEXT: s_min_i32 s5, s6, s5 1881; GFX8-NEXT: s_min_i32 s6, s2, 0 1882; GFX8-NEXT: s_add_i32 s1, s1, s5 1883; GFX8-NEXT: s_max_i32 s5, s2, 0 1884; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6 1885; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5 1886; GFX8-NEXT: s_max_i32 s6, s6, s7 1887; GFX8-NEXT: s_min_i32 s5, s6, s5 1888; GFX8-NEXT: s_min_i32 s6, s3, 0 1889; GFX8-NEXT: s_add_i32 s2, s2, s5 1890; GFX8-NEXT: s_max_i32 
s5, s3, 0 1891; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6 1892; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5 1893; GFX8-NEXT: s_max_i32 s6, s6, s8 1894; GFX8-NEXT: s_min_i32 s5, s6, s5 1895; GFX8-NEXT: s_min_i32 s6, s4, 0 1896; GFX8-NEXT: s_add_i32 s3, s3, s5 1897; GFX8-NEXT: s_max_i32 s5, s4, 0 1898; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6 1899; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5 1900; GFX8-NEXT: s_max_i32 s6, s6, s9 1901; GFX8-NEXT: s_min_i32 s5, s6, s5 1902; GFX8-NEXT: s_add_i32 s4, s4, s5 1903; GFX8-NEXT: ; return to shader part epilog 1904; 1905; GFX9-LABEL: s_saddsat_v5i32: 1906; GFX9: ; %bb.0: 1907; GFX9-NEXT: v_mov_b32_e32 v0, s5 1908; GFX9-NEXT: v_mov_b32_e32 v1, s6 1909; GFX9-NEXT: v_mov_b32_e32 v2, s7 1910; GFX9-NEXT: v_mov_b32_e32 v3, s8 1911; GFX9-NEXT: v_mov_b32_e32 v4, s9 1912; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp 1913; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp 1914; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp 1915; GFX9-NEXT: v_add_i32 v3, s3, v3 clamp 1916; GFX9-NEXT: v_add_i32 v4, s4, v4 clamp 1917; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1918; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1919; GFX9-NEXT: v_readfirstlane_b32 s2, v2 1920; GFX9-NEXT: v_readfirstlane_b32 s3, v3 1921; GFX9-NEXT: v_readfirstlane_b32 s4, v4 1922; GFX9-NEXT: ; return to shader part epilog 1923; 1924; GFX10PLUS-LABEL: s_saddsat_v5i32: 1925; GFX10PLUS: ; %bb.0: 1926; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s5 clamp 1927; GFX10PLUS-NEXT: v_add_nc_i32 v1, s1, s6 clamp 1928; GFX10PLUS-NEXT: v_add_nc_i32 v2, s2, s7 clamp 1929; GFX10PLUS-NEXT: v_add_nc_i32 v3, s3, s8 clamp 1930; GFX10PLUS-NEXT: v_add_nc_i32 v4, s4, s9 clamp 1931; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1932; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 1933; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 1934; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 1935; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4 1936; GFX10PLUS-NEXT: ; return to shader part epilog 1937 %result = call <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) 
1938 ret <5 x i32> %result 1939} 1940 1941define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { 1942; GFX6-LABEL: v_saddsat_v16i32: 1943; GFX6: ; %bb.0: 1944; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1945; GFX6-NEXT: v_min_i32_e32 v32, 0, v0 1946; GFX6-NEXT: v_bfrev_b32_e32 v31, 1 1947; GFX6-NEXT: v_sub_i32_e32 v32, vcc, v31, v32 1948; GFX6-NEXT: v_max_i32_e32 v32, v32, v16 1949; GFX6-NEXT: v_max_i32_e32 v33, 0, v0 1950; GFX6-NEXT: v_bfrev_b32_e32 v16, -2 1951; GFX6-NEXT: v_sub_i32_e32 v33, vcc, v16, v33 1952; GFX6-NEXT: v_min_i32_e32 v32, v32, v33 1953; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v32 1954; GFX6-NEXT: v_min_i32_e32 v32, 0, v1 1955; GFX6-NEXT: v_sub_i32_e32 v32, vcc, v31, v32 1956; GFX6-NEXT: v_max_i32_e32 v17, v32, v17 1957; GFX6-NEXT: v_max_i32_e32 v32, 0, v1 1958; GFX6-NEXT: v_sub_i32_e32 v32, vcc, v16, v32 1959; GFX6-NEXT: v_min_i32_e32 v17, v17, v32 1960; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v17 1961; GFX6-NEXT: v_min_i32_e32 v17, 0, v2 1962; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 1963; GFX6-NEXT: v_max_i32_e32 v17, v17, v18 1964; GFX6-NEXT: v_max_i32_e32 v18, 0, v2 1965; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 1966; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 1967; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v17 1968; GFX6-NEXT: v_min_i32_e32 v17, 0, v3 1969; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 1970; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 1971; GFX6-NEXT: buffer_load_dword v19, off, s[0:3], s32 1972; GFX6-NEXT: v_max_i32_e32 v18, 0, v3 1973; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 1974; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 1975; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v17 1976; GFX6-NEXT: v_min_i32_e32 v17, 0, v4 1977; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 1978; GFX6-NEXT: v_max_i32_e32 v18, 0, v4 1979; GFX6-NEXT: v_max_i32_e32 v17, v17, v20 1980; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 1981; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 1982; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v17 1983; GFX6-NEXT: 
v_min_i32_e32 v17, 0, v5 1984; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 1985; GFX6-NEXT: v_max_i32_e32 v18, 0, v5 1986; GFX6-NEXT: v_max_i32_e32 v17, v17, v21 1987; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 1988; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 1989; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v17 1990; GFX6-NEXT: v_min_i32_e32 v17, 0, v6 1991; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 1992; GFX6-NEXT: v_max_i32_e32 v18, 0, v6 1993; GFX6-NEXT: v_max_i32_e32 v17, v17, v22 1994; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 1995; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 1996; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v17 1997; GFX6-NEXT: v_min_i32_e32 v17, 0, v7 1998; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 1999; GFX6-NEXT: v_max_i32_e32 v18, 0, v7 2000; GFX6-NEXT: v_max_i32_e32 v17, v17, v23 2001; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 2002; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 2003; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v17 2004; GFX6-NEXT: v_min_i32_e32 v17, 0, v8 2005; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 2006; GFX6-NEXT: v_max_i32_e32 v18, 0, v8 2007; GFX6-NEXT: v_max_i32_e32 v17, v17, v24 2008; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 2009; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 2010; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v17 2011; GFX6-NEXT: v_min_i32_e32 v17, 0, v9 2012; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 2013; GFX6-NEXT: v_max_i32_e32 v18, 0, v9 2014; GFX6-NEXT: v_max_i32_e32 v17, v17, v25 2015; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 2016; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 2017; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 2018; GFX6-NEXT: v_min_i32_e32 v17, 0, v10 2019; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 2020; GFX6-NEXT: v_max_i32_e32 v18, 0, v10 2021; GFX6-NEXT: v_max_i32_e32 v17, v17, v26 2022; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 2023; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 2024; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v17 2025; GFX6-NEXT: v_min_i32_e32 v17, 0, v11 2026; GFX6-NEXT: v_sub_i32_e32 v17, 
vcc, v31, v17 2027; GFX6-NEXT: v_max_i32_e32 v18, 0, v11 2028; GFX6-NEXT: v_max_i32_e32 v17, v17, v27 2029; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 2030; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 2031; GFX6-NEXT: v_add_i32_e32 v11, vcc, v11, v17 2032; GFX6-NEXT: v_min_i32_e32 v17, 0, v12 2033; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 2034; GFX6-NEXT: v_max_i32_e32 v18, 0, v12 2035; GFX6-NEXT: v_max_i32_e32 v17, v17, v28 2036; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 2037; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 2038; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v17 2039; GFX6-NEXT: v_min_i32_e32 v17, 0, v13 2040; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 2041; GFX6-NEXT: v_max_i32_e32 v18, 0, v13 2042; GFX6-NEXT: v_max_i32_e32 v17, v17, v29 2043; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 2044; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 2045; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v17 2046; GFX6-NEXT: v_min_i32_e32 v17, 0, v14 2047; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 2048; GFX6-NEXT: v_max_i32_e32 v18, 0, v14 2049; GFX6-NEXT: v_max_i32_e32 v17, v17, v30 2050; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 2051; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 2052; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v17 2053; GFX6-NEXT: v_max_i32_e32 v17, 0, v15 2054; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v17 2055; GFX6-NEXT: v_min_i32_e32 v17, 0, v15 2056; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 2057; GFX6-NEXT: s_waitcnt vmcnt(0) 2058; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 2059; GFX6-NEXT: v_min_i32_e32 v16, v17, v16 2060; GFX6-NEXT: v_add_i32_e32 v15, vcc, v15, v16 2061; GFX6-NEXT: s_setpc_b64 s[30:31] 2062; 2063; GFX8-LABEL: v_saddsat_v16i32: 2064; GFX8: ; %bb.0: 2065; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2066; GFX8-NEXT: v_min_i32_e32 v32, 0, v0 2067; GFX8-NEXT: v_bfrev_b32_e32 v31, 1 2068; GFX8-NEXT: v_sub_u32_e32 v32, vcc, v31, v32 2069; GFX8-NEXT: v_max_i32_e32 v32, v32, v16 2070; GFX8-NEXT: v_max_i32_e32 v33, 0, v0 2071; GFX8-NEXT: v_bfrev_b32_e32 
v16, -2 2072; GFX8-NEXT: v_sub_u32_e32 v33, vcc, v16, v33 2073; GFX8-NEXT: v_min_i32_e32 v32, v32, v33 2074; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v32 2075; GFX8-NEXT: v_min_i32_e32 v32, 0, v1 2076; GFX8-NEXT: v_sub_u32_e32 v32, vcc, v31, v32 2077; GFX8-NEXT: v_max_i32_e32 v17, v32, v17 2078; GFX8-NEXT: v_max_i32_e32 v32, 0, v1 2079; GFX8-NEXT: v_sub_u32_e32 v32, vcc, v16, v32 2080; GFX8-NEXT: v_min_i32_e32 v17, v17, v32 2081; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v17 2082; GFX8-NEXT: v_min_i32_e32 v17, 0, v2 2083; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 2084; GFX8-NEXT: v_max_i32_e32 v17, v17, v18 2085; GFX8-NEXT: v_max_i32_e32 v18, 0, v2 2086; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 2087; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2088; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v17 2089; GFX8-NEXT: v_min_i32_e32 v17, 0, v3 2090; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 2091; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 2092; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 2093; GFX8-NEXT: v_max_i32_e32 v18, 0, v3 2094; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 2095; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2096; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v17 2097; GFX8-NEXT: v_min_i32_e32 v17, 0, v4 2098; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 2099; GFX8-NEXT: v_max_i32_e32 v18, 0, v4 2100; GFX8-NEXT: v_max_i32_e32 v17, v17, v20 2101; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 2102; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2103; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v17 2104; GFX8-NEXT: v_min_i32_e32 v17, 0, v5 2105; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 2106; GFX8-NEXT: v_max_i32_e32 v18, 0, v5 2107; GFX8-NEXT: v_max_i32_e32 v17, v17, v21 2108; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 2109; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2110; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v17 2111; GFX8-NEXT: v_min_i32_e32 v17, 0, v6 2112; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 2113; GFX8-NEXT: v_max_i32_e32 v18, 0, v6 2114; GFX8-NEXT: v_max_i32_e32 v17, v17, v22 
2115; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 2116; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2117; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17 2118; GFX8-NEXT: v_min_i32_e32 v17, 0, v7 2119; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 2120; GFX8-NEXT: v_max_i32_e32 v18, 0, v7 2121; GFX8-NEXT: v_max_i32_e32 v17, v17, v23 2122; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 2123; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2124; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v17 2125; GFX8-NEXT: v_min_i32_e32 v17, 0, v8 2126; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 2127; GFX8-NEXT: v_max_i32_e32 v18, 0, v8 2128; GFX8-NEXT: v_max_i32_e32 v17, v17, v24 2129; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 2130; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2131; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v17 2132; GFX8-NEXT: v_min_i32_e32 v17, 0, v9 2133; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 2134; GFX8-NEXT: v_max_i32_e32 v18, 0, v9 2135; GFX8-NEXT: v_max_i32_e32 v17, v17, v25 2136; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 2137; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2138; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v17 2139; GFX8-NEXT: v_min_i32_e32 v17, 0, v10 2140; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 2141; GFX8-NEXT: v_max_i32_e32 v18, 0, v10 2142; GFX8-NEXT: v_max_i32_e32 v17, v17, v26 2143; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 2144; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2145; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v17 2146; GFX8-NEXT: v_min_i32_e32 v17, 0, v11 2147; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 2148; GFX8-NEXT: v_max_i32_e32 v18, 0, v11 2149; GFX8-NEXT: v_max_i32_e32 v17, v17, v27 2150; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 2151; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2152; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v17 2153; GFX8-NEXT: v_min_i32_e32 v17, 0, v12 2154; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 2155; GFX8-NEXT: v_max_i32_e32 v18, 0, v12 2156; GFX8-NEXT: v_max_i32_e32 v17, v17, v28 2157; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 2158; 
GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2159; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v17 2160; GFX8-NEXT: v_min_i32_e32 v17, 0, v13 2161; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 2162; GFX8-NEXT: v_max_i32_e32 v18, 0, v13 2163; GFX8-NEXT: v_max_i32_e32 v17, v17, v29 2164; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 2165; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2166; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v17 2167; GFX8-NEXT: v_min_i32_e32 v17, 0, v14 2168; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 2169; GFX8-NEXT: v_max_i32_e32 v18, 0, v14 2170; GFX8-NEXT: v_max_i32_e32 v17, v17, v30 2171; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 2172; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2173; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v17 2174; GFX8-NEXT: v_max_i32_e32 v17, 0, v15 2175; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v17 2176; GFX8-NEXT: v_min_i32_e32 v17, 0, v15 2177; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 2178; GFX8-NEXT: s_waitcnt vmcnt(0) 2179; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 2180; GFX8-NEXT: v_min_i32_e32 v16, v17, v16 2181; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v16 2182; GFX8-NEXT: s_setpc_b64 s[30:31] 2183; 2184; GFX9-LABEL: v_saddsat_v16i32: 2185; GFX9: ; %bb.0: 2186; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2187; GFX9-NEXT: v_add_i32 v0, v0, v16 clamp 2188; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 2189; GFX9-NEXT: v_add_i32 v1, v1, v17 clamp 2190; GFX9-NEXT: v_add_i32 v2, v2, v18 clamp 2191; GFX9-NEXT: v_add_i32 v3, v3, v19 clamp 2192; GFX9-NEXT: v_add_i32 v4, v4, v20 clamp 2193; GFX9-NEXT: v_add_i32 v5, v5, v21 clamp 2194; GFX9-NEXT: v_add_i32 v6, v6, v22 clamp 2195; GFX9-NEXT: v_add_i32 v7, v7, v23 clamp 2196; GFX9-NEXT: v_add_i32 v8, v8, v24 clamp 2197; GFX9-NEXT: v_add_i32 v9, v9, v25 clamp 2198; GFX9-NEXT: v_add_i32 v10, v10, v26 clamp 2199; GFX9-NEXT: v_add_i32 v11, v11, v27 clamp 2200; GFX9-NEXT: v_add_i32 v12, v12, v28 clamp 2201; GFX9-NEXT: v_add_i32 v13, v13, v29 clamp 2202; GFX9-NEXT: v_add_i32 v14, v14, v30 
clamp 2203; GFX9-NEXT: s_waitcnt vmcnt(0) 2204; GFX9-NEXT: v_add_i32 v15, v15, v16 clamp 2205; GFX9-NEXT: s_setpc_b64 s[30:31] 2206; 2207; GFX10-LABEL: v_saddsat_v16i32: 2208; GFX10: ; %bb.0: 2209; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2210; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 2211; GFX10-NEXT: v_add_nc_i32 v0, v0, v16 clamp 2212; GFX10-NEXT: v_add_nc_i32 v1, v1, v17 clamp 2213; GFX10-NEXT: v_add_nc_i32 v2, v2, v18 clamp 2214; GFX10-NEXT: v_add_nc_i32 v3, v3, v19 clamp 2215; GFX10-NEXT: v_add_nc_i32 v4, v4, v20 clamp 2216; GFX10-NEXT: v_add_nc_i32 v5, v5, v21 clamp 2217; GFX10-NEXT: v_add_nc_i32 v6, v6, v22 clamp 2218; GFX10-NEXT: v_add_nc_i32 v7, v7, v23 clamp 2219; GFX10-NEXT: v_add_nc_i32 v8, v8, v24 clamp 2220; GFX10-NEXT: v_add_nc_i32 v9, v9, v25 clamp 2221; GFX10-NEXT: v_add_nc_i32 v10, v10, v26 clamp 2222; GFX10-NEXT: v_add_nc_i32 v11, v11, v27 clamp 2223; GFX10-NEXT: v_add_nc_i32 v12, v12, v28 clamp 2224; GFX10-NEXT: v_add_nc_i32 v13, v13, v29 clamp 2225; GFX10-NEXT: v_add_nc_i32 v14, v14, v30 clamp 2226; GFX10-NEXT: s_waitcnt vmcnt(0) 2227; GFX10-NEXT: v_add_nc_i32 v15, v15, v31 clamp 2228; GFX10-NEXT: s_setpc_b64 s[30:31] 2229; 2230; GFX11-LABEL: v_saddsat_v16i32: 2231; GFX11: ; %bb.0: 2232; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2233; GFX11-NEXT: scratch_load_b32 v31, off, s32 2234; GFX11-NEXT: v_add_nc_i32 v0, v0, v16 clamp 2235; GFX11-NEXT: v_add_nc_i32 v1, v1, v17 clamp 2236; GFX11-NEXT: v_add_nc_i32 v2, v2, v18 clamp 2237; GFX11-NEXT: v_add_nc_i32 v3, v3, v19 clamp 2238; GFX11-NEXT: v_add_nc_i32 v4, v4, v20 clamp 2239; GFX11-NEXT: v_add_nc_i32 v5, v5, v21 clamp 2240; GFX11-NEXT: v_add_nc_i32 v6, v6, v22 clamp 2241; GFX11-NEXT: v_add_nc_i32 v7, v7, v23 clamp 2242; GFX11-NEXT: v_add_nc_i32 v8, v8, v24 clamp 2243; GFX11-NEXT: v_add_nc_i32 v9, v9, v25 clamp 2244; GFX11-NEXT: v_add_nc_i32 v10, v10, v26 clamp 2245; GFX11-NEXT: v_add_nc_i32 v11, v11, v27 clamp 2246; GFX11-NEXT: v_add_nc_i32 v12, v12, v28 clamp 
2247; GFX11-NEXT: v_add_nc_i32 v13, v13, v29 clamp 2248; GFX11-NEXT: v_add_nc_i32 v14, v14, v30 clamp 2249; GFX11-NEXT: s_waitcnt vmcnt(0) 2250; GFX11-NEXT: v_add_nc_i32 v15, v15, v31 clamp 2251; GFX11-NEXT: s_setpc_b64 s[30:31] 2252 %result = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) 2253 ret <16 x i32> %result 2254} 2255 2256define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) { 2257; GFX6-LABEL: s_saddsat_v16i32: 2258; GFX6: ; %bb.0: 2259; GFX6-NEXT: s_min_i32 s33, s0, 0 2260; GFX6-NEXT: s_max_i32 s32, s0, 0 2261; GFX6-NEXT: s_sub_i32 s33, 0x80000000, s33 2262; GFX6-NEXT: s_sub_i32 s32, 0x7fffffff, s32 2263; GFX6-NEXT: s_max_i32 s16, s33, s16 2264; GFX6-NEXT: s_min_i32 s16, s16, s32 2265; GFX6-NEXT: s_min_i32 s32, s1, 0 2266; GFX6-NEXT: s_add_i32 s0, s0, s16 2267; GFX6-NEXT: s_max_i32 s16, s1, 0 2268; GFX6-NEXT: s_sub_i32 s32, 0x80000000, s32 2269; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2270; GFX6-NEXT: s_max_i32 s17, s32, s17 2271; GFX6-NEXT: s_min_i32 s16, s17, s16 2272; GFX6-NEXT: s_min_i32 s17, s2, 0 2273; GFX6-NEXT: s_add_i32 s1, s1, s16 2274; GFX6-NEXT: s_max_i32 s16, s2, 0 2275; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 2276; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2277; GFX6-NEXT: s_max_i32 s17, s17, s18 2278; GFX6-NEXT: s_min_i32 s16, s17, s16 2279; GFX6-NEXT: s_min_i32 s17, s3, 0 2280; GFX6-NEXT: s_add_i32 s2, s2, s16 2281; GFX6-NEXT: s_max_i32 s16, s3, 0 2282; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 2283; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2284; GFX6-NEXT: s_max_i32 s17, s17, s19 2285; GFX6-NEXT: s_min_i32 s16, s17, s16 2286; GFX6-NEXT: s_min_i32 s17, s4, 0 2287; GFX6-NEXT: s_add_i32 s3, s3, s16 2288; GFX6-NEXT: s_max_i32 s16, s4, 0 2289; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 2290; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2291; GFX6-NEXT: s_max_i32 s17, s17, s20 2292; GFX6-NEXT: s_min_i32 s16, s17, s16 2293; GFX6-NEXT: s_min_i32 s17, s5, 0 2294; GFX6-NEXT: 
s_add_i32 s4, s4, s16 2295; GFX6-NEXT: s_max_i32 s16, s5, 0 2296; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 2297; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2298; GFX6-NEXT: s_max_i32 s17, s17, s21 2299; GFX6-NEXT: s_min_i32 s16, s17, s16 2300; GFX6-NEXT: s_min_i32 s17, s6, 0 2301; GFX6-NEXT: s_add_i32 s5, s5, s16 2302; GFX6-NEXT: s_max_i32 s16, s6, 0 2303; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 2304; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2305; GFX6-NEXT: s_max_i32 s17, s17, s22 2306; GFX6-NEXT: s_min_i32 s16, s17, s16 2307; GFX6-NEXT: s_min_i32 s17, s7, 0 2308; GFX6-NEXT: s_add_i32 s6, s6, s16 2309; GFX6-NEXT: s_max_i32 s16, s7, 0 2310; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 2311; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2312; GFX6-NEXT: s_max_i32 s17, s17, s23 2313; GFX6-NEXT: s_min_i32 s16, s17, s16 2314; GFX6-NEXT: s_min_i32 s17, s8, 0 2315; GFX6-NEXT: s_add_i32 s7, s7, s16 2316; GFX6-NEXT: s_max_i32 s16, s8, 0 2317; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 2318; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2319; GFX6-NEXT: s_max_i32 s17, s17, s24 2320; GFX6-NEXT: s_min_i32 s16, s17, s16 2321; GFX6-NEXT: s_min_i32 s17, s9, 0 2322; GFX6-NEXT: s_add_i32 s8, s8, s16 2323; GFX6-NEXT: s_max_i32 s16, s9, 0 2324; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 2325; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2326; GFX6-NEXT: s_max_i32 s17, s17, s25 2327; GFX6-NEXT: s_min_i32 s16, s17, s16 2328; GFX6-NEXT: s_min_i32 s17, s10, 0 2329; GFX6-NEXT: s_add_i32 s9, s9, s16 2330; GFX6-NEXT: s_max_i32 s16, s10, 0 2331; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 2332; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2333; GFX6-NEXT: s_max_i32 s17, s17, s26 2334; GFX6-NEXT: s_min_i32 s16, s17, s16 2335; GFX6-NEXT: s_min_i32 s17, s11, 0 2336; GFX6-NEXT: s_add_i32 s10, s10, s16 2337; GFX6-NEXT: s_max_i32 s16, s11, 0 2338; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 2339; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2340; GFX6-NEXT: s_max_i32 s17, s17, s27 2341; GFX6-NEXT: s_min_i32 s16, s17, 
s16 2342; GFX6-NEXT: s_min_i32 s17, s12, 0 2343; GFX6-NEXT: s_add_i32 s11, s11, s16 2344; GFX6-NEXT: s_max_i32 s16, s12, 0 2345; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 2346; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2347; GFX6-NEXT: s_max_i32 s17, s17, s28 2348; GFX6-NEXT: s_min_i32 s16, s17, s16 2349; GFX6-NEXT: s_min_i32 s17, s13, 0 2350; GFX6-NEXT: s_add_i32 s12, s12, s16 2351; GFX6-NEXT: s_max_i32 s16, s13, 0 2352; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 2353; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2354; GFX6-NEXT: s_max_i32 s17, s17, s29 2355; GFX6-NEXT: s_min_i32 s16, s17, s16 2356; GFX6-NEXT: s_min_i32 s17, s14, 0 2357; GFX6-NEXT: s_add_i32 s13, s13, s16 2358; GFX6-NEXT: s_max_i32 s16, s14, 0 2359; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 2360; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2361; GFX6-NEXT: s_max_i32 s17, s17, s30 2362; GFX6-NEXT: s_min_i32 s16, s17, s16 2363; GFX6-NEXT: s_min_i32 s17, s15, 0 2364; GFX6-NEXT: s_add_i32 s14, s14, s16 2365; GFX6-NEXT: s_max_i32 s16, s15, 0 2366; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 2367; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2368; GFX6-NEXT: s_max_i32 s17, s17, s31 2369; GFX6-NEXT: s_min_i32 s16, s17, s16 2370; GFX6-NEXT: s_add_i32 s15, s15, s16 2371; GFX6-NEXT: ; return to shader part epilog 2372; 2373; GFX8-LABEL: s_saddsat_v16i32: 2374; GFX8: ; %bb.0: 2375; GFX8-NEXT: s_min_i32 s33, s0, 0 2376; GFX8-NEXT: s_max_i32 s32, s0, 0 2377; GFX8-NEXT: s_sub_i32 s33, 0x80000000, s33 2378; GFX8-NEXT: s_sub_i32 s32, 0x7fffffff, s32 2379; GFX8-NEXT: s_max_i32 s16, s33, s16 2380; GFX8-NEXT: s_min_i32 s16, s16, s32 2381; GFX8-NEXT: s_min_i32 s32, s1, 0 2382; GFX8-NEXT: s_add_i32 s0, s0, s16 2383; GFX8-NEXT: s_max_i32 s16, s1, 0 2384; GFX8-NEXT: s_sub_i32 s32, 0x80000000, s32 2385; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2386; GFX8-NEXT: s_max_i32 s17, s32, s17 2387; GFX8-NEXT: s_min_i32 s16, s17, s16 2388; GFX8-NEXT: s_min_i32 s17, s2, 0 2389; GFX8-NEXT: s_add_i32 s1, s1, s16 2390; GFX8-NEXT: s_max_i32 s16, 
s2, 0 2391; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 2392; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2393; GFX8-NEXT: s_max_i32 s17, s17, s18 2394; GFX8-NEXT: s_min_i32 s16, s17, s16 2395; GFX8-NEXT: s_min_i32 s17, s3, 0 2396; GFX8-NEXT: s_add_i32 s2, s2, s16 2397; GFX8-NEXT: s_max_i32 s16, s3, 0 2398; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 2399; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2400; GFX8-NEXT: s_max_i32 s17, s17, s19 2401; GFX8-NEXT: s_min_i32 s16, s17, s16 2402; GFX8-NEXT: s_min_i32 s17, s4, 0 2403; GFX8-NEXT: s_add_i32 s3, s3, s16 2404; GFX8-NEXT: s_max_i32 s16, s4, 0 2405; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 2406; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2407; GFX8-NEXT: s_max_i32 s17, s17, s20 2408; GFX8-NEXT: s_min_i32 s16, s17, s16 2409; GFX8-NEXT: s_min_i32 s17, s5, 0 2410; GFX8-NEXT: s_add_i32 s4, s4, s16 2411; GFX8-NEXT: s_max_i32 s16, s5, 0 2412; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 2413; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2414; GFX8-NEXT: s_max_i32 s17, s17, s21 2415; GFX8-NEXT: s_min_i32 s16, s17, s16 2416; GFX8-NEXT: s_min_i32 s17, s6, 0 2417; GFX8-NEXT: s_add_i32 s5, s5, s16 2418; GFX8-NEXT: s_max_i32 s16, s6, 0 2419; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 2420; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2421; GFX8-NEXT: s_max_i32 s17, s17, s22 2422; GFX8-NEXT: s_min_i32 s16, s17, s16 2423; GFX8-NEXT: s_min_i32 s17, s7, 0 2424; GFX8-NEXT: s_add_i32 s6, s6, s16 2425; GFX8-NEXT: s_max_i32 s16, s7, 0 2426; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 2427; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2428; GFX8-NEXT: s_max_i32 s17, s17, s23 2429; GFX8-NEXT: s_min_i32 s16, s17, s16 2430; GFX8-NEXT: s_min_i32 s17, s8, 0 2431; GFX8-NEXT: s_add_i32 s7, s7, s16 2432; GFX8-NEXT: s_max_i32 s16, s8, 0 2433; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 2434; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2435; GFX8-NEXT: s_max_i32 s17, s17, s24 2436; GFX8-NEXT: s_min_i32 s16, s17, s16 2437; GFX8-NEXT: s_min_i32 s17, s9, 0 2438; GFX8-NEXT: 
s_add_i32 s8, s8, s16 2439; GFX8-NEXT: s_max_i32 s16, s9, 0 2440; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 2441; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2442; GFX8-NEXT: s_max_i32 s17, s17, s25 2443; GFX8-NEXT: s_min_i32 s16, s17, s16 2444; GFX8-NEXT: s_min_i32 s17, s10, 0 2445; GFX8-NEXT: s_add_i32 s9, s9, s16 2446; GFX8-NEXT: s_max_i32 s16, s10, 0 2447; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 2448; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2449; GFX8-NEXT: s_max_i32 s17, s17, s26 2450; GFX8-NEXT: s_min_i32 s16, s17, s16 2451; GFX8-NEXT: s_min_i32 s17, s11, 0 2452; GFX8-NEXT: s_add_i32 s10, s10, s16 2453; GFX8-NEXT: s_max_i32 s16, s11, 0 2454; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 2455; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2456; GFX8-NEXT: s_max_i32 s17, s17, s27 2457; GFX8-NEXT: s_min_i32 s16, s17, s16 2458; GFX8-NEXT: s_min_i32 s17, s12, 0 2459; GFX8-NEXT: s_add_i32 s11, s11, s16 2460; GFX8-NEXT: s_max_i32 s16, s12, 0 2461; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 2462; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2463; GFX8-NEXT: s_max_i32 s17, s17, s28 2464; GFX8-NEXT: s_min_i32 s16, s17, s16 2465; GFX8-NEXT: s_min_i32 s17, s13, 0 2466; GFX8-NEXT: s_add_i32 s12, s12, s16 2467; GFX8-NEXT: s_max_i32 s16, s13, 0 2468; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 2469; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2470; GFX8-NEXT: s_max_i32 s17, s17, s29 2471; GFX8-NEXT: s_min_i32 s16, s17, s16 2472; GFX8-NEXT: s_min_i32 s17, s14, 0 2473; GFX8-NEXT: s_add_i32 s13, s13, s16 2474; GFX8-NEXT: s_max_i32 s16, s14, 0 2475; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 2476; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2477; GFX8-NEXT: s_max_i32 s17, s17, s30 2478; GFX8-NEXT: s_min_i32 s16, s17, s16 2479; GFX8-NEXT: s_min_i32 s17, s15, 0 2480; GFX8-NEXT: s_add_i32 s14, s14, s16 2481; GFX8-NEXT: s_max_i32 s16, s15, 0 2482; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 2483; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 2484; GFX8-NEXT: s_max_i32 s17, s17, s31 2485; GFX8-NEXT: 
s_min_i32 s16, s17, s16 2486; GFX8-NEXT: s_add_i32 s15, s15, s16 2487; GFX8-NEXT: ; return to shader part epilog 2488; 2489; GFX9-LABEL: s_saddsat_v16i32: 2490; GFX9: ; %bb.0: 2491; GFX9-NEXT: v_mov_b32_e32 v0, s16 2492; GFX9-NEXT: v_mov_b32_e32 v1, s17 2493; GFX9-NEXT: v_mov_b32_e32 v2, s18 2494; GFX9-NEXT: v_mov_b32_e32 v3, s19 2495; GFX9-NEXT: v_mov_b32_e32 v4, s20 2496; GFX9-NEXT: v_mov_b32_e32 v5, s21 2497; GFX9-NEXT: v_mov_b32_e32 v6, s22 2498; GFX9-NEXT: v_mov_b32_e32 v7, s23 2499; GFX9-NEXT: v_mov_b32_e32 v8, s24 2500; GFX9-NEXT: v_mov_b32_e32 v9, s25 2501; GFX9-NEXT: v_mov_b32_e32 v10, s26 2502; GFX9-NEXT: v_mov_b32_e32 v11, s27 2503; GFX9-NEXT: v_mov_b32_e32 v12, s28 2504; GFX9-NEXT: v_mov_b32_e32 v13, s29 2505; GFX9-NEXT: v_mov_b32_e32 v14, s30 2506; GFX9-NEXT: v_mov_b32_e32 v15, s31 2507; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp 2508; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp 2509; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp 2510; GFX9-NEXT: v_add_i32 v3, s3, v3 clamp 2511; GFX9-NEXT: v_add_i32 v4, s4, v4 clamp 2512; GFX9-NEXT: v_add_i32 v5, s5, v5 clamp 2513; GFX9-NEXT: v_add_i32 v6, s6, v6 clamp 2514; GFX9-NEXT: v_add_i32 v7, s7, v7 clamp 2515; GFX9-NEXT: v_add_i32 v8, s8, v8 clamp 2516; GFX9-NEXT: v_add_i32 v9, s9, v9 clamp 2517; GFX9-NEXT: v_add_i32 v10, s10, v10 clamp 2518; GFX9-NEXT: v_add_i32 v11, s11, v11 clamp 2519; GFX9-NEXT: v_add_i32 v12, s12, v12 clamp 2520; GFX9-NEXT: v_add_i32 v13, s13, v13 clamp 2521; GFX9-NEXT: v_add_i32 v14, s14, v14 clamp 2522; GFX9-NEXT: v_add_i32 v15, s15, v15 clamp 2523; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2524; GFX9-NEXT: v_readfirstlane_b32 s1, v1 2525; GFX9-NEXT: v_readfirstlane_b32 s2, v2 2526; GFX9-NEXT: v_readfirstlane_b32 s3, v3 2527; GFX9-NEXT: v_readfirstlane_b32 s4, v4 2528; GFX9-NEXT: v_readfirstlane_b32 s5, v5 2529; GFX9-NEXT: v_readfirstlane_b32 s6, v6 2530; GFX9-NEXT: v_readfirstlane_b32 s7, v7 2531; GFX9-NEXT: v_readfirstlane_b32 s8, v8 2532; GFX9-NEXT: v_readfirstlane_b32 s9, v9 2533; GFX9-NEXT: 
v_readfirstlane_b32 s10, v10 2534; GFX9-NEXT: v_readfirstlane_b32 s11, v11 2535; GFX9-NEXT: v_readfirstlane_b32 s12, v12 2536; GFX9-NEXT: v_readfirstlane_b32 s13, v13 2537; GFX9-NEXT: v_readfirstlane_b32 s14, v14 2538; GFX9-NEXT: v_readfirstlane_b32 s15, v15 2539; GFX9-NEXT: ; return to shader part epilog 2540; 2541; GFX10PLUS-LABEL: s_saddsat_v16i32: 2542; GFX10PLUS: ; %bb.0: 2543; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s16 clamp 2544; GFX10PLUS-NEXT: v_add_nc_i32 v1, s1, s17 clamp 2545; GFX10PLUS-NEXT: v_add_nc_i32 v2, s2, s18 clamp 2546; GFX10PLUS-NEXT: v_add_nc_i32 v3, s3, s19 clamp 2547; GFX10PLUS-NEXT: v_add_nc_i32 v4, s4, s20 clamp 2548; GFX10PLUS-NEXT: v_add_nc_i32 v5, s5, s21 clamp 2549; GFX10PLUS-NEXT: v_add_nc_i32 v6, s6, s22 clamp 2550; GFX10PLUS-NEXT: v_add_nc_i32 v7, s7, s23 clamp 2551; GFX10PLUS-NEXT: v_add_nc_i32 v8, s8, s24 clamp 2552; GFX10PLUS-NEXT: v_add_nc_i32 v9, s9, s25 clamp 2553; GFX10PLUS-NEXT: v_add_nc_i32 v10, s10, s26 clamp 2554; GFX10PLUS-NEXT: v_add_nc_i32 v11, s11, s27 clamp 2555; GFX10PLUS-NEXT: v_add_nc_i32 v12, s12, s28 clamp 2556; GFX10PLUS-NEXT: v_add_nc_i32 v13, s13, s29 clamp 2557; GFX10PLUS-NEXT: v_add_nc_i32 v14, s14, s30 clamp 2558; GFX10PLUS-NEXT: v_add_nc_i32 v15, s15, s31 clamp 2559; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 2560; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 2561; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 2562; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 2563; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4 2564; GFX10PLUS-NEXT: v_readfirstlane_b32 s5, v5 2565; GFX10PLUS-NEXT: v_readfirstlane_b32 s6, v6 2566; GFX10PLUS-NEXT: v_readfirstlane_b32 s7, v7 2567; GFX10PLUS-NEXT: v_readfirstlane_b32 s8, v8 2568; GFX10PLUS-NEXT: v_readfirstlane_b32 s9, v9 2569; GFX10PLUS-NEXT: v_readfirstlane_b32 s10, v10 2570; GFX10PLUS-NEXT: v_readfirstlane_b32 s11, v11 2571; GFX10PLUS-NEXT: v_readfirstlane_b32 s12, v12 2572; GFX10PLUS-NEXT: v_readfirstlane_b32 s13, v13 2573; GFX10PLUS-NEXT: v_readfirstlane_b32 s14, v14 2574; 
GFX10PLUS-NEXT: v_readfirstlane_b32 s15, v15 2575; GFX10PLUS-NEXT: ; return to shader part epilog 2576 %result = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) 2577 ret <16 x i32> %result 2578} 2579 2580define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) { 2581; GFX6-LABEL: v_saddsat_i16: 2582; GFX6: ; %bb.0: 2583; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2584; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2585; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 2586; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2587; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 2588; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 2589; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 2590; GFX6-NEXT: v_max_i32_e32 v1, v3, v1 2591; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 2592; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 2593; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2594; GFX6-NEXT: s_setpc_b64 s[30:31] 2595; 2596; GFX8-LABEL: v_saddsat_i16: 2597; GFX8: ; %bb.0: 2598; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2599; GFX8-NEXT: v_min_i16_e32 v3, 0, v0 2600; GFX8-NEXT: v_max_i16_e32 v2, 0, v0 2601; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3 2602; GFX8-NEXT: v_sub_u16_e32 v2, 0x7fff, v2 2603; GFX8-NEXT: v_max_i16_e32 v1, v3, v1 2604; GFX8-NEXT: v_min_i16_e32 v1, v1, v2 2605; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 2606; GFX8-NEXT: s_setpc_b64 s[30:31] 2607; 2608; GFX9-LABEL: v_saddsat_i16: 2609; GFX9: ; %bb.0: 2610; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2611; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp 2612; GFX9-NEXT: s_setpc_b64 s[30:31] 2613; 2614; GFX10PLUS-LABEL: v_saddsat_i16: 2615; GFX10PLUS: ; %bb.0: 2616; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2617; GFX10PLUS-NEXT: v_add_nc_i16 v0, v0, v1 clamp 2618; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 2619 %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) 2620 ret i16 %result 2621} 2622 2623define amdgpu_ps i16 @s_saddsat_i16(i16 inreg %lhs, i16 inreg %rhs) { 2624; GFX6-LABEL: s_saddsat_i16: 2625; GFX6: ; %bb.0: 2626; 
GFX6-NEXT: s_lshl_b32 s0, s0, 16 2627; GFX6-NEXT: s_min_i32 s3, s0, 0 2628; GFX6-NEXT: s_lshl_b32 s1, s1, 16 2629; GFX6-NEXT: s_max_i32 s2, s0, 0 2630; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 2631; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 2632; GFX6-NEXT: s_max_i32 s1, s3, s1 2633; GFX6-NEXT: s_min_i32 s1, s1, s2 2634; GFX6-NEXT: s_add_i32 s0, s0, s1 2635; GFX6-NEXT: s_ashr_i32 s0, s0, 16 2636; GFX6-NEXT: ; return to shader part epilog 2637; 2638; GFX8-LABEL: s_saddsat_i16: 2639; GFX8: ; %bb.0: 2640; GFX8-NEXT: s_sext_i32_i16 s2, s0 2641; GFX8-NEXT: s_sext_i32_i16 s3, 0 2642; GFX8-NEXT: s_max_i32 s4, s2, s3 2643; GFX8-NEXT: s_min_i32 s2, s2, s3 2644; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 2645; GFX8-NEXT: s_sext_i32_i16 s2, s2 2646; GFX8-NEXT: s_sext_i32_i16 s1, s1 2647; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 2648; GFX8-NEXT: s_max_i32 s1, s2, s1 2649; GFX8-NEXT: s_sext_i32_i16 s1, s1 2650; GFX8-NEXT: s_sext_i32_i16 s2, s4 2651; GFX8-NEXT: s_min_i32 s1, s1, s2 2652; GFX8-NEXT: s_add_i32 s0, s0, s1 2653; GFX8-NEXT: ; return to shader part epilog 2654; 2655; GFX9-LABEL: s_saddsat_i16: 2656; GFX9: ; %bb.0: 2657; GFX9-NEXT: v_mov_b32_e32 v0, s1 2658; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp 2659; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2660; GFX9-NEXT: ; return to shader part epilog 2661; 2662; GFX10PLUS-LABEL: s_saddsat_i16: 2663; GFX10PLUS: ; %bb.0: 2664; GFX10PLUS-NEXT: v_add_nc_i16 v0, s0, s1 clamp 2665; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 2666; GFX10PLUS-NEXT: ; return to shader part epilog 2667 %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) 2668 ret i16 %result 2669} 2670 2671define amdgpu_ps half @saddsat_i16_sv(i16 inreg %lhs, i16 %rhs) { 2672; GFX6-LABEL: saddsat_i16_sv: 2673; GFX6: ; %bb.0: 2674; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2675; GFX6-NEXT: s_min_i32 s2, s0, 0 2676; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2677; GFX6-NEXT: s_max_i32 s1, s0, 0 2678; GFX6-NEXT: s_sub_i32 s2, 0x80000000, s2 2679; GFX6-NEXT: s_sub_i32 s1, 0x7fffffff, s1 2680; 
GFX6-NEXT: v_max_i32_e32 v0, s2, v0 2681; GFX6-NEXT: v_min_i32_e32 v0, s1, v0 2682; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 2683; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2684; GFX6-NEXT: ; return to shader part epilog 2685; 2686; GFX8-LABEL: saddsat_i16_sv: 2687; GFX8: ; %bb.0: 2688; GFX8-NEXT: s_sext_i32_i16 s1, s0 2689; GFX8-NEXT: s_sext_i32_i16 s2, 0 2690; GFX8-NEXT: s_max_i32 s3, s1, s2 2691; GFX8-NEXT: s_min_i32 s1, s1, s2 2692; GFX8-NEXT: s_sub_i32 s1, 0xffff8000, s1 2693; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3 2694; GFX8-NEXT: v_max_i16_e32 v0, s1, v0 2695; GFX8-NEXT: v_min_i16_e32 v0, s3, v0 2696; GFX8-NEXT: v_add_u16_e32 v0, s0, v0 2697; GFX8-NEXT: ; return to shader part epilog 2698; 2699; GFX9-LABEL: saddsat_i16_sv: 2700; GFX9: ; %bb.0: 2701; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp 2702; GFX9-NEXT: ; return to shader part epilog 2703; 2704; GFX10PLUS-LABEL: saddsat_i16_sv: 2705; GFX10PLUS: ; %bb.0: 2706; GFX10PLUS-NEXT: v_add_nc_i16 v0, s0, v0 clamp 2707; GFX10PLUS-NEXT: ; return to shader part epilog 2708 %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) 2709 %cast = bitcast i16 %result to half 2710 ret half %cast 2711} 2712 2713define amdgpu_ps half @saddsat_i16_vs(i16 %lhs, i16 inreg %rhs) { 2714; GFX6-LABEL: saddsat_i16_vs: 2715; GFX6: ; %bb.0: 2716; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2717; GFX6-NEXT: v_min_i32_e32 v2, 0, v0 2718; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2719; GFX6-NEXT: v_max_i32_e32 v1, 0, v0 2720; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x80000000, v2 2721; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0x7fffffff, v1 2722; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 2723; GFX6-NEXT: v_min_i32_e32 v1, v2, v1 2724; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 2725; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2726; GFX6-NEXT: ; return to shader part epilog 2727; 2728; GFX8-LABEL: saddsat_i16_vs: 2729; GFX8: ; %bb.0: 2730; GFX8-NEXT: v_min_i16_e32 v2, 0, v0 2731; GFX8-NEXT: v_max_i16_e32 v1, 0, v0 2732; GFX8-NEXT: v_sub_u16_e32 v2, 0x8000, v2 2733; GFX8-NEXT: 
v_sub_u16_e32 v1, 0x7fff, v1 2734; GFX8-NEXT: v_max_i16_e32 v2, s0, v2 2735; GFX8-NEXT: v_min_i16_e32 v1, v2, v1 2736; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 2737; GFX8-NEXT: ; return to shader part epilog 2738; 2739; GFX9-LABEL: saddsat_i16_vs: 2740; GFX9: ; %bb.0: 2741; GFX9-NEXT: v_add_i16 v0, v0, s0 clamp 2742; GFX9-NEXT: ; return to shader part epilog 2743; 2744; GFX10PLUS-LABEL: saddsat_i16_vs: 2745; GFX10PLUS: ; %bb.0: 2746; GFX10PLUS-NEXT: v_add_nc_i16 v0, v0, s0 clamp 2747; GFX10PLUS-NEXT: ; return to shader part epilog 2748 %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) 2749 %cast = bitcast i16 %result to half 2750 ret half %cast 2751} 2752 2753define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { 2754; GFX6-LABEL: v_saddsat_v2i16: 2755; GFX6: ; %bb.0: 2756; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2757; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2758; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 2759; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2760; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 2761; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5 2762; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4 2763; GFX6-NEXT: v_max_i32_e32 v2, v5, v2 2764; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2765; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 2766; GFX6-NEXT: v_min_i32_e32 v4, 0, v1 2767; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2768; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 2769; GFX6-NEXT: v_max_i32_e32 v3, 0, v1 2770; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x80000000, v4 2771; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x7fffffff, v3 2772; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 2773; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 2774; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 2775; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2776; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 2777; GFX6-NEXT: s_setpc_b64 s[30:31] 2778; 2779; GFX8-LABEL: v_saddsat_v2i16: 2780; GFX8: ; %bb.0: 2781; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2782; GFX8-NEXT: v_min_i16_e32 v3, 0, v0 2783; GFX8-NEXT: 
v_max_i16_e32 v2, 0, v0 2784; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3 2785; GFX8-NEXT: v_sub_u16_e32 v2, 0x7fff, v2 2786; GFX8-NEXT: v_max_i16_e32 v3, v3, v1 2787; GFX8-NEXT: v_min_i16_e32 v2, v3, v2 2788; GFX8-NEXT: v_mov_b32_e32 v3, 0 2789; GFX8-NEXT: v_max_i16_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2790; GFX8-NEXT: v_min_i16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2791; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3 2792; GFX8-NEXT: v_sub_u16_e32 v4, 0x7fff, v4 2793; GFX8-NEXT: v_max_i16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2794; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 2795; GFX8-NEXT: v_add_u16_e32 v2, v0, v2 2796; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2797; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 2798; GFX8-NEXT: s_setpc_b64 s[30:31] 2799; 2800; GFX9-LABEL: v_saddsat_v2i16: 2801; GFX9: ; %bb.0: 2802; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2803; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp 2804; GFX9-NEXT: s_setpc_b64 s[30:31] 2805; 2806; GFX10PLUS-LABEL: v_saddsat_v2i16: 2807; GFX10PLUS: ; %bb.0: 2808; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2809; GFX10PLUS-NEXT: v_pk_add_i16 v0, v0, v1 clamp 2810; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 2811 %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 2812 ret <2 x i16> %result 2813} 2814 2815define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) { 2816; GFX6-LABEL: s_saddsat_v2i16: 2817; GFX6: ; %bb.0: 2818; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2819; GFX6-NEXT: s_min_i32 s5, s0, 0 2820; GFX6-NEXT: s_lshl_b32 s2, s2, 16 2821; GFX6-NEXT: s_max_i32 s4, s0, 0 2822; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 2823; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 2824; GFX6-NEXT: s_max_i32 s2, s5, s2 2825; GFX6-NEXT: s_lshl_b32 s1, s1, 16 2826; GFX6-NEXT: s_min_i32 s2, s2, s4 2827; 
GFX6-NEXT: s_min_i32 s4, s1, 0 2828; GFX6-NEXT: s_add_i32 s0, s0, s2 2829; GFX6-NEXT: s_lshl_b32 s2, s3, 16 2830; GFX6-NEXT: s_max_i32 s3, s1, 0 2831; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4 2832; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3 2833; GFX6-NEXT: s_max_i32 s2, s4, s2 2834; GFX6-NEXT: s_min_i32 s2, s2, s3 2835; GFX6-NEXT: s_add_i32 s1, s1, s2 2836; GFX6-NEXT: s_ashr_i32 s1, s1, 16 2837; GFX6-NEXT: s_ashr_i32 s0, s0, 16 2838; GFX6-NEXT: s_and_b32 s1, s1, 0xffff 2839; GFX6-NEXT: s_and_b32 s0, s0, 0xffff 2840; GFX6-NEXT: s_lshl_b32 s1, s1, 16 2841; GFX6-NEXT: s_or_b32 s0, s0, s1 2842; GFX6-NEXT: ; return to shader part epilog 2843; 2844; GFX8-LABEL: s_saddsat_v2i16: 2845; GFX8: ; %bb.0: 2846; GFX8-NEXT: s_sext_i32_i16 s4, s0 2847; GFX8-NEXT: s_sext_i32_i16 s5, 0 2848; GFX8-NEXT: s_max_i32 s6, s4, s5 2849; GFX8-NEXT: s_min_i32 s4, s4, s5 2850; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 2851; GFX8-NEXT: s_lshr_b32 s3, s1, 16 2852; GFX8-NEXT: s_sext_i32_i16 s4, s4 2853; GFX8-NEXT: s_sext_i32_i16 s1, s1 2854; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 2855; GFX8-NEXT: s_max_i32 s1, s4, s1 2856; GFX8-NEXT: s_sext_i32_i16 s1, s1 2857; GFX8-NEXT: s_sext_i32_i16 s4, s6 2858; GFX8-NEXT: s_lshr_b32 s2, s0, 16 2859; GFX8-NEXT: s_min_i32 s1, s1, s4 2860; GFX8-NEXT: s_add_i32 s0, s0, s1 2861; GFX8-NEXT: s_sext_i32_i16 s1, s2 2862; GFX8-NEXT: s_max_i32 s4, s1, s5 2863; GFX8-NEXT: s_min_i32 s1, s1, s5 2864; GFX8-NEXT: s_sub_i32 s1, 0xffff8000, s1 2865; GFX8-NEXT: s_sext_i32_i16 s1, s1 2866; GFX8-NEXT: s_sext_i32_i16 s3, s3 2867; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 2868; GFX8-NEXT: s_max_i32 s1, s1, s3 2869; GFX8-NEXT: s_sext_i32_i16 s1, s1 2870; GFX8-NEXT: s_sext_i32_i16 s3, s4 2871; GFX8-NEXT: s_min_i32 s1, s1, s3 2872; GFX8-NEXT: s_add_i32 s2, s2, s1 2873; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 2874; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 2875; GFX8-NEXT: s_lshl_b32 s1, s1, 16 2876; GFX8-NEXT: s_or_b32 s0, s0, s1 2877; GFX8-NEXT: ; return to shader part epilog 2878; 2879; GFX9-LABEL: 
s_saddsat_v2i16: 2880; GFX9: ; %bb.0: 2881; GFX9-NEXT: v_mov_b32_e32 v0, s1 2882; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp 2883; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2884; GFX9-NEXT: ; return to shader part epilog 2885; 2886; GFX10PLUS-LABEL: s_saddsat_v2i16: 2887; GFX10PLUS: ; %bb.0: 2888; GFX10PLUS-NEXT: v_pk_add_i16 v0, s0, s1 clamp 2889; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 2890; GFX10PLUS-NEXT: ; return to shader part epilog 2891 %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 2892 %cast = bitcast <2 x i16> %result to i32 2893 ret i32 %cast 2894} 2895 2896define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { 2897; GFX6-LABEL: saddsat_v2i16_sv: 2898; GFX6: ; %bb.0: 2899; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2900; GFX6-NEXT: s_min_i32 s3, s0, 0 2901; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2902; GFX6-NEXT: s_max_i32 s2, s0, 0 2903; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 2904; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 2905; GFX6-NEXT: v_max_i32_e32 v0, s3, v0 2906; GFX6-NEXT: v_min_i32_e32 v0, s2, v0 2907; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 2908; GFX6-NEXT: s_lshl_b32 s0, s1, 16 2909; GFX6-NEXT: s_min_i32 s2, s0, 0 2910; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2911; GFX6-NEXT: s_max_i32 s1, s0, 0 2912; GFX6-NEXT: s_sub_i32 s2, 0x80000000, s2 2913; GFX6-NEXT: s_sub_i32 s1, 0x7fffffff, s1 2914; GFX6-NEXT: v_max_i32_e32 v1, s2, v1 2915; GFX6-NEXT: v_min_i32_e32 v1, s1, v1 2916; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1 2917; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 2918; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2919; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 2920; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 2921; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2922; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 2923; GFX6-NEXT: ; return to shader part epilog 2924; 2925; GFX8-LABEL: saddsat_v2i16_sv: 2926; GFX8: ; %bb.0: 2927; GFX8-NEXT: s_sext_i32_i16 s2, s0 2928; GFX8-NEXT: s_sext_i32_i16 s3, 0 2929; GFX8-NEXT: s_max_i32 s4, s2, 
s3 2930; GFX8-NEXT: s_min_i32 s2, s2, s3 2931; GFX8-NEXT: s_lshr_b32 s1, s0, 16 2932; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 2933; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 2934; GFX8-NEXT: v_max_i16_e32 v1, s2, v0 2935; GFX8-NEXT: s_sext_i32_i16 s2, s1 2936; GFX8-NEXT: v_min_i16_e32 v1, s4, v1 2937; GFX8-NEXT: s_max_i32 s4, s2, s3 2938; GFX8-NEXT: s_min_i32 s2, s2, s3 2939; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 2940; GFX8-NEXT: v_mov_b32_e32 v2, s2 2941; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 2942; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2943; GFX8-NEXT: v_min_i16_e32 v0, s4, v0 2944; GFX8-NEXT: v_mov_b32_e32 v2, s1 2945; GFX8-NEXT: v_add_u16_e32 v1, s0, v1 2946; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2947; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 2948; GFX8-NEXT: ; return to shader part epilog 2949; 2950; GFX9-LABEL: saddsat_v2i16_sv: 2951; GFX9: ; %bb.0: 2952; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp 2953; GFX9-NEXT: ; return to shader part epilog 2954; 2955; GFX10PLUS-LABEL: saddsat_v2i16_sv: 2956; GFX10PLUS: ; %bb.0: 2957; GFX10PLUS-NEXT: v_pk_add_i16 v0, s0, v0 clamp 2958; GFX10PLUS-NEXT: ; return to shader part epilog 2959 %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 2960 %cast = bitcast <2 x i16> %result to float 2961 ret float %cast 2962} 2963 2964define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { 2965; GFX6-LABEL: saddsat_v2i16_vs: 2966; GFX6: ; %bb.0: 2967; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2968; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 2969; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2970; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 2971; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 2972; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 2973; GFX6-NEXT: v_max_i32_e32 v3, s0, v3 2974; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2975; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 2976; GFX6-NEXT: 
v_min_i32_e32 v3, 0, v1 2977; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2978; GFX6-NEXT: s_lshl_b32 s0, s1, 16 2979; GFX6-NEXT: v_max_i32_e32 v2, 0, v1 2980; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 2981; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 2982; GFX6-NEXT: v_max_i32_e32 v3, s0, v3 2983; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 2984; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 2985; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 2986; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2987; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 2988; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 2989; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2990; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 2991; GFX6-NEXT: ; return to shader part epilog 2992; 2993; GFX8-LABEL: saddsat_v2i16_vs: 2994; GFX8: ; %bb.0: 2995; GFX8-NEXT: v_min_i16_e32 v2, 0, v0 2996; GFX8-NEXT: v_max_i16_e32 v1, 0, v0 2997; GFX8-NEXT: v_sub_u16_e32 v2, 0x8000, v2 2998; GFX8-NEXT: v_sub_u16_e32 v1, 0x7fff, v1 2999; GFX8-NEXT: v_max_i16_e32 v2, s0, v2 3000; GFX8-NEXT: v_min_i16_e32 v1, v2, v1 3001; GFX8-NEXT: v_mov_b32_e32 v2, 0 3002; GFX8-NEXT: v_max_i16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3003; GFX8-NEXT: v_min_i16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3004; GFX8-NEXT: s_lshr_b32 s1, s0, 16 3005; GFX8-NEXT: v_sub_u16_e32 v2, 0x8000, v2 3006; GFX8-NEXT: v_sub_u16_e32 v3, 0x7fff, v3 3007; GFX8-NEXT: v_max_i16_e32 v2, s1, v2 3008; GFX8-NEXT: v_min_i16_e32 v2, v2, v3 3009; GFX8-NEXT: v_add_u16_e32 v1, v0, v1 3010; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3011; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 3012; GFX8-NEXT: ; return to shader part epilog 3013; 3014; GFX9-LABEL: saddsat_v2i16_vs: 3015; GFX9: ; %bb.0: 3016; GFX9-NEXT: v_pk_add_i16 v0, v0, s0 clamp 3017; GFX9-NEXT: ; return to shader part epilog 3018; 3019; GFX10PLUS-LABEL: saddsat_v2i16_vs: 3020; GFX10PLUS: ; %bb.0: 3021; GFX10PLUS-NEXT: 
v_pk_add_i16 v0, v0, s0 clamp 3022; GFX10PLUS-NEXT: ; return to shader part epilog 3023 %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 3024 %cast = bitcast <2 x i16> %result to float 3025 ret float %cast 3026} 3027 3028; FIXME: v3i16 insert/extract 3029; define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { 3030; %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) 3031; ret <3 x i16> %result 3032; } 3033 3034; define amdgpu_ps <3 x i16> @s_saddsat_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs) { 3035; %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) 3036; ret <3 x i16> %result 3037; } 3038 3039define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { 3040; GFX6-LABEL: v_saddsat_v4i16: 3041; GFX6: ; %bb.0: 3042; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3043; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 3044; GFX6-NEXT: v_min_i32_e32 v10, 0, v0 3045; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 3046; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 3047; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 3048; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v11, v10 3049; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0x7fffffff, v8 3050; GFX6-NEXT: v_max_i32_e32 v4, v10, v4 3051; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3052; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 3053; GFX6-NEXT: v_min_i32_e32 v8, 0, v1 3054; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 3055; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 3056; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 3057; GFX6-NEXT: v_max_i32_e32 v5, 0, v1 3058; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v11, v8 3059; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 3060; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 3061; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 3062; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3063; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 3064; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 3065; GFX6-NEXT: v_min_i32_e32 v6, 0, v2 3066; GFX6-NEXT: v_max_i32_e32 v5, 0, v2 3067; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, 
v6 3068; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 3069; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 3070; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3071; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 3072; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 3073; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 3074; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 3075; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 3076; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6 3077; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 3078; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 3079; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 3080; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 3081; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 3082; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 3083; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 3084; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 3085; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 3086; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 3087; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3088; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 3089; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 3090; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 3091; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3092; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3093; GFX6-NEXT: s_setpc_b64 s[30:31] 3094; 3095; GFX8-LABEL: v_saddsat_v4i16: 3096; GFX8: ; %bb.0: 3097; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3098; GFX8-NEXT: v_min_i16_e32 v5, 0, v0 3099; GFX8-NEXT: v_max_i16_e32 v4, 0, v0 3100; GFX8-NEXT: v_sub_u16_e32 v5, 0x8000, v5 3101; GFX8-NEXT: v_sub_u16_e32 v4, 0x7fff, v4 3102; GFX8-NEXT: v_max_i16_e32 v5, v5, v2 3103; GFX8-NEXT: v_min_i16_e32 v4, v5, v4 3104; GFX8-NEXT: v_mov_b32_e32 v5, 0 3105; GFX8-NEXT: v_min_i16_sdwa v7, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3106; GFX8-NEXT: v_max_i16_sdwa v6, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3107; GFX8-NEXT: v_sub_u16_e32 v7, 0x8000, v7 3108; GFX8-NEXT: v_sub_u16_e32 v6, 0x7fff, v6 3109; GFX8-NEXT: v_max_i16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 
3110; GFX8-NEXT: v_min_i16_e32 v7, 0, v1 3111; GFX8-NEXT: v_min_i16_e32 v2, v2, v6 3112; GFX8-NEXT: v_max_i16_e32 v6, 0, v1 3113; GFX8-NEXT: v_sub_u16_e32 v7, 0x8000, v7 3114; GFX8-NEXT: v_sub_u16_e32 v6, 0x7fff, v6 3115; GFX8-NEXT: v_max_i16_e32 v7, v7, v3 3116; GFX8-NEXT: v_min_i16_e32 v6, v7, v6 3117; GFX8-NEXT: v_max_i16_sdwa v7, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3118; GFX8-NEXT: v_min_i16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3119; GFX8-NEXT: v_sub_u16_e32 v5, 0x8000, v5 3120; GFX8-NEXT: v_sub_u16_e32 v7, 0x7fff, v7 3121; GFX8-NEXT: v_max_i16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3122; GFX8-NEXT: v_min_i16_e32 v3, v3, v7 3123; GFX8-NEXT: v_add_u16_e32 v4, v0, v4 3124; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3125; GFX8-NEXT: v_add_u16_e32 v2, v1, v6 3126; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3127; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 3128; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 3129; GFX8-NEXT: s_setpc_b64 s[30:31] 3130; 3131; GFX9-LABEL: v_saddsat_v4i16: 3132; GFX9: ; %bb.0: 3133; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3134; GFX9-NEXT: v_pk_add_i16 v0, v0, v2 clamp 3135; GFX9-NEXT: v_pk_add_i16 v1, v1, v3 clamp 3136; GFX9-NEXT: s_setpc_b64 s[30:31] 3137; 3138; GFX10PLUS-LABEL: v_saddsat_v4i16: 3139; GFX10PLUS: ; %bb.0: 3140; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3141; GFX10PLUS-NEXT: v_pk_add_i16 v0, v0, v2 clamp 3142; GFX10PLUS-NEXT: v_pk_add_i16 v1, v1, v3 clamp 3143; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 3144 %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) 3145 %cast = bitcast <4 x i16> %result to <2 x float> 3146 ret <2 x float> %cast 3147} 3148 3149define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) { 3150; 
GFX6-LABEL: s_saddsat_v4i16: 3151; GFX6: ; %bb.0: 3152; GFX6-NEXT: s_lshl_b32 s0, s0, 16 3153; GFX6-NEXT: s_min_i32 s9, s0, 0 3154; GFX6-NEXT: s_lshl_b32 s4, s4, 16 3155; GFX6-NEXT: s_max_i32 s8, s0, 0 3156; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9 3157; GFX6-NEXT: s_sub_i32 s8, 0x7fffffff, s8 3158; GFX6-NEXT: s_max_i32 s4, s9, s4 3159; GFX6-NEXT: s_lshl_b32 s1, s1, 16 3160; GFX6-NEXT: s_min_i32 s4, s4, s8 3161; GFX6-NEXT: s_min_i32 s8, s1, 0 3162; GFX6-NEXT: s_add_i32 s0, s0, s4 3163; GFX6-NEXT: s_lshl_b32 s4, s5, 16 3164; GFX6-NEXT: s_max_i32 s5, s1, 0 3165; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 3166; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 3167; GFX6-NEXT: s_max_i32 s4, s8, s4 3168; GFX6-NEXT: s_min_i32 s4, s4, s5 3169; GFX6-NEXT: s_lshl_b32 s2, s2, 16 3170; GFX6-NEXT: s_add_i32 s1, s1, s4 3171; GFX6-NEXT: s_lshl_b32 s4, s6, 16 3172; GFX6-NEXT: s_min_i32 s6, s2, 0 3173; GFX6-NEXT: s_max_i32 s5, s2, 0 3174; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 3175; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 3176; GFX6-NEXT: s_max_i32 s4, s6, s4 3177; GFX6-NEXT: s_lshl_b32 s3, s3, 16 3178; GFX6-NEXT: s_min_i32 s4, s4, s5 3179; GFX6-NEXT: s_min_i32 s6, s3, 0 3180; GFX6-NEXT: s_add_i32 s2, s2, s4 3181; GFX6-NEXT: s_lshl_b32 s4, s7, 16 3182; GFX6-NEXT: s_max_i32 s5, s3, 0 3183; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 3184; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 3185; GFX6-NEXT: s_max_i32 s4, s6, s4 3186; GFX6-NEXT: s_ashr_i32 s1, s1, 16 3187; GFX6-NEXT: s_min_i32 s4, s4, s5 3188; GFX6-NEXT: s_ashr_i32 s0, s0, 16 3189; GFX6-NEXT: s_add_i32 s3, s3, s4 3190; GFX6-NEXT: s_and_b32 s1, s1, 0xffff 3191; GFX6-NEXT: s_ashr_i32 s2, s2, 16 3192; GFX6-NEXT: s_ashr_i32 s3, s3, 16 3193; GFX6-NEXT: s_and_b32 s0, s0, 0xffff 3194; GFX6-NEXT: s_lshl_b32 s1, s1, 16 3195; GFX6-NEXT: s_or_b32 s0, s0, s1 3196; GFX6-NEXT: s_and_b32 s1, s2, 0xffff 3197; GFX6-NEXT: s_and_b32 s2, s3, 0xffff 3198; GFX6-NEXT: s_lshl_b32 s2, s2, 16 3199; GFX6-NEXT: s_or_b32 s1, s1, s2 3200; GFX6-NEXT: ; return to shader part 
epilog 3201; 3202; GFX8-LABEL: s_saddsat_v4i16: 3203; GFX8: ; %bb.0: 3204; GFX8-NEXT: s_sext_i32_i16 s8, s0 3205; GFX8-NEXT: s_sext_i32_i16 s9, 0 3206; GFX8-NEXT: s_max_i32 s10, s8, s9 3207; GFX8-NEXT: s_min_i32 s8, s8, s9 3208; GFX8-NEXT: s_sub_i32 s8, 0xffff8000, s8 3209; GFX8-NEXT: s_lshr_b32 s6, s2, 16 3210; GFX8-NEXT: s_sext_i32_i16 s8, s8 3211; GFX8-NEXT: s_sext_i32_i16 s2, s2 3212; GFX8-NEXT: s_sub_i32 s10, 0x7fff, s10 3213; GFX8-NEXT: s_max_i32 s2, s8, s2 3214; GFX8-NEXT: s_sext_i32_i16 s2, s2 3215; GFX8-NEXT: s_sext_i32_i16 s8, s10 3216; GFX8-NEXT: s_lshr_b32 s4, s0, 16 3217; GFX8-NEXT: s_min_i32 s2, s2, s8 3218; GFX8-NEXT: s_add_i32 s0, s0, s2 3219; GFX8-NEXT: s_sext_i32_i16 s2, s4 3220; GFX8-NEXT: s_max_i32 s8, s2, s9 3221; GFX8-NEXT: s_min_i32 s2, s2, s9 3222; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 3223; GFX8-NEXT: s_sext_i32_i16 s2, s2 3224; GFX8-NEXT: s_sext_i32_i16 s6, s6 3225; GFX8-NEXT: s_sub_i32 s8, 0x7fff, s8 3226; GFX8-NEXT: s_max_i32 s2, s2, s6 3227; GFX8-NEXT: s_sext_i32_i16 s2, s2 3228; GFX8-NEXT: s_sext_i32_i16 s6, s8 3229; GFX8-NEXT: s_min_i32 s2, s2, s6 3230; GFX8-NEXT: s_add_i32 s4, s4, s2 3231; GFX8-NEXT: s_sext_i32_i16 s2, s1 3232; GFX8-NEXT: s_max_i32 s6, s2, s9 3233; GFX8-NEXT: s_min_i32 s2, s2, s9 3234; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 3235; GFX8-NEXT: s_lshr_b32 s7, s3, 16 3236; GFX8-NEXT: s_sext_i32_i16 s2, s2 3237; GFX8-NEXT: s_sext_i32_i16 s3, s3 3238; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 3239; GFX8-NEXT: s_max_i32 s2, s2, s3 3240; GFX8-NEXT: s_sext_i32_i16 s2, s2 3241; GFX8-NEXT: s_sext_i32_i16 s3, s6 3242; GFX8-NEXT: s_lshr_b32 s5, s1, 16 3243; GFX8-NEXT: s_min_i32 s2, s2, s3 3244; GFX8-NEXT: s_add_i32 s1, s1, s2 3245; GFX8-NEXT: s_sext_i32_i16 s2, s5 3246; GFX8-NEXT: s_max_i32 s3, s2, s9 3247; GFX8-NEXT: s_min_i32 s2, s2, s9 3248; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 3249; GFX8-NEXT: s_sext_i32_i16 s2, s2 3250; GFX8-NEXT: s_sext_i32_i16 s6, s7 3251; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3 3252; GFX8-NEXT: s_max_i32 s2, 
s2, s6 3253; GFX8-NEXT: s_sext_i32_i16 s2, s2 3254; GFX8-NEXT: s_sext_i32_i16 s3, s3 3255; GFX8-NEXT: s_min_i32 s2, s2, s3 3256; GFX8-NEXT: s_add_i32 s5, s5, s2 3257; GFX8-NEXT: s_and_b32 s2, 0xffff, s4 3258; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 3259; GFX8-NEXT: s_lshl_b32 s2, s2, 16 3260; GFX8-NEXT: s_or_b32 s0, s0, s2 3261; GFX8-NEXT: s_and_b32 s2, 0xffff, s5 3262; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 3263; GFX8-NEXT: s_lshl_b32 s2, s2, 16 3264; GFX8-NEXT: s_or_b32 s1, s1, s2 3265; GFX8-NEXT: ; return to shader part epilog 3266; 3267; GFX9-LABEL: s_saddsat_v4i16: 3268; GFX9: ; %bb.0: 3269; GFX9-NEXT: v_mov_b32_e32 v0, s2 3270; GFX9-NEXT: v_mov_b32_e32 v1, s3 3271; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp 3272; GFX9-NEXT: v_pk_add_i16 v1, s1, v1 clamp 3273; GFX9-NEXT: v_readfirstlane_b32 s0, v0 3274; GFX9-NEXT: v_readfirstlane_b32 s1, v1 3275; GFX9-NEXT: ; return to shader part epilog 3276; 3277; GFX10PLUS-LABEL: s_saddsat_v4i16: 3278; GFX10PLUS: ; %bb.0: 3279; GFX10PLUS-NEXT: v_pk_add_i16 v0, s0, s2 clamp 3280; GFX10PLUS-NEXT: v_pk_add_i16 v1, s1, s3 clamp 3281; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 3282; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 3283; GFX10PLUS-NEXT: ; return to shader part epilog 3284 %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) 3285 %cast = bitcast <4 x i16> %result to <2 x i32> 3286 ret <2 x i32> %cast 3287} 3288 3289; FIXME 3290; define <5 x i16> @v_saddsat_v5i16(<5 x i16> %lhs, <5 x i16> %rhs) { 3291; %result = call <5 x i16> @llvm.sadd.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs) 3292; ret <5 x i16> %result 3293; } 3294 3295; define amdgpu_ps <5 x i16> @s_saddsat_v5i16(<5 x i16> inreg %lhs, <5 x i16> inreg %rhs) { 3296; %result = call <5 x i16> @llvm.sadd.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs) 3297; ret <5 x i16> %result 3298; } 3299 3300define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { 3301; GFX6-LABEL: v_saddsat_v6i16: 3302; GFX6: ; %bb.0: 3303; GFX6-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 3304; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 3305; GFX6-NEXT: v_min_i32_e32 v14, 0, v0 3306; GFX6-NEXT: v_bfrev_b32_e32 v15, 1 3307; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 3308; GFX6-NEXT: v_max_i32_e32 v12, 0, v0 3309; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v15, v14 3310; GFX6-NEXT: v_sub_i32_e32 v12, vcc, 0x7fffffff, v12 3311; GFX6-NEXT: v_max_i32_e32 v6, v14, v6 3312; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3313; GFX6-NEXT: v_min_i32_e32 v6, v6, v12 3314; GFX6-NEXT: v_min_i32_e32 v12, 0, v1 3315; GFX6-NEXT: v_bfrev_b32_e32 v13, -2 3316; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 3317; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 3318; GFX6-NEXT: v_max_i32_e32 v7, 0, v1 3319; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v15, v12 3320; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 3321; GFX6-NEXT: v_max_i32_e32 v6, v12, v6 3322; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 3323; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3324; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v6 3325; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 3326; GFX6-NEXT: v_min_i32_e32 v8, 0, v2 3327; GFX6-NEXT: v_max_i32_e32 v7, 0, v2 3328; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 3329; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 3330; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 3331; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3332; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 3333; GFX6-NEXT: v_min_i32_e32 v8, 0, v3 3334; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 3335; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 3336; GFX6-NEXT: v_max_i32_e32 v7, 0, v3 3337; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 3338; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 3339; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 3340; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 3341; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 3342; GFX6-NEXT: v_min_i32_e32 v8, 0, v4 3343; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 3344; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 3345; GFX6-NEXT: v_max_i32_e32 v7, 0, v4 3346; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 3347; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 
3348; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 3349; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 3350; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 3351; GFX6-NEXT: v_min_i32_e32 v8, 0, v5 3352; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 3353; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 3354; GFX6-NEXT: v_max_i32_e32 v7, 0, v5 3355; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 3356; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 3357; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 3358; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 3359; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 3360; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 3361; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 3362; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 3363; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 3364; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 3365; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 3366; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3367; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 3368; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 3369; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 3370; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 3371; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 3372; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3373; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 3374; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3375; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 3376; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3377; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 3378; GFX6-NEXT: s_setpc_b64 s[30:31] 3379; 3380; GFX8-LABEL: v_saddsat_v6i16: 3381; GFX8: ; %bb.0: 3382; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3383; GFX8-NEXT: v_min_i16_e32 v7, 0, v0 3384; GFX8-NEXT: v_max_i16_e32 v6, 0, v0 3385; GFX8-NEXT: v_sub_u16_e32 v7, 0x8000, v7 3386; GFX8-NEXT: v_sub_u16_e32 v6, 0x7fff, v6 3387; GFX8-NEXT: v_max_i16_e32 v7, v7, v3 3388; GFX8-NEXT: v_min_i16_e32 v6, v7, v6 3389; GFX8-NEXT: v_mov_b32_e32 v7, 0 3390; GFX8-NEXT: v_min_i16_sdwa v9, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3391; GFX8-NEXT: v_max_i16_sdwa v8, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD 3392; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9 3393; GFX8-NEXT: v_sub_u16_e32 v8, 0x7fff, v8 3394; GFX8-NEXT: v_max_i16_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3395; GFX8-NEXT: v_min_i16_e32 v9, 0, v1 3396; GFX8-NEXT: v_min_i16_e32 v3, v3, v8 3397; GFX8-NEXT: v_max_i16_e32 v8, 0, v1 3398; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9 3399; GFX8-NEXT: v_sub_u16_e32 v8, 0x7fff, v8 3400; GFX8-NEXT: v_max_i16_e32 v9, v9, v4 3401; GFX8-NEXT: v_min_i16_sdwa v10, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3402; GFX8-NEXT: v_min_i16_e32 v8, v9, v8 3403; GFX8-NEXT: v_max_i16_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3404; GFX8-NEXT: v_sub_u16_e32 v10, 0x8000, v10 3405; GFX8-NEXT: v_sub_u16_e32 v9, 0x7fff, v9 3406; GFX8-NEXT: v_max_i16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3407; GFX8-NEXT: v_min_i16_e32 v10, 0, v2 3408; GFX8-NEXT: v_min_i16_e32 v4, v4, v9 3409; GFX8-NEXT: v_max_i16_e32 v9, 0, v2 3410; GFX8-NEXT: v_sub_u16_e32 v10, 0x8000, v10 3411; GFX8-NEXT: v_sub_u16_e32 v9, 0x7fff, v9 3412; GFX8-NEXT: v_max_i16_e32 v10, v10, v5 3413; GFX8-NEXT: v_min_i16_e32 v9, v10, v9 3414; GFX8-NEXT: v_max_i16_sdwa v10, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3415; GFX8-NEXT: v_min_i16_sdwa v7, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3416; GFX8-NEXT: v_sub_u16_e32 v7, 0x8000, v7 3417; GFX8-NEXT: v_sub_u16_e32 v10, 0x7fff, v10 3418; GFX8-NEXT: v_max_i16_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3419; GFX8-NEXT: v_min_i16_e32 v5, v5, v10 3420; GFX8-NEXT: v_add_u16_e32 v6, v0, v6 3421; GFX8-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3422; GFX8-NEXT: v_add_u16_e32 v3, v1, v8 3423; GFX8-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3424; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 3425; GFX8-NEXT: v_add_u16_e32 v3, v2, v9 3426; GFX8-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3427; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 3428; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 3429; GFX8-NEXT: s_setpc_b64 s[30:31] 3430; 3431; GFX9-LABEL: v_saddsat_v6i16: 3432; GFX9: ; %bb.0: 3433; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3434; GFX9-NEXT: v_pk_add_i16 v0, v0, v3 clamp 3435; GFX9-NEXT: v_pk_add_i16 v1, v1, v4 clamp 3436; GFX9-NEXT: v_pk_add_i16 v2, v2, v5 clamp 3437; GFX9-NEXT: s_setpc_b64 s[30:31] 3438; 3439; GFX10PLUS-LABEL: v_saddsat_v6i16: 3440; GFX10PLUS: ; %bb.0: 3441; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3442; GFX10PLUS-NEXT: v_pk_add_i16 v0, v0, v3 clamp 3443; GFX10PLUS-NEXT: v_pk_add_i16 v1, v1, v4 clamp 3444; GFX10PLUS-NEXT: v_pk_add_i16 v2, v2, v5 clamp 3445; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 3446 %result = call <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) 3447 %cast = bitcast <6 x i16> %result to <3 x float> 3448 ret <3 x float> %cast 3449} 3450 3451define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) { 3452; GFX6-LABEL: s_saddsat_v6i16: 3453; GFX6: ; %bb.0: 3454; GFX6-NEXT: s_lshl_b32 s0, s0, 16 3455; GFX6-NEXT: s_min_i32 s13, s0, 0 3456; GFX6-NEXT: s_lshl_b32 s6, s6, 16 3457; GFX6-NEXT: s_max_i32 s12, s0, 0 3458; GFX6-NEXT: s_sub_i32 s13, 0x80000000, s13 3459; GFX6-NEXT: s_sub_i32 s12, 0x7fffffff, s12 3460; GFX6-NEXT: s_max_i32 s6, s13, s6 3461; GFX6-NEXT: s_lshl_b32 s1, s1, 16 3462; GFX6-NEXT: s_min_i32 s6, s6, s12 3463; GFX6-NEXT: s_min_i32 s12, s1, 0 3464; GFX6-NEXT: s_add_i32 s0, s0, s6 3465; GFX6-NEXT: s_lshl_b32 s6, s7, 16 3466; GFX6-NEXT: s_max_i32 s7, s1, 0 3467; GFX6-NEXT: s_sub_i32 s12, 0x80000000, s12 3468; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 3469; GFX6-NEXT: s_max_i32 s6, s12, s6 3470; GFX6-NEXT: 
s_min_i32 s6, s6, s7 3471; GFX6-NEXT: s_lshl_b32 s2, s2, 16 3472; GFX6-NEXT: s_add_i32 s1, s1, s6 3473; GFX6-NEXT: s_lshl_b32 s6, s8, 16 3474; GFX6-NEXT: s_min_i32 s8, s2, 0 3475; GFX6-NEXT: s_max_i32 s7, s2, 0 3476; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 3477; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 3478; GFX6-NEXT: s_max_i32 s6, s8, s6 3479; GFX6-NEXT: s_lshl_b32 s3, s3, 16 3480; GFX6-NEXT: s_min_i32 s6, s6, s7 3481; GFX6-NEXT: s_min_i32 s8, s3, 0 3482; GFX6-NEXT: s_add_i32 s2, s2, s6 3483; GFX6-NEXT: s_lshl_b32 s6, s9, 16 3484; GFX6-NEXT: s_max_i32 s7, s3, 0 3485; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 3486; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 3487; GFX6-NEXT: s_max_i32 s6, s8, s6 3488; GFX6-NEXT: s_lshl_b32 s4, s4, 16 3489; GFX6-NEXT: s_min_i32 s6, s6, s7 3490; GFX6-NEXT: s_min_i32 s8, s4, 0 3491; GFX6-NEXT: s_add_i32 s3, s3, s6 3492; GFX6-NEXT: s_lshl_b32 s6, s10, 16 3493; GFX6-NEXT: s_max_i32 s7, s4, 0 3494; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 3495; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 3496; GFX6-NEXT: s_max_i32 s6, s8, s6 3497; GFX6-NEXT: s_lshl_b32 s5, s5, 16 3498; GFX6-NEXT: s_min_i32 s6, s6, s7 3499; GFX6-NEXT: s_min_i32 s8, s5, 0 3500; GFX6-NEXT: s_add_i32 s4, s4, s6 3501; GFX6-NEXT: s_lshl_b32 s6, s11, 16 3502; GFX6-NEXT: s_max_i32 s7, s5, 0 3503; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 3504; GFX6-NEXT: s_ashr_i32 s1, s1, 16 3505; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 3506; GFX6-NEXT: s_max_i32 s6, s8, s6 3507; GFX6-NEXT: s_ashr_i32 s0, s0, 16 3508; GFX6-NEXT: s_min_i32 s6, s6, s7 3509; GFX6-NEXT: s_and_b32 s1, s1, 0xffff 3510; GFX6-NEXT: s_ashr_i32 s2, s2, 16 3511; GFX6-NEXT: s_ashr_i32 s3, s3, 16 3512; GFX6-NEXT: s_add_i32 s5, s5, s6 3513; GFX6-NEXT: s_and_b32 s0, s0, 0xffff 3514; GFX6-NEXT: s_lshl_b32 s1, s1, 16 3515; GFX6-NEXT: s_ashr_i32 s5, s5, 16 3516; GFX6-NEXT: s_or_b32 s0, s0, s1 3517; GFX6-NEXT: s_and_b32 s1, s2, 0xffff 3518; GFX6-NEXT: s_and_b32 s2, s3, 0xffff 3519; GFX6-NEXT: s_ashr_i32 s4, s4, 16 3520; GFX6-NEXT: s_lshl_b32 
s2, s2, 16 3521; GFX6-NEXT: s_and_b32 s3, s5, 0xffff 3522; GFX6-NEXT: s_or_b32 s1, s1, s2 3523; GFX6-NEXT: s_and_b32 s2, s4, 0xffff 3524; GFX6-NEXT: s_lshl_b32 s3, s3, 16 3525; GFX6-NEXT: s_or_b32 s2, s2, s3 3526; GFX6-NEXT: ; return to shader part epilog 3527; 3528; GFX8-LABEL: s_saddsat_v6i16: 3529; GFX8: ; %bb.0: 3530; GFX8-NEXT: s_sext_i32_i16 s12, s0 3531; GFX8-NEXT: s_sext_i32_i16 s13, 0 3532; GFX8-NEXT: s_max_i32 s14, s12, s13 3533; GFX8-NEXT: s_min_i32 s12, s12, s13 3534; GFX8-NEXT: s_sub_i32 s12, 0xffff8000, s12 3535; GFX8-NEXT: s_lshr_b32 s9, s3, 16 3536; GFX8-NEXT: s_sext_i32_i16 s12, s12 3537; GFX8-NEXT: s_sext_i32_i16 s3, s3 3538; GFX8-NEXT: s_sub_i32 s14, 0x7fff, s14 3539; GFX8-NEXT: s_max_i32 s3, s12, s3 3540; GFX8-NEXT: s_sext_i32_i16 s3, s3 3541; GFX8-NEXT: s_sext_i32_i16 s12, s14 3542; GFX8-NEXT: s_lshr_b32 s6, s0, 16 3543; GFX8-NEXT: s_min_i32 s3, s3, s12 3544; GFX8-NEXT: s_add_i32 s0, s0, s3 3545; GFX8-NEXT: s_sext_i32_i16 s3, s6 3546; GFX8-NEXT: s_max_i32 s12, s3, s13 3547; GFX8-NEXT: s_min_i32 s3, s3, s13 3548; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 3549; GFX8-NEXT: s_sext_i32_i16 s3, s3 3550; GFX8-NEXT: s_sext_i32_i16 s9, s9 3551; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12 3552; GFX8-NEXT: s_max_i32 s3, s3, s9 3553; GFX8-NEXT: s_sext_i32_i16 s3, s3 3554; GFX8-NEXT: s_sext_i32_i16 s9, s12 3555; GFX8-NEXT: s_min_i32 s3, s3, s9 3556; GFX8-NEXT: s_add_i32 s6, s6, s3 3557; GFX8-NEXT: s_sext_i32_i16 s3, s1 3558; GFX8-NEXT: s_max_i32 s9, s3, s13 3559; GFX8-NEXT: s_min_i32 s3, s3, s13 3560; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 3561; GFX8-NEXT: s_lshr_b32 s10, s4, 16 3562; GFX8-NEXT: s_sext_i32_i16 s3, s3 3563; GFX8-NEXT: s_sext_i32_i16 s4, s4 3564; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9 3565; GFX8-NEXT: s_max_i32 s3, s3, s4 3566; GFX8-NEXT: s_sext_i32_i16 s3, s3 3567; GFX8-NEXT: s_sext_i32_i16 s4, s9 3568; GFX8-NEXT: s_lshr_b32 s7, s1, 16 3569; GFX8-NEXT: s_min_i32 s3, s3, s4 3570; GFX8-NEXT: s_add_i32 s1, s1, s3 3571; GFX8-NEXT: s_sext_i32_i16 s3, s7 
3572; GFX8-NEXT: s_max_i32 s4, s3, s13 3573; GFX8-NEXT: s_min_i32 s3, s3, s13 3574; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 3575; GFX8-NEXT: s_sext_i32_i16 s3, s3 3576; GFX8-NEXT: s_sext_i32_i16 s9, s10 3577; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 3578; GFX8-NEXT: s_max_i32 s3, s3, s9 3579; GFX8-NEXT: s_sext_i32_i16 s3, s3 3580; GFX8-NEXT: s_sext_i32_i16 s4, s4 3581; GFX8-NEXT: s_min_i32 s3, s3, s4 3582; GFX8-NEXT: s_add_i32 s7, s7, s3 3583; GFX8-NEXT: s_sext_i32_i16 s3, s2 3584; GFX8-NEXT: s_max_i32 s4, s3, s13 3585; GFX8-NEXT: s_min_i32 s3, s3, s13 3586; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 3587; GFX8-NEXT: s_lshr_b32 s11, s5, 16 3588; GFX8-NEXT: s_sext_i32_i16 s3, s3 3589; GFX8-NEXT: s_sext_i32_i16 s5, s5 3590; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 3591; GFX8-NEXT: s_max_i32 s3, s3, s5 3592; GFX8-NEXT: s_sext_i32_i16 s3, s3 3593; GFX8-NEXT: s_sext_i32_i16 s4, s4 3594; GFX8-NEXT: s_lshr_b32 s8, s2, 16 3595; GFX8-NEXT: s_min_i32 s3, s3, s4 3596; GFX8-NEXT: s_add_i32 s2, s2, s3 3597; GFX8-NEXT: s_sext_i32_i16 s3, s8 3598; GFX8-NEXT: s_max_i32 s4, s3, s13 3599; GFX8-NEXT: s_min_i32 s3, s3, s13 3600; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 3601; GFX8-NEXT: s_sext_i32_i16 s3, s3 3602; GFX8-NEXT: s_sext_i32_i16 s5, s11 3603; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 3604; GFX8-NEXT: s_max_i32 s3, s3, s5 3605; GFX8-NEXT: s_sext_i32_i16 s3, s3 3606; GFX8-NEXT: s_sext_i32_i16 s4, s4 3607; GFX8-NEXT: s_min_i32 s3, s3, s4 3608; GFX8-NEXT: s_add_i32 s8, s8, s3 3609; GFX8-NEXT: s_and_b32 s3, 0xffff, s6 3610; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 3611; GFX8-NEXT: s_lshl_b32 s3, s3, 16 3612; GFX8-NEXT: s_or_b32 s0, s0, s3 3613; GFX8-NEXT: s_and_b32 s3, 0xffff, s7 3614; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 3615; GFX8-NEXT: s_lshl_b32 s3, s3, 16 3616; GFX8-NEXT: s_or_b32 s1, s1, s3 3617; GFX8-NEXT: s_and_b32 s3, 0xffff, s8 3618; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 3619; GFX8-NEXT: s_lshl_b32 s3, s3, 16 3620; GFX8-NEXT: s_or_b32 s2, s2, s3 3621; GFX8-NEXT: ; return to shader part epilog 
3622; 3623; GFX9-LABEL: s_saddsat_v6i16: 3624; GFX9: ; %bb.0: 3625; GFX9-NEXT: v_mov_b32_e32 v0, s3 3626; GFX9-NEXT: v_mov_b32_e32 v1, s4 3627; GFX9-NEXT: v_mov_b32_e32 v2, s5 3628; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp 3629; GFX9-NEXT: v_pk_add_i16 v1, s1, v1 clamp 3630; GFX9-NEXT: v_pk_add_i16 v2, s2, v2 clamp 3631; GFX9-NEXT: v_readfirstlane_b32 s0, v0 3632; GFX9-NEXT: v_readfirstlane_b32 s1, v1 3633; GFX9-NEXT: v_readfirstlane_b32 s2, v2 3634; GFX9-NEXT: ; return to shader part epilog 3635; 3636; GFX10PLUS-LABEL: s_saddsat_v6i16: 3637; GFX10PLUS: ; %bb.0: 3638; GFX10PLUS-NEXT: v_pk_add_i16 v0, s0, s3 clamp 3639; GFX10PLUS-NEXT: v_pk_add_i16 v1, s1, s4 clamp 3640; GFX10PLUS-NEXT: v_pk_add_i16 v2, s2, s5 clamp 3641; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 3642; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 3643; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 3644; GFX10PLUS-NEXT: ; return to shader part epilog 3645 %result = call <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) 3646 %cast = bitcast <6 x i16> %result to <3 x i32> 3647 ret <3 x i32> %cast 3648} 3649 3650define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { 3651; GFX6-LABEL: v_saddsat_v8i16: 3652; GFX6: ; %bb.0: 3653; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3654; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 3655; GFX6-NEXT: v_min_i32_e32 v18, 0, v0 3656; GFX6-NEXT: v_bfrev_b32_e32 v19, 1 3657; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 3658; GFX6-NEXT: v_max_i32_e32 v16, 0, v0 3659; GFX6-NEXT: v_bfrev_b32_e32 v17, -2 3660; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v19, v18 3661; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v17, v16 3662; GFX6-NEXT: v_max_i32_e32 v8, v18, v8 3663; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3664; GFX6-NEXT: v_min_i32_e32 v8, v8, v16 3665; GFX6-NEXT: v_min_i32_e32 v16, 0, v1 3666; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v8 3667; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 3668; GFX6-NEXT: v_max_i32_e32 v9, 0, v1 3669; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v19, v16 
3670; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 3671; GFX6-NEXT: v_max_i32_e32 v8, v16, v8 3672; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 3673; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3674; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v8 3675; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 3676; GFX6-NEXT: v_min_i32_e32 v10, 0, v2 3677; GFX6-NEXT: v_max_i32_e32 v9, 0, v2 3678; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 3679; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 3680; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 3681; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3682; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 3683; GFX6-NEXT: v_min_i32_e32 v10, 0, v3 3684; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 3685; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 3686; GFX6-NEXT: v_max_i32_e32 v9, 0, v3 3687; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 3688; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 3689; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 3690; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 3691; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 3692; GFX6-NEXT: v_min_i32_e32 v10, 0, v4 3693; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v8 3694; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 3695; GFX6-NEXT: v_max_i32_e32 v9, 0, v4 3696; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 3697; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 3698; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 3699; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 3700; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 3701; GFX6-NEXT: v_min_i32_e32 v10, 0, v5 3702; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 3703; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 3704; GFX6-NEXT: v_max_i32_e32 v9, 0, v5 3705; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 3706; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 3707; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 3708; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 3709; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 3710; GFX6-NEXT: v_min_i32_e32 v10, 0, v6 3711; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 3712; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 3713; GFX6-NEXT: v_max_i32_e32 v9, 0, v6 3714; GFX6-NEXT: 
v_sub_i32_e32 v10, vcc, v19, v10 3715; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 3716; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 3717; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 3718; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 3719; GFX6-NEXT: v_min_i32_e32 v10, 0, v7 3720; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 3721; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 3722; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 3723; GFX6-NEXT: v_max_i32_e32 v9, 0, v7 3724; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 3725; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 3726; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 3727; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 3728; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 3729; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 3730; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 3731; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 3732; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 3733; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3734; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 3735; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8 3736; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 3737; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 3738; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 3739; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 3740; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v7 3741; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3742; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 3743; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 3744; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3745; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 3746; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3747; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 3748; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 3749; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 3750; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 3751; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 3752; GFX6-NEXT: s_setpc_b64 s[30:31] 3753; 3754; GFX8-LABEL: v_saddsat_v8i16: 3755; GFX8: ; %bb.0: 3756; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3757; GFX8-NEXT: v_min_i16_e32 v9, 0, v0 3758; GFX8-NEXT: v_max_i16_e32 v8, 0, v0 3759; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9 3760; 
GFX8-NEXT: v_sub_u16_e32 v8, 0x7fff, v8 3761; GFX8-NEXT: v_max_i16_e32 v9, v9, v4 3762; GFX8-NEXT: v_min_i16_e32 v8, v9, v8 3763; GFX8-NEXT: v_mov_b32_e32 v9, 0 3764; GFX8-NEXT: v_min_i16_sdwa v11, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3765; GFX8-NEXT: v_max_i16_sdwa v10, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3766; GFX8-NEXT: v_sub_u16_e32 v11, 0x8000, v11 3767; GFX8-NEXT: v_sub_u16_e32 v10, 0x7fff, v10 3768; GFX8-NEXT: v_max_i16_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3769; GFX8-NEXT: v_min_i16_e32 v11, 0, v1 3770; GFX8-NEXT: v_min_i16_e32 v4, v4, v10 3771; GFX8-NEXT: v_max_i16_e32 v10, 0, v1 3772; GFX8-NEXT: v_sub_u16_e32 v11, 0x8000, v11 3773; GFX8-NEXT: v_sub_u16_e32 v10, 0x7fff, v10 3774; GFX8-NEXT: v_max_i16_e32 v11, v11, v5 3775; GFX8-NEXT: v_min_i16_sdwa v12, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3776; GFX8-NEXT: v_min_i16_e32 v10, v11, v10 3777; GFX8-NEXT: v_max_i16_sdwa v11, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3778; GFX8-NEXT: v_sub_u16_e32 v12, 0x8000, v12 3779; GFX8-NEXT: v_sub_u16_e32 v11, 0x7fff, v11 3780; GFX8-NEXT: v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3781; GFX8-NEXT: v_min_i16_e32 v12, 0, v2 3782; GFX8-NEXT: v_min_i16_e32 v5, v5, v11 3783; GFX8-NEXT: v_max_i16_e32 v11, 0, v2 3784; GFX8-NEXT: v_sub_u16_e32 v12, 0x8000, v12 3785; GFX8-NEXT: v_sub_u16_e32 v11, 0x7fff, v11 3786; GFX8-NEXT: v_max_i16_e32 v12, v12, v6 3787; GFX8-NEXT: v_min_i16_sdwa v13, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3788; GFX8-NEXT: v_min_i16_e32 v11, v12, v11 3789; GFX8-NEXT: v_max_i16_sdwa v12, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3790; GFX8-NEXT: v_sub_u16_e32 v13, 0x8000, v13 3791; GFX8-NEXT: v_sub_u16_e32 v12, 0x7fff, v12 3792; GFX8-NEXT: v_max_i16_sdwa v6, 
v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3793; GFX8-NEXT: v_min_i16_e32 v13, 0, v3 3794; GFX8-NEXT: v_min_i16_e32 v6, v6, v12 3795; GFX8-NEXT: v_max_i16_e32 v12, 0, v3 3796; GFX8-NEXT: v_sub_u16_e32 v13, 0x8000, v13 3797; GFX8-NEXT: v_sub_u16_e32 v12, 0x7fff, v12 3798; GFX8-NEXT: v_max_i16_e32 v13, v13, v7 3799; GFX8-NEXT: v_min_i16_e32 v12, v13, v12 3800; GFX8-NEXT: v_max_i16_sdwa v13, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3801; GFX8-NEXT: v_min_i16_sdwa v9, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3802; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9 3803; GFX8-NEXT: v_sub_u16_e32 v13, 0x7fff, v13 3804; GFX8-NEXT: v_max_i16_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3805; GFX8-NEXT: v_add_u16_e32 v8, v0, v8 3806; GFX8-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3807; GFX8-NEXT: v_add_u16_e32 v4, v1, v10 3808; GFX8-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3809; GFX8-NEXT: v_min_i16_e32 v7, v7, v13 3810; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 3811; GFX8-NEXT: v_add_u16_e32 v4, v2, v11 3812; GFX8-NEXT: v_add_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3813; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 3814; GFX8-NEXT: v_add_u16_e32 v4, v3, v12 3815; GFX8-NEXT: v_add_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3816; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 3817; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 3818; GFX8-NEXT: s_setpc_b64 s[30:31] 3819; 3820; GFX9-LABEL: v_saddsat_v8i16: 3821; GFX9: ; %bb.0: 3822; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3823; GFX9-NEXT: v_pk_add_i16 v0, v0, v4 clamp 3824; GFX9-NEXT: v_pk_add_i16 v1, v1, v5 clamp 3825; GFX9-NEXT: v_pk_add_i16 v2, v2, v6 clamp 3826; GFX9-NEXT: v_pk_add_i16 v3, v3, v7 clamp 3827; GFX9-NEXT: 
s_setpc_b64 s[30:31] 3828; 3829; GFX10PLUS-LABEL: v_saddsat_v8i16: 3830; GFX10PLUS: ; %bb.0: 3831; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3832; GFX10PLUS-NEXT: v_pk_add_i16 v0, v0, v4 clamp 3833; GFX10PLUS-NEXT: v_pk_add_i16 v1, v1, v5 clamp 3834; GFX10PLUS-NEXT: v_pk_add_i16 v2, v2, v6 clamp 3835; GFX10PLUS-NEXT: v_pk_add_i16 v3, v3, v7 clamp 3836; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 3837 %result = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) 3838 %cast = bitcast <8 x i16> %result to <4 x float> 3839 ret <4 x float> %cast 3840} 3841 3842define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) { 3843; GFX6-LABEL: s_saddsat_v8i16: 3844; GFX6: ; %bb.0: 3845; GFX6-NEXT: s_lshl_b32 s0, s0, 16 3846; GFX6-NEXT: s_min_i32 s17, s0, 0 3847; GFX6-NEXT: s_lshl_b32 s8, s8, 16 3848; GFX6-NEXT: s_max_i32 s16, s0, 0 3849; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 3850; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 3851; GFX6-NEXT: s_max_i32 s8, s17, s8 3852; GFX6-NEXT: s_lshl_b32 s1, s1, 16 3853; GFX6-NEXT: s_min_i32 s8, s8, s16 3854; GFX6-NEXT: s_min_i32 s16, s1, 0 3855; GFX6-NEXT: s_add_i32 s0, s0, s8 3856; GFX6-NEXT: s_lshl_b32 s8, s9, 16 3857; GFX6-NEXT: s_max_i32 s9, s1, 0 3858; GFX6-NEXT: s_sub_i32 s16, 0x80000000, s16 3859; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 3860; GFX6-NEXT: s_max_i32 s8, s16, s8 3861; GFX6-NEXT: s_min_i32 s8, s8, s9 3862; GFX6-NEXT: s_lshl_b32 s2, s2, 16 3863; GFX6-NEXT: s_add_i32 s1, s1, s8 3864; GFX6-NEXT: s_lshl_b32 s8, s10, 16 3865; GFX6-NEXT: s_min_i32 s10, s2, 0 3866; GFX6-NEXT: s_max_i32 s9, s2, 0 3867; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 3868; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 3869; GFX6-NEXT: s_max_i32 s8, s10, s8 3870; GFX6-NEXT: s_lshl_b32 s3, s3, 16 3871; GFX6-NEXT: s_min_i32 s8, s8, s9 3872; GFX6-NEXT: s_min_i32 s10, s3, 0 3873; GFX6-NEXT: s_add_i32 s2, s2, s8 3874; GFX6-NEXT: s_lshl_b32 s8, s11, 16 3875; GFX6-NEXT: s_max_i32 s9, s3, 0 3876; GFX6-NEXT: 
s_sub_i32 s10, 0x80000000, s10 3877; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 3878; GFX6-NEXT: s_max_i32 s8, s10, s8 3879; GFX6-NEXT: s_lshl_b32 s4, s4, 16 3880; GFX6-NEXT: s_min_i32 s8, s8, s9 3881; GFX6-NEXT: s_min_i32 s10, s4, 0 3882; GFX6-NEXT: s_add_i32 s3, s3, s8 3883; GFX6-NEXT: s_lshl_b32 s8, s12, 16 3884; GFX6-NEXT: s_max_i32 s9, s4, 0 3885; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 3886; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 3887; GFX6-NEXT: s_max_i32 s8, s10, s8 3888; GFX6-NEXT: s_lshl_b32 s5, s5, 16 3889; GFX6-NEXT: s_min_i32 s8, s8, s9 3890; GFX6-NEXT: s_min_i32 s10, s5, 0 3891; GFX6-NEXT: s_add_i32 s4, s4, s8 3892; GFX6-NEXT: s_lshl_b32 s8, s13, 16 3893; GFX6-NEXT: s_max_i32 s9, s5, 0 3894; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 3895; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 3896; GFX6-NEXT: s_max_i32 s8, s10, s8 3897; GFX6-NEXT: s_lshl_b32 s6, s6, 16 3898; GFX6-NEXT: s_min_i32 s8, s8, s9 3899; GFX6-NEXT: s_min_i32 s10, s6, 0 3900; GFX6-NEXT: s_add_i32 s5, s5, s8 3901; GFX6-NEXT: s_lshl_b32 s8, s14, 16 3902; GFX6-NEXT: s_max_i32 s9, s6, 0 3903; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 3904; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 3905; GFX6-NEXT: s_max_i32 s8, s10, s8 3906; GFX6-NEXT: s_lshl_b32 s7, s7, 16 3907; GFX6-NEXT: s_min_i32 s8, s8, s9 3908; GFX6-NEXT: s_min_i32 s10, s7, 0 3909; GFX6-NEXT: s_ashr_i32 s1, s1, 16 3910; GFX6-NEXT: s_add_i32 s6, s6, s8 3911; GFX6-NEXT: s_lshl_b32 s8, s15, 16 3912; GFX6-NEXT: s_max_i32 s9, s7, 0 3913; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 3914; GFX6-NEXT: s_ashr_i32 s0, s0, 16 3915; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 3916; GFX6-NEXT: s_max_i32 s8, s10, s8 3917; GFX6-NEXT: s_and_b32 s1, s1, 0xffff 3918; GFX6-NEXT: s_ashr_i32 s2, s2, 16 3919; GFX6-NEXT: s_ashr_i32 s3, s3, 16 3920; GFX6-NEXT: s_min_i32 s8, s8, s9 3921; GFX6-NEXT: s_and_b32 s0, s0, 0xffff 3922; GFX6-NEXT: s_lshl_b32 s1, s1, 16 3923; GFX6-NEXT: s_ashr_i32 s5, s5, 16 3924; GFX6-NEXT: s_add_i32 s7, s7, s8 3925; GFX6-NEXT: s_or_b32 s0, s0, s1 
3926; GFX6-NEXT: s_and_b32 s1, s2, 0xffff 3927; GFX6-NEXT: s_and_b32 s2, s3, 0xffff 3928; GFX6-NEXT: s_ashr_i32 s4, s4, 16 3929; GFX6-NEXT: s_ashr_i32 s7, s7, 16 3930; GFX6-NEXT: s_lshl_b32 s2, s2, 16 3931; GFX6-NEXT: s_and_b32 s3, s5, 0xffff 3932; GFX6-NEXT: s_ashr_i32 s6, s6, 16 3933; GFX6-NEXT: s_or_b32 s1, s1, s2 3934; GFX6-NEXT: s_and_b32 s2, s4, 0xffff 3935; GFX6-NEXT: s_lshl_b32 s3, s3, 16 3936; GFX6-NEXT: s_and_b32 s4, s7, 0xffff 3937; GFX6-NEXT: s_or_b32 s2, s2, s3 3938; GFX6-NEXT: s_and_b32 s3, s6, 0xffff 3939; GFX6-NEXT: s_lshl_b32 s4, s4, 16 3940; GFX6-NEXT: s_or_b32 s3, s3, s4 3941; GFX6-NEXT: ; return to shader part epilog 3942; 3943; GFX8-LABEL: s_saddsat_v8i16: 3944; GFX8: ; %bb.0: 3945; GFX8-NEXT: s_sext_i32_i16 s16, s0 3946; GFX8-NEXT: s_sext_i32_i16 s17, 0 3947; GFX8-NEXT: s_max_i32 s18, s16, s17 3948; GFX8-NEXT: s_min_i32 s16, s16, s17 3949; GFX8-NEXT: s_sub_i32 s16, 0xffff8000, s16 3950; GFX8-NEXT: s_lshr_b32 s12, s4, 16 3951; GFX8-NEXT: s_sext_i32_i16 s16, s16 3952; GFX8-NEXT: s_sext_i32_i16 s4, s4 3953; GFX8-NEXT: s_sub_i32 s18, 0x7fff, s18 3954; GFX8-NEXT: s_max_i32 s4, s16, s4 3955; GFX8-NEXT: s_sext_i32_i16 s4, s4 3956; GFX8-NEXT: s_sext_i32_i16 s16, s18 3957; GFX8-NEXT: s_lshr_b32 s8, s0, 16 3958; GFX8-NEXT: s_min_i32 s4, s4, s16 3959; GFX8-NEXT: s_add_i32 s0, s0, s4 3960; GFX8-NEXT: s_sext_i32_i16 s4, s8 3961; GFX8-NEXT: s_max_i32 s16, s4, s17 3962; GFX8-NEXT: s_min_i32 s4, s4, s17 3963; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 3964; GFX8-NEXT: s_sext_i32_i16 s4, s4 3965; GFX8-NEXT: s_sext_i32_i16 s12, s12 3966; GFX8-NEXT: s_sub_i32 s16, 0x7fff, s16 3967; GFX8-NEXT: s_max_i32 s4, s4, s12 3968; GFX8-NEXT: s_sext_i32_i16 s4, s4 3969; GFX8-NEXT: s_sext_i32_i16 s12, s16 3970; GFX8-NEXT: s_min_i32 s4, s4, s12 3971; GFX8-NEXT: s_add_i32 s8, s8, s4 3972; GFX8-NEXT: s_sext_i32_i16 s4, s1 3973; GFX8-NEXT: s_max_i32 s12, s4, s17 3974; GFX8-NEXT: s_min_i32 s4, s4, s17 3975; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 3976; GFX8-NEXT: s_lshr_b32 s13, s5, 
16 3977; GFX8-NEXT: s_sext_i32_i16 s4, s4 3978; GFX8-NEXT: s_sext_i32_i16 s5, s5 3979; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12 3980; GFX8-NEXT: s_max_i32 s4, s4, s5 3981; GFX8-NEXT: s_sext_i32_i16 s4, s4 3982; GFX8-NEXT: s_sext_i32_i16 s5, s12 3983; GFX8-NEXT: s_lshr_b32 s9, s1, 16 3984; GFX8-NEXT: s_min_i32 s4, s4, s5 3985; GFX8-NEXT: s_add_i32 s1, s1, s4 3986; GFX8-NEXT: s_sext_i32_i16 s4, s9 3987; GFX8-NEXT: s_max_i32 s5, s4, s17 3988; GFX8-NEXT: s_min_i32 s4, s4, s17 3989; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 3990; GFX8-NEXT: s_sext_i32_i16 s4, s4 3991; GFX8-NEXT: s_sext_i32_i16 s12, s13 3992; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 3993; GFX8-NEXT: s_max_i32 s4, s4, s12 3994; GFX8-NEXT: s_sext_i32_i16 s4, s4 3995; GFX8-NEXT: s_sext_i32_i16 s5, s5 3996; GFX8-NEXT: s_min_i32 s4, s4, s5 3997; GFX8-NEXT: s_add_i32 s9, s9, s4 3998; GFX8-NEXT: s_sext_i32_i16 s4, s2 3999; GFX8-NEXT: s_max_i32 s5, s4, s17 4000; GFX8-NEXT: s_min_i32 s4, s4, s17 4001; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 4002; GFX8-NEXT: s_lshr_b32 s14, s6, 16 4003; GFX8-NEXT: s_sext_i32_i16 s4, s4 4004; GFX8-NEXT: s_sext_i32_i16 s6, s6 4005; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 4006; GFX8-NEXT: s_max_i32 s4, s4, s6 4007; GFX8-NEXT: s_sext_i32_i16 s4, s4 4008; GFX8-NEXT: s_sext_i32_i16 s5, s5 4009; GFX8-NEXT: s_lshr_b32 s10, s2, 16 4010; GFX8-NEXT: s_min_i32 s4, s4, s5 4011; GFX8-NEXT: s_add_i32 s2, s2, s4 4012; GFX8-NEXT: s_sext_i32_i16 s4, s10 4013; GFX8-NEXT: s_max_i32 s5, s4, s17 4014; GFX8-NEXT: s_min_i32 s4, s4, s17 4015; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 4016; GFX8-NEXT: s_sext_i32_i16 s4, s4 4017; GFX8-NEXT: s_sext_i32_i16 s6, s14 4018; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 4019; GFX8-NEXT: s_max_i32 s4, s4, s6 4020; GFX8-NEXT: s_sext_i32_i16 s4, s4 4021; GFX8-NEXT: s_sext_i32_i16 s5, s5 4022; GFX8-NEXT: s_min_i32 s4, s4, s5 4023; GFX8-NEXT: s_add_i32 s10, s10, s4 4024; GFX8-NEXT: s_sext_i32_i16 s4, s3 4025; GFX8-NEXT: s_max_i32 s5, s4, s17 4026; GFX8-NEXT: s_min_i32 s4, s4, s17 4027; 
GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 4028; GFX8-NEXT: s_sext_i32_i16 s4, s4 4029; GFX8-NEXT: s_sext_i32_i16 s6, s7 4030; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 4031; GFX8-NEXT: s_max_i32 s4, s4, s6 4032; GFX8-NEXT: s_sext_i32_i16 s4, s4 4033; GFX8-NEXT: s_sext_i32_i16 s5, s5 4034; GFX8-NEXT: s_lshr_b32 s11, s3, 16 4035; GFX8-NEXT: s_min_i32 s4, s4, s5 4036; GFX8-NEXT: s_add_i32 s3, s3, s4 4037; GFX8-NEXT: s_sext_i32_i16 s4, s11 4038; GFX8-NEXT: s_max_i32 s5, s4, s17 4039; GFX8-NEXT: s_min_i32 s4, s4, s17 4040; GFX8-NEXT: s_lshr_b32 s15, s7, 16 4041; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 4042; GFX8-NEXT: s_sext_i32_i16 s4, s4 4043; GFX8-NEXT: s_sext_i32_i16 s6, s15 4044; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 4045; GFX8-NEXT: s_max_i32 s4, s4, s6 4046; GFX8-NEXT: s_sext_i32_i16 s4, s4 4047; GFX8-NEXT: s_sext_i32_i16 s5, s5 4048; GFX8-NEXT: s_min_i32 s4, s4, s5 4049; GFX8-NEXT: s_add_i32 s11, s11, s4 4050; GFX8-NEXT: s_and_b32 s4, 0xffff, s8 4051; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 4052; GFX8-NEXT: s_lshl_b32 s4, s4, 16 4053; GFX8-NEXT: s_or_b32 s0, s0, s4 4054; GFX8-NEXT: s_and_b32 s4, 0xffff, s9 4055; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 4056; GFX8-NEXT: s_lshl_b32 s4, s4, 16 4057; GFX8-NEXT: s_or_b32 s1, s1, s4 4058; GFX8-NEXT: s_and_b32 s4, 0xffff, s10 4059; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 4060; GFX8-NEXT: s_lshl_b32 s4, s4, 16 4061; GFX8-NEXT: s_or_b32 s2, s2, s4 4062; GFX8-NEXT: s_and_b32 s4, 0xffff, s11 4063; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 4064; GFX8-NEXT: s_lshl_b32 s4, s4, 16 4065; GFX8-NEXT: s_or_b32 s3, s3, s4 4066; GFX8-NEXT: ; return to shader part epilog 4067; 4068; GFX9-LABEL: s_saddsat_v8i16: 4069; GFX9: ; %bb.0: 4070; GFX9-NEXT: v_mov_b32_e32 v0, s4 4071; GFX9-NEXT: v_mov_b32_e32 v1, s5 4072; GFX9-NEXT: v_mov_b32_e32 v2, s6 4073; GFX9-NEXT: v_mov_b32_e32 v3, s7 4074; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp 4075; GFX9-NEXT: v_pk_add_i16 v1, s1, v1 clamp 4076; GFX9-NEXT: v_pk_add_i16 v2, s2, v2 clamp 4077; GFX9-NEXT: v_pk_add_i16 v3, s3, v3 
clamp 4078; GFX9-NEXT: v_readfirstlane_b32 s0, v0 4079; GFX9-NEXT: v_readfirstlane_b32 s1, v1 4080; GFX9-NEXT: v_readfirstlane_b32 s2, v2 4081; GFX9-NEXT: v_readfirstlane_b32 s3, v3 4082; GFX9-NEXT: ; return to shader part epilog 4083; 4084; GFX10PLUS-LABEL: s_saddsat_v8i16: 4085; GFX10PLUS: ; %bb.0: 4086; GFX10PLUS-NEXT: v_pk_add_i16 v0, s0, s4 clamp 4087; GFX10PLUS-NEXT: v_pk_add_i16 v1, s1, s5 clamp 4088; GFX10PLUS-NEXT: v_pk_add_i16 v2, s2, s6 clamp 4089; GFX10PLUS-NEXT: v_pk_add_i16 v3, s3, s7 clamp 4090; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 4091; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 4092; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 4093; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 4094; GFX10PLUS-NEXT: ; return to shader part epilog 4095 %result = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) 4096 %cast = bitcast <8 x i16> %result to <4 x i32> 4097 ret <4 x i32> %cast 4098} 4099 4100define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) { 4101; GFX6-LABEL: v_saddsat_i48: 4102; GFX6: ; %bb.0: 4103; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4104; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2 4105; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v1, v3, vcc 4106; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 16 4107; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 4108; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 4109; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] 4110; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] 4111; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 4112; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffff8000, v0 4113; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v5 4114; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 4115; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc 4116; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc 4117; GFX6-NEXT: s_setpc_b64 s[30:31] 4118; 4119; GFX8-LABEL: v_saddsat_i48: 4120; GFX8: ; %bb.0: 4121; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4122; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 4123; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v1, v3, vcc 4124; GFX8-NEXT: 
v_bfe_i32 v5, v4, 0, 16 4125; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16 4126; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 4127; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] 4128; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] 4129; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 4130; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffff8000, v0 4131; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v5 4132; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 4133; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc 4134; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc 4135; GFX8-NEXT: s_setpc_b64 s[30:31] 4136; 4137; GFX9-LABEL: v_saddsat_i48: 4138; GFX9: ; %bb.0: 4139; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4140; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 4141; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] 4142; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 4143; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc 4144; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] 4145; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] 4146; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 4147; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 4148; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc 4149; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 4150; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 4151; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4152; GFX9-NEXT: s_setpc_b64 s[30:31] 4153; 4154; GFX10-LABEL: v_saddsat_i48: 4155; GFX10: ; %bb.0: 4156; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4157; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 4158; GFX10-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] 4159; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 4160; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo 4161; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] 4162; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 4163; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] 4164; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 4165; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 4166; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo 4167; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, 
v1, vcc_lo 4168; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4169; GFX10-NEXT: s_setpc_b64 s[30:31] 4170; 4171; GFX11-LABEL: v_saddsat_i48: 4172; GFX11: ; %bb.0: 4173; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4174; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 4175; GFX11-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] 4176; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 4177; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo 4178; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] 4179; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 4180; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] 4181; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 4182; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 4183; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 4184; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4185; GFX11-NEXT: s_setpc_b64 s[30:31] 4186 %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs) 4187 ret i48 %result 4188} 4189 4190define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { 4191; GFX6-LABEL: s_saddsat_i48: 4192; GFX6: ; %bb.0: 4193; GFX6-NEXT: s_add_u32 s4, s0, s2 4194; GFX6-NEXT: s_addc_u32 s3, s1, s3 4195; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 4196; GFX6-NEXT: v_mov_b32_e32 v0, s0 4197; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 4198; GFX6-NEXT: v_mov_b32_e32 v1, s1 4199; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 4200; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] 4201; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 4202; GFX6-NEXT: s_ashr_i32 s2, s7, 31 4203; GFX6-NEXT: s_ashr_i32 s5, s7, 15 4204; GFX6-NEXT: s_addk_i32 s2, 0x8000 4205; GFX6-NEXT: v_mov_b32_e32 v0, s5 4206; GFX6-NEXT: v_mov_b32_e32 v1, s2 4207; GFX6-NEXT: v_mov_b32_e32 v2, s4 4208; GFX6-NEXT: v_mov_b32_e32 v3, s3 4209; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc 4210; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4211; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4212; GFX6-NEXT: v_readfirstlane_b32 s0, v0 4213; GFX6-NEXT: v_readfirstlane_b32 
s1, v1 4214; GFX6-NEXT: ; return to shader part epilog 4215; 4216; GFX8-LABEL: s_saddsat_i48: 4217; GFX8: ; %bb.0: 4218; GFX8-NEXT: s_add_u32 s4, s0, s2 4219; GFX8-NEXT: s_addc_u32 s3, s1, s3 4220; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 4221; GFX8-NEXT: v_mov_b32_e32 v0, s0 4222; GFX8-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 4223; GFX8-NEXT: v_mov_b32_e32 v1, s1 4224; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 4225; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] 4226; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 4227; GFX8-NEXT: s_ashr_i32 s2, s7, 31 4228; GFX8-NEXT: s_ashr_i32 s5, s7, 15 4229; GFX8-NEXT: s_addk_i32 s2, 0x8000 4230; GFX8-NEXT: v_mov_b32_e32 v0, s5 4231; GFX8-NEXT: v_mov_b32_e32 v1, s2 4232; GFX8-NEXT: v_mov_b32_e32 v2, s4 4233; GFX8-NEXT: v_mov_b32_e32 v3, s3 4234; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc 4235; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4236; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4237; GFX8-NEXT: v_readfirstlane_b32 s0, v0 4238; GFX8-NEXT: v_readfirstlane_b32 s1, v1 4239; GFX8-NEXT: ; return to shader part epilog 4240; 4241; GFX9-LABEL: s_saddsat_i48: 4242; GFX9: ; %bb.0: 4243; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 4244; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 4245; GFX9-NEXT: s_add_u32 s4, s0, s2 4246; GFX9-NEXT: v_mov_b32_e32 v0, s0 4247; GFX9-NEXT: s_addc_u32 s5, s1, s3 4248; GFX9-NEXT: v_mov_b32_e32 v1, s1 4249; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4250; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 4251; GFX9-NEXT: s_ashr_i32 s2, s5, 31 4252; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 4253; GFX9-NEXT: v_mov_b32_e32 v0, s2 4254; GFX9-NEXT: v_mov_b32_e32 v1, s3 4255; GFX9-NEXT: v_mov_b32_e32 v2, s4 4256; GFX9-NEXT: v_mov_b32_e32 v3, s5 4257; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc 4258; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4259; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4260; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4261; GFX9-NEXT: v_readfirstlane_b32 s0, v0 4262; GFX9-NEXT: 
v_readfirstlane_b32 s1, v1 4263; GFX9-NEXT: ; return to shader part epilog 4264; 4265; GFX10-LABEL: s_saddsat_i48: 4266; GFX10: ; %bb.0: 4267; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 4268; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 4269; GFX10-NEXT: s_add_u32 s4, s0, s2 4270; GFX10-NEXT: s_addc_u32 s5, s1, s3 4271; GFX10-NEXT: v_mov_b32_e32 v0, s4 4272; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] 4273; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 4274; GFX10-NEXT: v_mov_b32_e32 v1, s5 4275; GFX10-NEXT: s_ashr_i32 s2, s5, 31 4276; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 4277; GFX10-NEXT: s_xor_b32 s0, s1, s0 4278; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 4279; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 4280; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4281; GFX10-NEXT: v_readfirstlane_b32 s0, v0 4282; GFX10-NEXT: v_readfirstlane_b32 s1, v1 4283; GFX10-NEXT: ; return to shader part epilog 4284; 4285; GFX11-LABEL: s_saddsat_i48: 4286; GFX11: ; %bb.0: 4287; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 4288; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 4289; GFX11-NEXT: s_add_u32 s4, s0, s2 4290; GFX11-NEXT: s_addc_u32 s5, s1, s3 4291; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 4292; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] 4293; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 4294; GFX11-NEXT: s_ashr_i32 s2, s5, 31 4295; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 4296; GFX11-NEXT: s_xor_b32 s0, s1, s0 4297; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 4298; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 4299; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4300; GFX11-NEXT: v_readfirstlane_b32 s0, v0 4301; GFX11-NEXT: v_readfirstlane_b32 s1, v1 4302; GFX11-NEXT: ; return to shader part epilog 4303 %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs) 4304 ret i48 %result 4305} 4306 4307define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { 4308; GFX6-LABEL: saddsat_i48_sv: 4309; GFX6: ; %bb.0: 4310; GFX6-NEXT: v_mov_b32_e32 v3, s1 
4311; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0 4312; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v3, v1, vcc 4313; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 4314; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 4315; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 4316; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] 4317; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] 4318; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4319; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3 4320; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffff8000, v0 4321; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] 4322; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 4323; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 4324; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 4325; GFX6-NEXT: ; return to shader part epilog 4326; 4327; GFX8-LABEL: saddsat_i48_sv: 4328; GFX8: ; %bb.0: 4329; GFX8-NEXT: v_mov_b32_e32 v3, s1 4330; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 4331; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v3, v1, vcc 4332; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 4333; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 4334; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16 4335; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] 4336; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] 4337; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4338; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3 4339; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffff8000, v0 4340; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] 4341; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 4342; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 4343; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 4344; GFX8-NEXT: ; return to shader part epilog 4345; 4346; GFX9-LABEL: saddsat_i48_sv: 4347; GFX9: ; %bb.0: 4348; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 4349; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 4350; GFX9-NEXT: v_mov_b32_e32 v3, s1 4351; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 4352; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc 4353; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] 4354; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] 4355; GFX9-NEXT: 
v_ashrrev_i32_e32 v0, 31, v3 4356; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 4357; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc 4358; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4359; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4360; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4361; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 4362; GFX9-NEXT: ; return to shader part epilog 4363; 4364; GFX10-LABEL: saddsat_i48_sv: 4365; GFX10: ; %bb.0: 4366; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 4367; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 4368; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0 4369; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 4370; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 4371; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] 4372; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] 4373; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 4374; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4375; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo 4376; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 4377; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4378; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 4379; GFX10-NEXT: ; return to shader part epilog 4380; 4381; GFX11-LABEL: saddsat_i48_sv: 4382; GFX11: ; %bb.0: 4383; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 4384; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 4385; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0 4386; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 4387; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 4388; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] 4389; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] 4390; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 4391; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4392; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 4393; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4394; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 4395; GFX11-NEXT: ; return to shader part epilog 4396 %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs) 4397 %ext.result = 
zext i48 %result to i64 4398 %cast = bitcast i64 %ext.result to <2 x float> 4399 ret <2 x float> %cast 4400} 4401 4402define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { 4403; GFX6-LABEL: saddsat_i48_vs: 4404; GFX6: ; %bb.0: 4405; GFX6-NEXT: v_mov_b32_e32 v3, s1 4406; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0 4407; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v1, v3, vcc 4408; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 4409; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 4410; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 4411; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] 4412; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 4413; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4414; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3 4415; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffff8000, v0 4416; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] 4417; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 4418; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 4419; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 4420; GFX6-NEXT: ; return to shader part epilog 4421; 4422; GFX8-LABEL: saddsat_i48_vs: 4423; GFX8: ; %bb.0: 4424; GFX8-NEXT: v_mov_b32_e32 v3, s1 4425; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 4426; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v1, v3, vcc 4427; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 4428; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16 4429; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 4430; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] 4431; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 4432; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4433; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3 4434; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffff8000, v0 4435; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] 4436; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 4437; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 4438; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 4439; GFX8-NEXT: ; return to shader part epilog 4440; 4441; GFX9-LABEL: saddsat_i48_vs: 4442; GFX9: ; %bb.0: 4443; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 4444; GFX9-NEXT: s_lshl_b64 s[0:1], 
s[0:1], 16 4445; GFX9-NEXT: v_mov_b32_e32 v3, s1 4446; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 4447; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc 4448; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] 4449; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 4450; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4451; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 4452; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc 4453; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4454; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4455; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4456; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 4457; GFX9-NEXT: ; return to shader part epilog 4458; 4459; GFX10-LABEL: saddsat_i48_vs: 4460; GFX10: ; %bb.0: 4461; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 4462; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 4463; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 4464; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 4465; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 4466; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 4467; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] 4468; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 4469; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4470; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo 4471; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 4472; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4473; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 4474; GFX10-NEXT: ; return to shader part epilog 4475; 4476; GFX11-LABEL: saddsat_i48_vs: 4477; GFX11: ; %bb.0: 4478; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 4479; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 4480; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 4481; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 4482; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 4483; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 4484; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] 4485; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 4486; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4487; GFX11-NEXT: 
v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 4488; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4489; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 4490; GFX11-NEXT: ; return to shader part epilog 4491 %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs) 4492 %ext.result = zext i48 %result to i64 4493 %cast = bitcast i64 %ext.result to <2 x float> 4494 ret <2 x float> %cast 4495} 4496 4497define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { 4498; GFX6-LABEL: v_saddsat_i64: 4499; GFX6: ; %bb.0: 4500; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4501; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2 4502; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc 4503; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] 4504; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] 4505; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 4506; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 4507; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 4508; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 4509; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 4510; GFX6-NEXT: s_setpc_b64 s[30:31] 4511; 4512; GFX8-LABEL: v_saddsat_i64: 4513; GFX8: ; %bb.0: 4514; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4515; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 4516; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc 4517; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] 4518; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] 4519; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 4520; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 4521; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 4522; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 4523; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 4524; GFX8-NEXT: s_setpc_b64 s[30:31] 4525; 4526; GFX9-LABEL: v_saddsat_i64: 4527; GFX9: ; %bb.0: 4528; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4529; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 4530; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc 4531; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] 4532; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 
0, v[2:3] 4533; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 4534; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 4535; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc 4536; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 4537; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 4538; GFX9-NEXT: s_setpc_b64 s[30:31] 4539; 4540; GFX10-LABEL: v_saddsat_i64: 4541; GFX10: ; %bb.0: 4542; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4543; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 4544; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo 4545; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3] 4546; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 4547; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] 4548; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 4549; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo 4550; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo 4551; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo 4552; GFX10-NEXT: s_setpc_b64 s[30:31] 4553; 4554; GFX11-LABEL: v_saddsat_i64: 4555; GFX11: ; %bb.0: 4556; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4557; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 4558; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo 4559; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] 4560; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 4561; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] 4562; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 4563; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4564; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 4565; GFX11-NEXT: s_setpc_b64 s[30:31] 4566 %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) 4567 ret i64 %result 4568} 4569 4570define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { 4571; GFX6-LABEL: s_saddsat_i64: 4572; GFX6: ; %bb.0: 4573; GFX6-NEXT: s_add_u32 s4, s0, s2 4574; GFX6-NEXT: v_mov_b32_e32 v0, s0 4575; GFX6-NEXT: s_addc_u32 s5, s1, s3 4576; GFX6-NEXT: v_mov_b32_e32 v1, s1 4577; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4578; GFX6-NEXT: 
v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 4579; GFX6-NEXT: s_ashr_i32 s2, s5, 31 4580; GFX6-NEXT: s_add_i32 s3, s2, 0x80000000 4581; GFX6-NEXT: v_mov_b32_e32 v0, s2 4582; GFX6-NEXT: v_mov_b32_e32 v1, s3 4583; GFX6-NEXT: v_mov_b32_e32 v2, s4 4584; GFX6-NEXT: v_mov_b32_e32 v3, s5 4585; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc 4586; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4587; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4588; GFX6-NEXT: v_readfirstlane_b32 s0, v0 4589; GFX6-NEXT: v_readfirstlane_b32 s1, v1 4590; GFX6-NEXT: ; return to shader part epilog 4591; 4592; GFX8-LABEL: s_saddsat_i64: 4593; GFX8: ; %bb.0: 4594; GFX8-NEXT: s_add_u32 s4, s0, s2 4595; GFX8-NEXT: v_mov_b32_e32 v0, s0 4596; GFX8-NEXT: s_addc_u32 s5, s1, s3 4597; GFX8-NEXT: v_mov_b32_e32 v1, s1 4598; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4599; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 4600; GFX8-NEXT: s_ashr_i32 s2, s5, 31 4601; GFX8-NEXT: s_add_i32 s3, s2, 0x80000000 4602; GFX8-NEXT: v_mov_b32_e32 v0, s2 4603; GFX8-NEXT: v_mov_b32_e32 v1, s3 4604; GFX8-NEXT: v_mov_b32_e32 v2, s4 4605; GFX8-NEXT: v_mov_b32_e32 v3, s5 4606; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc 4607; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4608; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4609; GFX8-NEXT: v_readfirstlane_b32 s0, v0 4610; GFX8-NEXT: v_readfirstlane_b32 s1, v1 4611; GFX8-NEXT: ; return to shader part epilog 4612; 4613; GFX9-LABEL: s_saddsat_i64: 4614; GFX9: ; %bb.0: 4615; GFX9-NEXT: s_add_u32 s4, s0, s2 4616; GFX9-NEXT: v_mov_b32_e32 v0, s0 4617; GFX9-NEXT: s_addc_u32 s5, s1, s3 4618; GFX9-NEXT: v_mov_b32_e32 v1, s1 4619; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4620; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 4621; GFX9-NEXT: s_ashr_i32 s2, s5, 31 4622; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 4623; GFX9-NEXT: v_mov_b32_e32 v0, s2 4624; GFX9-NEXT: v_mov_b32_e32 v1, s3 4625; GFX9-NEXT: v_mov_b32_e32 v2, s4 4626; GFX9-NEXT: v_mov_b32_e32 v3, s5 4627; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc 4628; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4629; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4630; GFX9-NEXT: v_readfirstlane_b32 s0, v0 4631; GFX9-NEXT: v_readfirstlane_b32 s1, v1 4632; GFX9-NEXT: ; return to shader part epilog 4633; 4634; GFX10-LABEL: s_saddsat_i64: 4635; GFX10: ; %bb.0: 4636; GFX10-NEXT: s_add_u32 s4, s0, s2 4637; GFX10-NEXT: s_addc_u32 s5, s1, s3 4638; GFX10-NEXT: v_mov_b32_e32 v0, s4 4639; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] 4640; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 4641; GFX10-NEXT: v_mov_b32_e32 v1, s5 4642; GFX10-NEXT: s_ashr_i32 s2, s5, 31 4643; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 4644; GFX10-NEXT: s_xor_b32 s0, s1, s0 4645; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 4646; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 4647; GFX10-NEXT: v_readfirstlane_b32 s0, v0 4648; GFX10-NEXT: v_readfirstlane_b32 s1, v1 4649; GFX10-NEXT: ; return to shader part epilog 4650; 4651; GFX11-LABEL: s_saddsat_i64: 4652; GFX11: ; %bb.0: 4653; GFX11-NEXT: s_add_u32 s4, s0, s2 4654; GFX11-NEXT: s_addc_u32 s5, s1, s3 4655; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 4656; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] 4657; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 4658; GFX11-NEXT: s_ashr_i32 s2, s5, 31 4659; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 4660; GFX11-NEXT: s_xor_b32 s0, s1, s0 4661; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 4662; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 4663; GFX11-NEXT: v_readfirstlane_b32 s0, v0 4664; GFX11-NEXT: v_readfirstlane_b32 s1, v1 4665; GFX11-NEXT: ; return to shader part epilog 4666 %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) 4667 ret i64 %result 4668} 4669 4670define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { 4671; GFX6-LABEL: saddsat_i64_sv: 4672; GFX6: ; %bb.0: 4673; GFX6-NEXT: v_mov_b32_e32 v3, s1 4674; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0 4675; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc 4676; GFX6-NEXT: v_cmp_gt_i64_e64 
s[0:1], s[0:1], v[2:3] 4677; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] 4678; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4679; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 4680; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] 4681; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4682; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4683; GFX6-NEXT: ; return to shader part epilog 4684; 4685; GFX8-LABEL: saddsat_i64_sv: 4686; GFX8: ; %bb.0: 4687; GFX8-NEXT: v_mov_b32_e32 v3, s1 4688; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 4689; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc 4690; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] 4691; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] 4692; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4693; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 4694; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] 4695; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4696; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4697; GFX8-NEXT: ; return to shader part epilog 4698; 4699; GFX9-LABEL: saddsat_i64_sv: 4700; GFX9: ; %bb.0: 4701; GFX9-NEXT: v_mov_b32_e32 v3, s1 4702; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 4703; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc 4704; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] 4705; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] 4706; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4707; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 4708; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc 4709; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4710; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4711; GFX9-NEXT: ; return to shader part epilog 4712; 4713; GFX10-LABEL: saddsat_i64_sv: 4714; GFX10: ; %bb.0: 4715; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0 4716; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 4717; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 4718; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] 4719; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] 4720; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 4721; GFX10-NEXT: s_xor_b32 
vcc_lo, s0, vcc_lo 4722; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo 4723; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 4724; GFX10-NEXT: ; return to shader part epilog 4725; 4726; GFX11-LABEL: saddsat_i64_sv: 4727; GFX11: ; %bb.0: 4728; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0 4729; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 4730; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 4731; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] 4732; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] 4733; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 4734; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4735; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 4736; GFX11-NEXT: ; return to shader part epilog 4737 %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) 4738 %cast = bitcast i64 %result to <2 x float> 4739 ret <2 x float> %cast 4740} 4741 4742define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { 4743; GFX6-LABEL: saddsat_i64_vs: 4744; GFX6: ; %bb.0: 4745; GFX6-NEXT: v_mov_b32_e32 v3, s1 4746; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0 4747; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc 4748; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] 4749; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 4750; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4751; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 4752; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] 4753; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4754; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4755; GFX6-NEXT: ; return to shader part epilog 4756; 4757; GFX8-LABEL: saddsat_i64_vs: 4758; GFX8: ; %bb.0: 4759; GFX8-NEXT: v_mov_b32_e32 v3, s1 4760; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 4761; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc 4762; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] 4763; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 4764; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4765; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 4766; GFX8-NEXT: 
s_xor_b64 vcc, s[0:1], s[2:3] 4767; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4768; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4769; GFX8-NEXT: ; return to shader part epilog 4770; 4771; GFX9-LABEL: saddsat_i64_vs: 4772; GFX9: ; %bb.0: 4773; GFX9-NEXT: v_mov_b32_e32 v3, s1 4774; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 4775; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc 4776; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] 4777; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 4778; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4779; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 4780; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc 4781; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4782; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4783; GFX9-NEXT: ; return to shader part epilog 4784; 4785; GFX10-LABEL: saddsat_i64_vs: 4786; GFX10: ; %bb.0: 4787; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 4788; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 4789; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 4790; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 4791; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] 4792; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 4793; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4794; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo 4795; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 4796; GFX10-NEXT: ; return to shader part epilog 4797; 4798; GFX11-LABEL: saddsat_i64_vs: 4799; GFX11: ; %bb.0: 4800; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 4801; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 4802; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 4803; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 4804; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] 4805; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 4806; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4807; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 4808; GFX11-NEXT: ; return to shader part epilog 4809 %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) 
4810 %cast = bitcast i64 %result to <2 x float> 4811 ret <2 x float> %cast 4812} 4813 4814define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { 4815; GFX6-LABEL: v_saddsat_v2i64: 4816; GFX6: ; %bb.0: 4817; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4818; GFX6-NEXT: v_add_i32_e32 v8, vcc, v0, v4 4819; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v1, v5, vcc 4820; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] 4821; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5] 4822; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v9 4823; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 4824; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 4825; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 4826; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc 4827; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc 4828; GFX6-NEXT: v_add_i32_e32 v4, vcc, v2, v6 4829; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v3, v7, vcc 4830; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] 4831; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[6:7] 4832; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v5 4833; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v2 4834; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 4835; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 4836; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 4837; GFX6-NEXT: s_setpc_b64 s[30:31] 4838; 4839; GFX8-LABEL: v_saddsat_v2i64: 4840; GFX8: ; %bb.0: 4841; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4842; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v4 4843; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v1, v5, vcc 4844; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] 4845; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5] 4846; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v9 4847; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 4848; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1 4849; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 4850; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc 4851; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc 4852; GFX8-NEXT: v_add_u32_e32 v4, vcc, v2, v6 4853; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v3, v7, vcc 4854; GFX8-NEXT: 
v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] 4855; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[6:7] 4856; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5 4857; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v2 4858; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 4859; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 4860; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 4861; GFX8-NEXT: s_setpc_b64 s[30:31] 4862; 4863; GFX9-LABEL: v_saddsat_v2i64: 4864; GFX9: ; %bb.0: 4865; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4866; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v4 4867; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v5, vcc 4868; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] 4869; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] 4870; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 4871; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 4872; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc 4873; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc 4874; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc 4875; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v6 4876; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v7, vcc 4877; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] 4878; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] 4879; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 4880; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 4881; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc 4882; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 4883; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 4884; GFX9-NEXT: s_setpc_b64 s[30:31] 4885; 4886; GFX10-LABEL: v_saddsat_v2i64: 4887; GFX10: ; %bb.0: 4888; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4889; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, v4 4890; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo 4891; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v2, v6 4892; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo 4893; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 4894; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] 4895; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5] 4896; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, 
v11 4897; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] 4898; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7] 4899; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12 4900; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4 4901; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo 4902; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo 4903; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo 4904; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 4905; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v4, vcc_lo 4906; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo 4907; GFX10-NEXT: s_setpc_b64 s[30:31] 4908; 4909; GFX11-LABEL: v_saddsat_v2i64: 4910; GFX11: ; %bb.0: 4911; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4912; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v4 4913; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo 4914; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v2, v6 4915; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo 4916; GFX11-NEXT: v_ashrrev_i32_e32 v12, 31, v9 4917; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] 4918; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[4:5] 4919; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v11 4920; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[10:11], v[2:3] 4921; GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0, v[6:7] 4922; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12 4923; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4 4924; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4925; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1 4926; GFX11-NEXT: s_xor_b32 vcc_lo, s2, s1 4927; GFX11-NEXT: v_dual_cndmask_b32 v2, v10, v4 :: v_dual_cndmask_b32 v3, v11, v3 4928; GFX11-NEXT: s_setpc_b64 s[30:31] 4929 %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) 4930 ret <2 x i64> %result 4931} 4932 4933define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) { 4934; GFX6-LABEL: s_saddsat_v2i64: 4935; GFX6: ; %bb.0: 4936; GFX6-NEXT: s_add_u32 s8, s0, s4 4937; GFX6-NEXT: v_mov_b32_e32 v0, s0 4938; GFX6-NEXT: 
s_addc_u32 s9, s1, s5 4939; GFX6-NEXT: v_mov_b32_e32 v1, s1 4940; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] 4941; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 4942; GFX6-NEXT: s_ashr_i32 s4, s9, 31 4943; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 4944; GFX6-NEXT: v_mov_b32_e32 v0, s4 4945; GFX6-NEXT: v_mov_b32_e32 v1, s5 4946; GFX6-NEXT: v_mov_b32_e32 v2, s8 4947; GFX6-NEXT: v_mov_b32_e32 v3, s9 4948; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc 4949; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 4950; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc 4951; GFX6-NEXT: s_add_u32 s0, s2, s6 4952; GFX6-NEXT: v_mov_b32_e32 v0, s2 4953; GFX6-NEXT: s_addc_u32 s1, s3, s7 4954; GFX6-NEXT: v_mov_b32_e32 v1, s3 4955; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] 4956; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 4957; GFX6-NEXT: s_ashr_i32 s4, s1, 31 4958; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 4959; GFX6-NEXT: v_mov_b32_e32 v0, s4 4960; GFX6-NEXT: v_mov_b32_e32 v1, s5 4961; GFX6-NEXT: v_mov_b32_e32 v4, s0 4962; GFX6-NEXT: v_mov_b32_e32 v5, s1 4963; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc 4964; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 4965; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 4966; GFX6-NEXT: v_readfirstlane_b32 s0, v2 4967; GFX6-NEXT: v_readfirstlane_b32 s1, v3 4968; GFX6-NEXT: v_readfirstlane_b32 s2, v0 4969; GFX6-NEXT: v_readfirstlane_b32 s3, v1 4970; GFX6-NEXT: ; return to shader part epilog 4971; 4972; GFX8-LABEL: s_saddsat_v2i64: 4973; GFX8: ; %bb.0: 4974; GFX8-NEXT: s_add_u32 s8, s0, s4 4975; GFX8-NEXT: v_mov_b32_e32 v0, s0 4976; GFX8-NEXT: s_addc_u32 s9, s1, s5 4977; GFX8-NEXT: v_mov_b32_e32 v1, s1 4978; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] 4979; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 4980; GFX8-NEXT: s_ashr_i32 s4, s9, 31 4981; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 4982; GFX8-NEXT: v_mov_b32_e32 v0, s4 4983; GFX8-NEXT: v_mov_b32_e32 v1, s5 4984; GFX8-NEXT: v_mov_b32_e32 v2, s8 4985; GFX8-NEXT: v_mov_b32_e32 v3, s9 4986; GFX8-NEXT: s_xor_b64 
vcc, s[0:1], vcc 4987; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 4988; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc 4989; GFX8-NEXT: s_add_u32 s0, s2, s6 4990; GFX8-NEXT: v_mov_b32_e32 v0, s2 4991; GFX8-NEXT: s_addc_u32 s1, s3, s7 4992; GFX8-NEXT: v_mov_b32_e32 v1, s3 4993; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] 4994; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 4995; GFX8-NEXT: s_ashr_i32 s4, s1, 31 4996; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 4997; GFX8-NEXT: v_mov_b32_e32 v0, s4 4998; GFX8-NEXT: v_mov_b32_e32 v1, s5 4999; GFX8-NEXT: v_mov_b32_e32 v4, s0 5000; GFX8-NEXT: v_mov_b32_e32 v5, s1 5001; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc 5002; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 5003; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 5004; GFX8-NEXT: v_readfirstlane_b32 s0, v2 5005; GFX8-NEXT: v_readfirstlane_b32 s1, v3 5006; GFX8-NEXT: v_readfirstlane_b32 s2, v0 5007; GFX8-NEXT: v_readfirstlane_b32 s3, v1 5008; GFX8-NEXT: ; return to shader part epilog 5009; 5010; GFX9-LABEL: s_saddsat_v2i64: 5011; GFX9: ; %bb.0: 5012; GFX9-NEXT: s_add_u32 s8, s0, s4 5013; GFX9-NEXT: v_mov_b32_e32 v0, s0 5014; GFX9-NEXT: s_addc_u32 s9, s1, s5 5015; GFX9-NEXT: v_mov_b32_e32 v1, s1 5016; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] 5017; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 5018; GFX9-NEXT: s_ashr_i32 s4, s9, 31 5019; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 5020; GFX9-NEXT: v_mov_b32_e32 v0, s4 5021; GFX9-NEXT: v_mov_b32_e32 v1, s5 5022; GFX9-NEXT: v_mov_b32_e32 v2, s8 5023; GFX9-NEXT: v_mov_b32_e32 v3, s9 5024; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc 5025; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 5026; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc 5027; GFX9-NEXT: s_add_u32 s0, s2, s6 5028; GFX9-NEXT: v_mov_b32_e32 v0, s2 5029; GFX9-NEXT: s_addc_u32 s1, s3, s7 5030; GFX9-NEXT: v_mov_b32_e32 v1, s3 5031; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] 5032; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 5033; GFX9-NEXT: s_ashr_i32 s4, s1, 31 5034; 
GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 5035; GFX9-NEXT: v_mov_b32_e32 v0, s4 5036; GFX9-NEXT: v_mov_b32_e32 v1, s5 5037; GFX9-NEXT: v_mov_b32_e32 v4, s0 5038; GFX9-NEXT: v_mov_b32_e32 v5, s1 5039; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc 5040; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 5041; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 5042; GFX9-NEXT: v_readfirstlane_b32 s0, v2 5043; GFX9-NEXT: v_readfirstlane_b32 s1, v3 5044; GFX9-NEXT: v_readfirstlane_b32 s2, v0 5045; GFX9-NEXT: v_readfirstlane_b32 s3, v1 5046; GFX9-NEXT: ; return to shader part epilog 5047; 5048; GFX10-LABEL: s_saddsat_v2i64: 5049; GFX10: ; %bb.0: 5050; GFX10-NEXT: s_add_u32 s8, s0, s4 5051; GFX10-NEXT: s_addc_u32 s9, s1, s5 5052; GFX10-NEXT: v_mov_b32_e32 v0, s8 5053; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1] 5054; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0 5055; GFX10-NEXT: s_ashr_i32 s4, s9, 31 5056; GFX10-NEXT: v_mov_b32_e32 v1, s9 5057; GFX10-NEXT: s_add_i32 s5, s4, 0x80000000 5058; GFX10-NEXT: s_xor_b32 s8, s1, s0 5059; GFX10-NEXT: s_add_u32 s0, s2, s6 5060; GFX10-NEXT: s_addc_u32 s1, s3, s7 5061; GFX10-NEXT: v_mov_b32_e32 v2, s0 5062; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[0:1], s[2:3] 5063; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[6:7], 0 5064; GFX10-NEXT: v_mov_b32_e32 v3, s1 5065; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8 5066; GFX10-NEXT: s_ashr_i32 s4, s1, 31 5067; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8 5068; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000 5069; GFX10-NEXT: s_xor_b32 s1, s3, s2 5070; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1 5071; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1 5072; GFX10-NEXT: v_readfirstlane_b32 s0, v0 5073; GFX10-NEXT: v_readfirstlane_b32 s1, v1 5074; GFX10-NEXT: v_readfirstlane_b32 s2, v2 5075; GFX10-NEXT: v_readfirstlane_b32 s3, v3 5076; GFX10-NEXT: ; return to shader part epilog 5077; 5078; GFX11-LABEL: s_saddsat_v2i64: 5079; GFX11: ; %bb.0: 5080; GFX11-NEXT: s_add_u32 s8, s0, s4 5081; GFX11-NEXT: s_addc_u32 s9, s1, s5 5082; GFX11-NEXT: 
v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 5083; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1] 5084; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0 5085; GFX11-NEXT: s_ashr_i32 s4, s9, 31 5086; GFX11-NEXT: s_add_i32 s5, s4, 0x80000000 5087; GFX11-NEXT: s_xor_b32 s8, s1, s0 5088; GFX11-NEXT: s_add_u32 s0, s2, s6 5089; GFX11-NEXT: s_addc_u32 s1, s3, s7 5090; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 5091; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[0:1], s[2:3] 5092; GFX11-NEXT: v_cmp_lt_i64_e64 s3, s[6:7], 0 5093; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8 5094; GFX11-NEXT: s_ashr_i32 s4, s1, 31 5095; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8 5096; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 5097; GFX11-NEXT: s_xor_b32 s1, s3, s2 5098; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1 5099; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1 5100; GFX11-NEXT: v_readfirstlane_b32 s0, v0 5101; GFX11-NEXT: v_readfirstlane_b32 s1, v1 5102; GFX11-NEXT: v_readfirstlane_b32 s2, v2 5103; GFX11-NEXT: v_readfirstlane_b32 s3, v3 5104; GFX11-NEXT: ; return to shader part epilog 5105 %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) 5106 ret <2 x i64> %result 5107} 5108 5109define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { 5110; GFX6-LABEL: s_saddsat_i128: 5111; GFX6: ; %bb.0: 5112; GFX6-NEXT: s_add_u32 s4, s0, s4 5113; GFX6-NEXT: v_mov_b32_e32 v0, s0 5114; GFX6-NEXT: s_addc_u32 s5, s1, s5 5115; GFX6-NEXT: v_mov_b32_e32 v1, s1 5116; GFX6-NEXT: s_addc_u32 s8, s2, s6 5117; GFX6-NEXT: v_mov_b32_e32 v2, s2 5118; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 5119; GFX6-NEXT: s_addc_u32 s9, s3, s7 5120; GFX6-NEXT: v_mov_b32_e32 v3, s3 5121; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5122; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[2:3] 5123; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 5124; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5125; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3] 5126; GFX6-NEXT: v_mov_b32_e32 v2, s4 5127; 
GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5128; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5129; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0 5130; GFX6-NEXT: v_mov_b32_e32 v3, s5 5131; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] 5132; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 5133; GFX6-NEXT: s_ashr_i32 s0, s9, 31 5134; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 5135; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 5136; GFX6-NEXT: v_mov_b32_e32 v1, s0 5137; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5138; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 5139; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc 5140; GFX6-NEXT: v_mov_b32_e32 v3, s1 5141; GFX6-NEXT: v_mov_b32_e32 v4, s8 5142; GFX6-NEXT: v_mov_b32_e32 v5, s9 5143; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc 5144; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 5145; GFX6-NEXT: v_readfirstlane_b32 s0, v0 5146; GFX6-NEXT: v_readfirstlane_b32 s1, v2 5147; GFX6-NEXT: v_readfirstlane_b32 s2, v1 5148; GFX6-NEXT: v_readfirstlane_b32 s3, v3 5149; GFX6-NEXT: ; return to shader part epilog 5150; 5151; GFX8-LABEL: s_saddsat_i128: 5152; GFX8: ; %bb.0: 5153; GFX8-NEXT: s_add_u32 s4, s0, s4 5154; GFX8-NEXT: s_addc_u32 s5, s1, s5 5155; GFX8-NEXT: v_mov_b32_e32 v0, s0 5156; GFX8-NEXT: s_addc_u32 s8, s2, s6 5157; GFX8-NEXT: v_mov_b32_e32 v1, s1 5158; GFX8-NEXT: s_addc_u32 s9, s3, s7 5159; GFX8-NEXT: v_mov_b32_e32 v2, s2 5160; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 5161; GFX8-NEXT: v_mov_b32_e32 v3, s3 5162; GFX8-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] 5163; GFX8-NEXT: s_cselect_b32 s0, 1, 0 5164; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5165; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[2:3] 5166; GFX8-NEXT: s_and_b32 s0, 1, s0 5167; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5168; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 5169; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 5170; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 5171; GFX8-NEXT: s_cselect_b32 s2, 1, 0 5172; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5173; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 
1, s[0:1] 5174; GFX8-NEXT: s_and_b32 s0, 1, s2 5175; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 5176; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] 5177; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 5178; GFX8-NEXT: s_ashr_i32 s0, s9, 31 5179; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 5180; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 5181; GFX8-NEXT: v_mov_b32_e32 v1, s0 5182; GFX8-NEXT: v_mov_b32_e32 v2, s4 5183; GFX8-NEXT: v_mov_b32_e32 v3, s5 5184; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5185; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 5186; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc 5187; GFX8-NEXT: v_mov_b32_e32 v3, s1 5188; GFX8-NEXT: v_mov_b32_e32 v4, s8 5189; GFX8-NEXT: v_mov_b32_e32 v5, s9 5190; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc 5191; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 5192; GFX8-NEXT: v_readfirstlane_b32 s0, v0 5193; GFX8-NEXT: v_readfirstlane_b32 s1, v2 5194; GFX8-NEXT: v_readfirstlane_b32 s2, v1 5195; GFX8-NEXT: v_readfirstlane_b32 s3, v3 5196; GFX8-NEXT: ; return to shader part epilog 5197; 5198; GFX9-LABEL: s_saddsat_i128: 5199; GFX9: ; %bb.0: 5200; GFX9-NEXT: s_add_u32 s4, s0, s4 5201; GFX9-NEXT: s_addc_u32 s5, s1, s5 5202; GFX9-NEXT: v_mov_b32_e32 v0, s0 5203; GFX9-NEXT: s_addc_u32 s8, s2, s6 5204; GFX9-NEXT: v_mov_b32_e32 v1, s1 5205; GFX9-NEXT: s_addc_u32 s9, s3, s7 5206; GFX9-NEXT: v_mov_b32_e32 v2, s2 5207; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 5208; GFX9-NEXT: v_mov_b32_e32 v3, s3 5209; GFX9-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] 5210; GFX9-NEXT: s_cselect_b32 s0, 1, 0 5211; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5212; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[2:3] 5213; GFX9-NEXT: s_and_b32 s0, 1, s0 5214; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5215; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 5216; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 5217; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 5218; GFX9-NEXT: s_cselect_b32 s2, 1, 0 5219; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5220; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5221; 
GFX9-NEXT: s_and_b32 s0, 1, s2 5222; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 5223; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] 5224; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 5225; GFX9-NEXT: s_ashr_i32 s0, s9, 31 5226; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 5227; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 5228; GFX9-NEXT: v_mov_b32_e32 v1, s0 5229; GFX9-NEXT: v_mov_b32_e32 v2, s4 5230; GFX9-NEXT: v_mov_b32_e32 v3, s5 5231; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5232; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 5233; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc 5234; GFX9-NEXT: v_mov_b32_e32 v3, s1 5235; GFX9-NEXT: v_mov_b32_e32 v4, s8 5236; GFX9-NEXT: v_mov_b32_e32 v5, s9 5237; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc 5238; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 5239; GFX9-NEXT: v_readfirstlane_b32 s0, v0 5240; GFX9-NEXT: v_readfirstlane_b32 s1, v2 5241; GFX9-NEXT: v_readfirstlane_b32 s2, v1 5242; GFX9-NEXT: v_readfirstlane_b32 s3, v3 5243; GFX9-NEXT: ; return to shader part epilog 5244; 5245; GFX10-LABEL: s_saddsat_i128: 5246; GFX10: ; %bb.0: 5247; GFX10-NEXT: s_add_u32 s4, s0, s4 5248; GFX10-NEXT: s_addc_u32 s5, s1, s5 5249; GFX10-NEXT: s_addc_u32 s8, s2, s6 5250; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], s[0:1] 5251; GFX10-NEXT: s_addc_u32 s9, s3, s7 5252; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] 5253; GFX10-NEXT: v_mov_b32_e32 v3, s9 5254; GFX10-NEXT: s_cselect_b32 s10, 1, 0 5255; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 5256; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[2:3] 5257; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], 0 5258; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 5259; GFX10-NEXT: s_and_b32 s0, 1, s10 5260; GFX10-NEXT: s_cmp_eq_u64 s[6:7], 0 5261; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 5262; GFX10-NEXT: s_cselect_b32 s1, 1, 0 5263; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 5264; GFX10-NEXT: s_and_b32 s1, 1, s1 5265; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 5266; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 5267; GFX10-NEXT: v_cndmask_b32_e64 
v1, v2, 0, s0 5268; GFX10-NEXT: v_mov_b32_e32 v2, s5 5269; GFX10-NEXT: s_ashr_i32 s0, s9, 31 5270; GFX10-NEXT: s_add_i32 s1, s0, 0x80000000 5271; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 5272; GFX10-NEXT: v_mov_b32_e32 v1, s4 5273; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 5274; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 5275; GFX10-NEXT: v_mov_b32_e32 v0, s8 5276; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo 5277; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, vcc_lo 5278; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo 5279; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo 5280; GFX10-NEXT: v_readfirstlane_b32 s0, v1 5281; GFX10-NEXT: v_readfirstlane_b32 s1, v2 5282; GFX10-NEXT: v_readfirstlane_b32 s2, v0 5283; GFX10-NEXT: v_readfirstlane_b32 s3, v3 5284; GFX10-NEXT: ; return to shader part epilog 5285; 5286; GFX11-LABEL: s_saddsat_i128: 5287; GFX11: ; %bb.0: 5288; GFX11-NEXT: s_add_u32 s4, s0, s4 5289; GFX11-NEXT: s_addc_u32 s5, s1, s5 5290; GFX11-NEXT: s_addc_u32 s8, s2, s6 5291; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], s[0:1] 5292; GFX11-NEXT: s_addc_u32 s9, s3, s7 5293; GFX11-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] 5294; GFX11-NEXT: v_mov_b32_e32 v3, s9 5295; GFX11-NEXT: s_cselect_b32 s10, 1, 0 5296; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 5297; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[2:3] 5298; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], 0 5299; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 5300; GFX11-NEXT: s_and_b32 s0, 1, s10 5301; GFX11-NEXT: s_cmp_eq_u64 s[6:7], 0 5302; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 5303; GFX11-NEXT: s_cselect_b32 s1, 1, 0 5304; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 5305; GFX11-NEXT: s_and_b32 s1, 1, s1 5306; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 5307; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 5308; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 5309; GFX11-NEXT: v_mov_b32_e32 v2, s5 5310; GFX11-NEXT: s_ashr_i32 s0, s9, 31 5311; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000 5312; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 5313; GFX11-NEXT: 
v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0 5314; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 5315; GFX11-NEXT: v_mov_b32_e32 v0, s8 5316; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo 5317; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, vcc_lo 5318; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo 5319; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo 5320; GFX11-NEXT: v_readfirstlane_b32 s0, v1 5321; GFX11-NEXT: v_readfirstlane_b32 s1, v2 5322; GFX11-NEXT: v_readfirstlane_b32 s2, v0 5323; GFX11-NEXT: v_readfirstlane_b32 s3, v3 5324; GFX11-NEXT: ; return to shader part epilog 5325 %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs) 5326 ret i128 %result 5327} 5328 5329define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { 5330; GFX6-LABEL: saddsat_i128_sv: 5331; GFX6: ; %bb.0: 5332; GFX6-NEXT: v_mov_b32_e32 v4, s1 5333; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 5334; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc 5335; GFX6-NEXT: v_mov_b32_e32 v4, s2 5336; GFX6-NEXT: v_mov_b32_e32 v5, s3 5337; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v2, vcc 5338; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc 5339; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] 5340; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc 5341; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5] 5342; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc 5343; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5] 5344; GFX6-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc 5345; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] 5346; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc 5347; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] 5348; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v5 5349; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc 5350; GFX6-NEXT: v_xor_b32_e32 v2, v2, v6 5351; GFX6-NEXT: v_bfrev_b32_e32 v6, 1 5352; GFX6-NEXT: v_add_i32_e32 v6, vcc, v3, v6 5353; GFX6-NEXT: v_and_b32_e32 v2, 1, v2 5354; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 5355; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 5356; GFX6-NEXT: v_cndmask_b32_e32 
v1, v1, v3, vcc 5357; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc 5358; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 5359; GFX6-NEXT: ; return to shader part epilog 5360; 5361; GFX8-LABEL: saddsat_i128_sv: 5362; GFX8: ; %bb.0: 5363; GFX8-NEXT: v_mov_b32_e32 v4, s1 5364; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 5365; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc 5366; GFX8-NEXT: v_mov_b32_e32 v4, s2 5367; GFX8-NEXT: v_mov_b32_e32 v5, s3 5368; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v2, vcc 5369; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc 5370; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] 5371; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc 5372; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5] 5373; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc 5374; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5] 5375; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc 5376; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] 5377; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc 5378; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] 5379; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v5 5380; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc 5381; GFX8-NEXT: v_xor_b32_e32 v2, v2, v6 5382; GFX8-NEXT: v_bfrev_b32_e32 v6, 1 5383; GFX8-NEXT: v_add_u32_e32 v6, vcc, v3, v6 5384; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 5385; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 5386; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 5387; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 5388; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc 5389; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 5390; GFX8-NEXT: ; return to shader part epilog 5391; 5392; GFX9-LABEL: saddsat_i128_sv: 5393; GFX9: ; %bb.0: 5394; GFX9-NEXT: v_mov_b32_e32 v4, s1 5395; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 5396; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc 5397; GFX9-NEXT: v_mov_b32_e32 v4, s2 5398; GFX9-NEXT: v_mov_b32_e32 v5, s3 5399; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc 5400; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v3, vcc 5401; GFX9-NEXT: v_cmp_gt_u64_e32 
vcc, s[0:1], v[0:1] 5402; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc 5403; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5] 5404; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc 5405; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5] 5406; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc 5407; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] 5408; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc 5409; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] 5410; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5 5411; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc 5412; GFX9-NEXT: v_xor_b32_e32 v2, v2, v6 5413; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 5414; GFX9-NEXT: v_add_u32_e32 v6, 0x80000000, v3 5415; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 5416; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 5417; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 5418; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc 5419; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 5420; GFX9-NEXT: ; return to shader part epilog 5421; 5422; GFX10-LABEL: saddsat_i128_sv: 5423; GFX10: ; %bb.0: 5424; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 5425; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo 5426; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo 5427; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo 5428; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] 5429; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo 5430; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5] 5431; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo 5432; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] 5433; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo 5434; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5] 5435; GFX10-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo 5436; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] 5437; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v5 5438; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo 5439; GFX10-NEXT: v_xor_b32_e32 v2, v2, v6 5440; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3 5441; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 5442; 
GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 5443; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo 5444; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo 5445; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc_lo 5446; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo 5447; GFX10-NEXT: ; return to shader part epilog 5448; 5449; GFX11-LABEL: saddsat_i128_sv: 5450; GFX11: ; %bb.0: 5451; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 5452; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo 5453; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo 5454; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo 5455; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] 5456; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo 5457; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5] 5458; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo 5459; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] 5460; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo 5461; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5] 5462; GFX11-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo 5463; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] 5464; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v5 5465; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo 5466; GFX11-NEXT: v_xor_b32_e32 v2, v2, v6 5467; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3 5468; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 5469; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 5470; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo 5471; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo 5472; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_cndmask_b32 v3, v5, v6 5473; GFX11-NEXT: ; return to shader part epilog 5474 %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs) 5475 %cast = bitcast i128 %result to <4 x float> 5476 ret <4 x float> %cast 5477} 5478 5479define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { 5480; GFX6-LABEL: saddsat_i128_vs: 5481; GFX6: ; %bb.0: 5482; GFX6-NEXT: v_mov_b32_e32 v5, s1 5483; GFX6-NEXT: 
v_add_i32_e32 v4, vcc, s0, v0 5484; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc 5485; GFX6-NEXT: v_mov_b32_e32 v6, s2 5486; GFX6-NEXT: v_mov_b32_e32 v7, s3 5487; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v2, v6, vcc 5488; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v3, v7, vcc 5489; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] 5490; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 5491; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5492; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3] 5493; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5494; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 5495; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7 5496; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5497; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5498; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 5499; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] 5500; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 5501; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 5502; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1 5503; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 5504; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5505; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc 5506; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc 5507; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 5508; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 5509; GFX6-NEXT: ; return to shader part epilog 5510; 5511; GFX8-LABEL: saddsat_i128_vs: 5512; GFX8: ; %bb.0: 5513; GFX8-NEXT: v_mov_b32_e32 v5, s1 5514; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v0 5515; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc 5516; GFX8-NEXT: v_mov_b32_e32 v6, s2 5517; GFX8-NEXT: v_mov_b32_e32 v7, s3 5518; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v2, v6, vcc 5519; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v3, v7, vcc 5520; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] 5521; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 5522; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5523; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3] 5524; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 5525; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5526; GFX8-NEXT: 
v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 5527; GFX8-NEXT: s_cselect_b32 s4, 1, 0 5528; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5529; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5530; GFX8-NEXT: s_and_b32 s0, 1, s4 5531; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 5532; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] 5533; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 5534; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v7 5535; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 5536; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 5537; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 5538; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5539; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc 5540; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc 5541; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 5542; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 5543; GFX8-NEXT: ; return to shader part epilog 5544; 5545; GFX9-LABEL: saddsat_i128_vs: 5546; GFX9: ; %bb.0: 5547; GFX9-NEXT: v_mov_b32_e32 v5, s1 5548; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 5549; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v5, vcc 5550; GFX9-NEXT: v_mov_b32_e32 v6, s2 5551; GFX9-NEXT: v_mov_b32_e32 v7, s3 5552; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v2, v6, vcc 5553; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v3, v7, vcc 5554; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] 5555; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 5556; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5557; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3] 5558; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 5559; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5560; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 5561; GFX9-NEXT: s_cselect_b32 s4, 1, 0 5562; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5563; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5564; GFX9-NEXT: s_and_b32 s0, 1, s4 5565; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 5566; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] 5567; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 5568; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7 5569; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 5570; 
GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 5571; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5572; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc 5573; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc 5574; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 5575; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 5576; GFX9-NEXT: ; return to shader part epilog 5577; 5578; GFX10-LABEL: saddsat_i128_vs: 5579; GFX10: ; %bb.0: 5580; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, s0 5581; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo 5582; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo 5583; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo 5584; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] 5585; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 5586; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 5587; GFX10-NEXT: s_cselect_b32 s0, 1, 0 5588; GFX10-NEXT: s_and_b32 s0, 1, s0 5589; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 5590; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] 5591; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 5592; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 5593; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 5594; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] 5595; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 5596; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 5597; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 5598; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0 5599; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 5600; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 5601; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 5602; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo 5603; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo 5604; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo 5605; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo 5606; GFX10-NEXT: ; return to shader part epilog 5607; 5608; GFX11-LABEL: saddsat_i128_vs: 5609; GFX11: ; %bb.0: 5610; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, s0 5611; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo 5612; GFX11-NEXT: 
v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo 5613; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo 5614; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] 5615; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0 5616; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 5617; GFX11-NEXT: s_cselect_b32 s0, 1, 0 5618; GFX11-NEXT: s_and_b32 s0, 1, s0 5619; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 5620; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] 5621; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 5622; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 5623; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 5624; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] 5625; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 5626; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2 5627; GFX11-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0 5628; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 5629; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 5630; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 5631; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo 5632; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v3, v7, v3 5633; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo 5634; GFX11-NEXT: ; return to shader part epilog 5635 %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs) 5636 %cast = bitcast i128 %result to <4 x float> 5637 ret <4 x float> %cast 5638} 5639 5640define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { 5641; GFX6-LABEL: v_saddsat_v2i128: 5642; GFX6: ; %bb.0: 5643; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5644; GFX6-NEXT: v_add_i32_e32 v8, vcc, v0, v8 5645; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v1, v9, vcc 5646; GFX6-NEXT: v_addc_u32_e32 v16, vcc, v2, v10, vcc 5647; GFX6-NEXT: v_addc_u32_e32 v17, vcc, v3, v11, vcc 5648; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1] 5649; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5650; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3] 5651; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5652; GFX6-NEXT: 
v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3] 5653; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v17 5654; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5655; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] 5656; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5657; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] 5658; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 5659; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 5660; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 5661; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1 5662; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 5663; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5664; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc 5665; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc 5666; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc 5667; GFX6-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc 5668; GFX6-NEXT: v_add_i32_e32 v8, vcc, v4, v12 5669; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v5, v13, vcc 5670; GFX6-NEXT: v_addc_u32_e32 v10, vcc, v6, v14, vcc 5671; GFX6-NEXT: v_addc_u32_e32 v11, vcc, v7, v15, vcc 5672; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] 5673; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc 5674; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] 5675; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 5676; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] 5677; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v11 5678; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc 5679; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] 5680; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 5681; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] 5682; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc 5683; GFX6-NEXT: v_xor_b32_e32 v4, v5, v4 5684; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0x80000000, v6 5685; GFX6-NEXT: v_and_b32_e32 v4, 1, v4 5686; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 5687; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc 5688; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc 5689; GFX6-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc 5690; GFX6-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc 5691; GFX6-NEXT: s_setpc_b64 s[30:31] 5692; 5693; GFX8-LABEL: 
v_saddsat_v2i128: 5694; GFX8: ; %bb.0: 5695; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5696; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v8 5697; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v1, v9, vcc 5698; GFX8-NEXT: v_addc_u32_e32 v16, vcc, v2, v10, vcc 5699; GFX8-NEXT: v_addc_u32_e32 v17, vcc, v3, v11, vcc 5700; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1] 5701; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5702; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3] 5703; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5704; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3] 5705; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v17 5706; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5707; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] 5708; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5709; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] 5710; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 5711; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 5712; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 5713; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 5714; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 5715; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5716; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc 5717; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc 5718; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc 5719; GFX8-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc 5720; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v12 5721; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v5, v13, vcc 5722; GFX8-NEXT: v_addc_u32_e32 v10, vcc, v6, v14, vcc 5723; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v7, v15, vcc 5724; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] 5725; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc 5726; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] 5727; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 5728; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] 5729; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v11 5730; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc 5731; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] 5732; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 5733; GFX8-NEXT: v_cmp_eq_u64_e32 
vcc, 0, v[14:15] 5734; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc 5735; GFX8-NEXT: v_xor_b32_e32 v4, v5, v4 5736; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x80000000, v6 5737; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 5738; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 5739; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc 5740; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc 5741; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc 5742; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc 5743; GFX8-NEXT: s_setpc_b64 s[30:31] 5744; 5745; GFX9-LABEL: v_saddsat_v2i128: 5746; GFX9: ; %bb.0: 5747; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5748; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v8 5749; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v9, vcc 5750; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, v2, v10, vcc 5751; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, v3, v11, vcc 5752; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1] 5753; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5754; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3] 5755; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5756; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3] 5757; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v17 5758; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5759; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] 5760; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 5761; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5762; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] 5763; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 5764; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 5765; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 5766; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5767; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc 5768; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc 5769; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc 5770; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc 5771; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v12 5772; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v13, vcc 5773; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v6, v14, vcc 5774; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v7, 
v15, vcc 5775; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] 5776; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc 5777; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] 5778; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 5779; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] 5780; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11 5781; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc 5782; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] 5783; GFX9-NEXT: v_add_u32_e32 v7, 0x80000000, v6 5784; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 5785; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] 5786; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc 5787; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4 5788; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 5789; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 5790; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc 5791; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc 5792; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc 5793; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc 5794; GFX9-NEXT: s_setpc_b64 s[30:31] 5795; 5796; GFX10-LABEL: v_saddsat_v2i128: 5797; GFX10: ; %bb.0: 5798; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5799; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, v8 5800; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo 5801; GFX10-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo 5802; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v3, v11, vcc_lo 5803; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[0:1] 5804; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 5805; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3] 5806; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 5807; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3] 5808; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 5809; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11] 5810; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 5811; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v4, v12 5812; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo 5813; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo 
5814; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo 5815; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] 5816; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo 5817; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5] 5818; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 5819; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo 5820; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7] 5821; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 5822; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo 5823; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15] 5824; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo 5825; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7] 5826; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v19 5827; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo 5828; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] 5829; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6 5830; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo 5831; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 5832; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 5833; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v17 5834; GFX10-NEXT: v_and_b32_e32 v3, 1, v1 5835; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2 5836; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc_lo 5837; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo 5838; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc_lo 5839; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3 5840; GFX10-NEXT: v_cndmask_b32_e32 v3, v17, v4, vcc_lo 5841; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v6, s4 5842; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v6, s4 5843; GFX10-NEXT: v_cndmask_b32_e64 v6, v18, v6, s4 5844; GFX10-NEXT: v_cndmask_b32_e64 v7, v19, v7, s4 5845; GFX10-NEXT: s_setpc_b64 s[30:31] 5846; 5847; GFX11-LABEL: v_saddsat_v2i128: 5848; GFX11: ; %bb.0: 5849; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5850; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v8 5851; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo 5852; GFX11-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo 5853; GFX11-NEXT: 
v_add_co_ci_u32_e32 v17, vcc_lo, v3, v11, vcc_lo 5854; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[0:1] 5855; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 5856; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3] 5857; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 5858; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3] 5859; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 5860; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11] 5861; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 5862; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v4, v12 5863; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo 5864; GFX11-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo 5865; GFX11-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo 5866; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] 5867; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo 5868; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5] 5869; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 5870; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo 5871; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7] 5872; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo 5873; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15] 5874; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo 5875; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7] 5876; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v19 5877; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo 5878; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] 5879; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6 5880; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo 5881; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 5882; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v17 5883; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 5884; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2 5885; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 5886; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v2 :: v_dual_and_b32 v3, 1, v1 5887; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 5888; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo 5889; GFX11-NEXT: 
v_dual_cndmask_b32 v2, v16, v2 :: v_dual_cndmask_b32 v3, v17, v4 5890; GFX11-NEXT: v_cndmask_b32_e64 v4, v12, v6, s0 5891; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v6, s0 5892; GFX11-NEXT: v_cndmask_b32_e64 v6, v18, v6, s0 5893; GFX11-NEXT: v_cndmask_b32_e64 v7, v19, v7, s0 5894; GFX11-NEXT: s_setpc_b64 s[30:31] 5895 %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) 5896 ret <2 x i128> %result 5897} 5898 5899define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) { 5900; GFX6-LABEL: s_saddsat_v2i128: 5901; GFX6: ; %bb.0: 5902; GFX6-NEXT: s_add_u32 s8, s0, s8 5903; GFX6-NEXT: v_mov_b32_e32 v0, s0 5904; GFX6-NEXT: s_addc_u32 s9, s1, s9 5905; GFX6-NEXT: v_mov_b32_e32 v1, s1 5906; GFX6-NEXT: s_addc_u32 s16, s2, s10 5907; GFX6-NEXT: v_mov_b32_e32 v2, s2 5908; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] 5909; GFX6-NEXT: s_addc_u32 s17, s3, s11 5910; GFX6-NEXT: v_mov_b32_e32 v3, s3 5911; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5912; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[2:3] 5913; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 5914; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5915; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[2:3] 5916; GFX6-NEXT: v_mov_b32_e32 v2, s8 5917; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5918; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5919; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], 0 5920; GFX6-NEXT: v_mov_b32_e32 v3, s9 5921; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] 5922; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 5923; GFX6-NEXT: s_ashr_i32 s0, s17, 31 5924; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 5925; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 5926; GFX6-NEXT: v_mov_b32_e32 v1, s0 5927; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5928; GFX6-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc 5929; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc 5930; GFX6-NEXT: v_mov_b32_e32 v0, s1 5931; GFX6-NEXT: v_mov_b32_e32 v2, s16 5932; GFX6-NEXT: v_mov_b32_e32 v3, s17 5933; GFX6-NEXT: 
v_cndmask_b32_e32 v6, v2, v1, vcc 5934; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc 5935; GFX6-NEXT: s_add_u32 s0, s4, s12 5936; GFX6-NEXT: v_mov_b32_e32 v0, s4 5937; GFX6-NEXT: s_addc_u32 s1, s5, s13 5938; GFX6-NEXT: v_mov_b32_e32 v1, s5 5939; GFX6-NEXT: s_addc_u32 s2, s6, s14 5940; GFX6-NEXT: v_mov_b32_e32 v2, s6 5941; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] 5942; GFX6-NEXT: s_addc_u32 s3, s7, s15 5943; GFX6-NEXT: v_mov_b32_e32 v3, s7 5944; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5945; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] 5946; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0 5947; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5948; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] 5949; GFX6-NEXT: v_mov_b32_e32 v2, s0 5950; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5951; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] 5952; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[14:15], 0 5953; GFX6-NEXT: v_mov_b32_e32 v3, s1 5954; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] 5955; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 5956; GFX6-NEXT: s_ashr_i32 s4, s3, 31 5957; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 5958; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 5959; GFX6-NEXT: v_mov_b32_e32 v1, s4 5960; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5961; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 5962; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc 5963; GFX6-NEXT: v_mov_b32_e32 v3, s5 5964; GFX6-NEXT: v_mov_b32_e32 v8, s2 5965; GFX6-NEXT: v_mov_b32_e32 v9, s3 5966; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc 5967; GFX6-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc 5968; GFX6-NEXT: v_readfirstlane_b32 s0, v4 5969; GFX6-NEXT: v_readfirstlane_b32 s1, v5 5970; GFX6-NEXT: v_readfirstlane_b32 s2, v6 5971; GFX6-NEXT: v_readfirstlane_b32 s3, v7 5972; GFX6-NEXT: v_readfirstlane_b32 s4, v0 5973; GFX6-NEXT: v_readfirstlane_b32 s5, v2 5974; GFX6-NEXT: v_readfirstlane_b32 s6, v1 5975; GFX6-NEXT: v_readfirstlane_b32 s7, v3 5976; GFX6-NEXT: ; return to shader part epilog 5977; 5978; GFX8-LABEL: 
s_saddsat_v2i128: 5979; GFX8: ; %bb.0: 5980; GFX8-NEXT: s_add_u32 s8, s0, s8 5981; GFX8-NEXT: s_addc_u32 s9, s1, s9 5982; GFX8-NEXT: v_mov_b32_e32 v0, s0 5983; GFX8-NEXT: s_addc_u32 s16, s2, s10 5984; GFX8-NEXT: v_mov_b32_e32 v1, s1 5985; GFX8-NEXT: s_addc_u32 s17, s3, s11 5986; GFX8-NEXT: v_mov_b32_e32 v2, s2 5987; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] 5988; GFX8-NEXT: v_mov_b32_e32 v3, s3 5989; GFX8-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] 5990; GFX8-NEXT: s_cselect_b32 s0, 1, 0 5991; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5992; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[2:3] 5993; GFX8-NEXT: s_and_b32 s0, 1, s0 5994; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5995; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 5996; GFX8-NEXT: s_cmp_eq_u64 s[10:11], 0 5997; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 5998; GFX8-NEXT: s_cselect_b32 s2, 1, 0 5999; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 6000; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 6001; GFX8-NEXT: s_and_b32 s0, 1, s2 6002; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 6003; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] 6004; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 6005; GFX8-NEXT: s_ashr_i32 s0, s17, 31 6006; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 6007; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 6008; GFX8-NEXT: v_mov_b32_e32 v1, s0 6009; GFX8-NEXT: v_mov_b32_e32 v2, s8 6010; GFX8-NEXT: v_mov_b32_e32 v3, s9 6011; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 6012; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc 6013; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc 6014; GFX8-NEXT: v_mov_b32_e32 v0, s1 6015; GFX8-NEXT: v_mov_b32_e32 v2, s16 6016; GFX8-NEXT: v_mov_b32_e32 v3, s17 6017; GFX8-NEXT: s_add_u32 s0, s4, s12 6018; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc 6019; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc 6020; GFX8-NEXT: s_addc_u32 s1, s5, s13 6021; GFX8-NEXT: v_mov_b32_e32 v0, s4 6022; GFX8-NEXT: s_addc_u32 s2, s6, s14 6023; GFX8-NEXT: v_mov_b32_e32 v1, s5 6024; GFX8-NEXT: s_addc_u32 s3, s7, s15 6025; 
GFX8-NEXT: v_mov_b32_e32 v2, s6 6026; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] 6027; GFX8-NEXT: v_mov_b32_e32 v3, s7 6028; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] 6029; GFX8-NEXT: s_cselect_b32 s4, 1, 0 6030; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 6031; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] 6032; GFX8-NEXT: s_and_b32 s4, 1, s4 6033; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 6034; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 6035; GFX8-NEXT: s_cmp_eq_u64 s[14:15], 0 6036; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0 6037; GFX8-NEXT: s_cselect_b32 s6, 1, 0 6038; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 6039; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] 6040; GFX8-NEXT: s_and_b32 s4, 1, s6 6041; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 6042; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] 6043; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 6044; GFX8-NEXT: s_ashr_i32 s4, s3, 31 6045; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 6046; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 6047; GFX8-NEXT: v_mov_b32_e32 v1, s4 6048; GFX8-NEXT: v_mov_b32_e32 v2, s0 6049; GFX8-NEXT: v_mov_b32_e32 v3, s1 6050; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 6051; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 6052; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc 6053; GFX8-NEXT: v_mov_b32_e32 v3, s5 6054; GFX8-NEXT: v_mov_b32_e32 v8, s2 6055; GFX8-NEXT: v_mov_b32_e32 v9, s3 6056; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc 6057; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc 6058; GFX8-NEXT: v_readfirstlane_b32 s0, v4 6059; GFX8-NEXT: v_readfirstlane_b32 s1, v5 6060; GFX8-NEXT: v_readfirstlane_b32 s2, v6 6061; GFX8-NEXT: v_readfirstlane_b32 s3, v7 6062; GFX8-NEXT: v_readfirstlane_b32 s4, v0 6063; GFX8-NEXT: v_readfirstlane_b32 s5, v2 6064; GFX8-NEXT: v_readfirstlane_b32 s6, v1 6065; GFX8-NEXT: v_readfirstlane_b32 s7, v3 6066; GFX8-NEXT: ; return to shader part epilog 6067; 6068; GFX9-LABEL: s_saddsat_v2i128: 6069; GFX9: ; %bb.0: 6070; GFX9-NEXT: s_add_u32 s8, s0, s8 6071; GFX9-NEXT: s_addc_u32 s9, 
s1, s9 6072; GFX9-NEXT: v_mov_b32_e32 v0, s0 6073; GFX9-NEXT: s_addc_u32 s16, s2, s10 6074; GFX9-NEXT: v_mov_b32_e32 v1, s1 6075; GFX9-NEXT: s_addc_u32 s17, s3, s11 6076; GFX9-NEXT: v_mov_b32_e32 v2, s2 6077; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] 6078; GFX9-NEXT: v_mov_b32_e32 v3, s3 6079; GFX9-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] 6080; GFX9-NEXT: s_cselect_b32 s0, 1, 0 6081; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 6082; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[2:3] 6083; GFX9-NEXT: s_and_b32 s0, 1, s0 6084; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 6085; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 6086; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0 6087; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 6088; GFX9-NEXT: s_cselect_b32 s2, 1, 0 6089; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 6090; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 6091; GFX9-NEXT: s_and_b32 s0, 1, s2 6092; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 6093; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] 6094; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 6095; GFX9-NEXT: s_ashr_i32 s0, s17, 31 6096; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 6097; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 6098; GFX9-NEXT: v_mov_b32_e32 v1, s0 6099; GFX9-NEXT: v_mov_b32_e32 v2, s8 6100; GFX9-NEXT: v_mov_b32_e32 v3, s9 6101; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 6102; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc 6103; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc 6104; GFX9-NEXT: v_mov_b32_e32 v0, s1 6105; GFX9-NEXT: v_mov_b32_e32 v2, s16 6106; GFX9-NEXT: v_mov_b32_e32 v3, s17 6107; GFX9-NEXT: s_add_u32 s0, s4, s12 6108; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc 6109; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc 6110; GFX9-NEXT: s_addc_u32 s1, s5, s13 6111; GFX9-NEXT: v_mov_b32_e32 v0, s4 6112; GFX9-NEXT: s_addc_u32 s2, s6, s14 6113; GFX9-NEXT: v_mov_b32_e32 v1, s5 6114; GFX9-NEXT: s_addc_u32 s3, s7, s15 6115; GFX9-NEXT: v_mov_b32_e32 v2, s6 6116; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] 6117; GFX9-NEXT: 
v_mov_b32_e32 v3, s7 6118; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] 6119; GFX9-NEXT: s_cselect_b32 s4, 1, 0 6120; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 6121; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] 6122; GFX9-NEXT: s_and_b32 s4, 1, s4 6123; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 6124; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 6125; GFX9-NEXT: s_cmp_eq_u64 s[14:15], 0 6126; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0 6127; GFX9-NEXT: s_cselect_b32 s6, 1, 0 6128; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 6129; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] 6130; GFX9-NEXT: s_and_b32 s4, 1, s6 6131; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 6132; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] 6133; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 6134; GFX9-NEXT: s_ashr_i32 s4, s3, 31 6135; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 6136; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 6137; GFX9-NEXT: v_mov_b32_e32 v1, s4 6138; GFX9-NEXT: v_mov_b32_e32 v2, s0 6139; GFX9-NEXT: v_mov_b32_e32 v3, s1 6140; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 6141; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 6142; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc 6143; GFX9-NEXT: v_mov_b32_e32 v3, s5 6144; GFX9-NEXT: v_mov_b32_e32 v8, s2 6145; GFX9-NEXT: v_mov_b32_e32 v9, s3 6146; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc 6147; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc 6148; GFX9-NEXT: v_readfirstlane_b32 s0, v4 6149; GFX9-NEXT: v_readfirstlane_b32 s1, v5 6150; GFX9-NEXT: v_readfirstlane_b32 s2, v6 6151; GFX9-NEXT: v_readfirstlane_b32 s3, v7 6152; GFX9-NEXT: v_readfirstlane_b32 s4, v0 6153; GFX9-NEXT: v_readfirstlane_b32 s5, v2 6154; GFX9-NEXT: v_readfirstlane_b32 s6, v1 6155; GFX9-NEXT: v_readfirstlane_b32 s7, v3 6156; GFX9-NEXT: ; return to shader part epilog 6157; 6158; GFX10-LABEL: s_saddsat_v2i128: 6159; GFX10: ; %bb.0: 6160; GFX10-NEXT: s_add_u32 s8, s0, s8 6161; GFX10-NEXT: s_addc_u32 s9, s1, s9 6162; GFX10-NEXT: s_addc_u32 s16, s2, s10 6163; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], 
s[0:1] 6164; GFX10-NEXT: s_addc_u32 s17, s3, s11 6165; GFX10-NEXT: v_mov_b32_e32 v4, s9 6166; GFX10-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] 6167; GFX10-NEXT: s_cselect_b32 s18, 1, 0 6168; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 6169; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] 6170; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0 6171; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 6172; GFX10-NEXT: s_and_b32 s0, 1, s18 6173; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 6174; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 6175; GFX10-NEXT: s_cselect_b32 s1, 1, 0 6176; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 6177; GFX10-NEXT: s_and_b32 s1, 1, s1 6178; GFX10-NEXT: s_ashr_i32 s10, s17, 31 6179; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 6180; GFX10-NEXT: s_add_i32 s11, s10, 0x80000000 6181; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 6182; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 6183; GFX10-NEXT: s_add_u32 s0, s4, s12 6184; GFX10-NEXT: s_addc_u32 s1, s5, s13 6185; GFX10-NEXT: s_addc_u32 s2, s6, s14 6186; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] 6187; GFX10-NEXT: s_addc_u32 s3, s7, s15 6188; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 6189; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] 6190; GFX10-NEXT: v_mov_b32_e32 v5, s0 6191; GFX10-NEXT: s_cselect_b32 s12, 1, 0 6192; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 6193; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7] 6194; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[14:15], 0 6195; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 6196; GFX10-NEXT: v_mov_b32_e32 v6, s1 6197; GFX10-NEXT: v_mov_b32_e32 v7, s3 6198; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 6199; GFX10-NEXT: s_and_b32 s4, 1, s12 6200; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 6201; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 6202; GFX10-NEXT: s_cselect_b32 s5, 1, 0 6203; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 6204; GFX10-NEXT: s_and_b32 s5, 1, s5 6205; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 6206; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo 6207; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, 
v0 6208; GFX10-NEXT: v_mov_b32_e32 v0, s16 6209; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, 0, s4 6210; GFX10-NEXT: v_mov_b32_e32 v3, s8 6211; GFX10-NEXT: s_ashr_i32 s4, s3, 31 6212; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo 6213; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s10, vcc_lo 6214; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 6215; GFX10-NEXT: v_mov_b32_e32 v2, s17 6216; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s10, vcc_lo 6217; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000 6218; GFX10-NEXT: v_readfirstlane_b32 s1, v4 6219; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 6220; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo 6221; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 6222; GFX10-NEXT: v_mov_b32_e32 v1, s2 6223; GFX10-NEXT: v_readfirstlane_b32 s2, v0 6224; GFX10-NEXT: v_readfirstlane_b32 s3, v2 6225; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s4, vcc_lo 6226; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s4, vcc_lo 6227; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo 6228; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo 6229; GFX10-NEXT: v_readfirstlane_b32 s0, v3 6230; GFX10-NEXT: v_readfirstlane_b32 s4, v5 6231; GFX10-NEXT: v_readfirstlane_b32 s5, v6 6232; GFX10-NEXT: v_readfirstlane_b32 s6, v1 6233; GFX10-NEXT: v_readfirstlane_b32 s7, v7 6234; GFX10-NEXT: ; return to shader part epilog 6235; 6236; GFX11-LABEL: s_saddsat_v2i128: 6237; GFX11: ; %bb.0: 6238; GFX11-NEXT: s_add_u32 s8, s0, s8 6239; GFX11-NEXT: s_addc_u32 s9, s1, s9 6240; GFX11-NEXT: s_addc_u32 s16, s2, s10 6241; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] 6242; GFX11-NEXT: s_addc_u32 s17, s3, s11 6243; GFX11-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] 6244; GFX11-NEXT: s_cselect_b32 s18, 1, 0 6245; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 6246; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] 6247; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0 6248; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 6249; GFX11-NEXT: s_and_b32 s0, 1, s18 6250; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 6251; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 6252; 
GFX11-NEXT: s_cselect_b32 s1, 1, 0 6253; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 6254; GFX11-NEXT: s_and_b32 s1, 1, s1 6255; GFX11-NEXT: s_ashr_i32 s10, s17, 31 6256; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 6257; GFX11-NEXT: s_add_i32 s11, s10, 0x80000000 6258; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 6259; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 6260; GFX11-NEXT: s_add_u32 s0, s4, s12 6261; GFX11-NEXT: s_addc_u32 s1, s5, s13 6262; GFX11-NEXT: s_addc_u32 s2, s6, s14 6263; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] 6264; GFX11-NEXT: s_addc_u32 s3, s7, s15 6265; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 6266; GFX11-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] 6267; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s3 6268; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 6269; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7] 6270; GFX11-NEXT: s_cselect_b32 s12, 1, 0 6271; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[14:15], 0 6272; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_and_b32 v0, 1, v0 6273; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 6274; GFX11-NEXT: s_and_b32 s4, 1, s12 6275; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0 6276; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 6277; GFX11-NEXT: s_cselect_b32 s5, 1, 0 6278; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 6279; GFX11-NEXT: s_and_b32 s5, 1, s5 6280; GFX11-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 6281; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo 6282; GFX11-NEXT: v_cndmask_b32_e64 v2, v3, 0, s4 6283; GFX11-NEXT: v_mov_b32_e32 v3, s8 6284; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 6285; GFX11-NEXT: v_mov_b32_e32 v0, s16 6286; GFX11-NEXT: s_ashr_i32 s4, s3, 31 6287; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 6288; GFX11-NEXT: v_mov_b32_e32 v4, s9 6289; GFX11-NEXT: v_mov_b32_e32 v2, s17 6290; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s10, vcc_lo 6291; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s10, vcc_lo 6292; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 6293; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo 6294; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 
s11, vcc_lo 6295; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 6296; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 6297; GFX11-NEXT: v_mov_b32_e32 v1, s2 6298; GFX11-NEXT: v_readfirstlane_b32 s1, v4 6299; GFX11-NEXT: v_readfirstlane_b32 s2, v0 6300; GFX11-NEXT: v_readfirstlane_b32 s3, v2 6301; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s4, vcc_lo 6302; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s4, vcc_lo 6303; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo 6304; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo 6305; GFX11-NEXT: v_readfirstlane_b32 s0, v3 6306; GFX11-NEXT: v_readfirstlane_b32 s4, v5 6307; GFX11-NEXT: v_readfirstlane_b32 s5, v6 6308; GFX11-NEXT: v_readfirstlane_b32 s6, v1 6309; GFX11-NEXT: v_readfirstlane_b32 s7, v7 6310; GFX11-NEXT: ; return to shader part epilog 6311 %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) 6312 ret <2 x i128> %result 6313} 6314 6315declare i7 @llvm.sadd.sat.i7(i7, i7) #0 6316declare i8 @llvm.sadd.sat.i8(i8, i8) #0 6317declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>) #0 6318declare <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8>, <4 x i8>) #0 6319 6320declare i16 @llvm.sadd.sat.i16(i16, i16) #0 6321declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>) #0 6322declare <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16>, <3 x i16>) #0 6323declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) #0 6324declare <5 x i16> @llvm.sadd.sat.v5i16(<5 x i16>, <5 x i16>) #0 6325declare <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16>, <6 x i16>) #0 6326declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) #0 6327 6328declare i24 @llvm.sadd.sat.i24(i24, i24) #0 6329 6330declare i32 @llvm.sadd.sat.i32(i32, i32) #0 6331declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) #0 6332declare <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32>, <3 x i32>) #0 6333declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) #0 6334declare <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32>, <5 x i32>) #0 6335declare <16 x i32> 
@llvm.sadd.sat.v16i32(<16 x i32>, <16 x i32>) #0 6336 6337declare i48 @llvm.sadd.sat.i48(i48, i48) #0 6338 6339declare i64 @llvm.sadd.sat.i64(i64, i64) #0 6340declare <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64>, <2 x i64>) #0 6341 6342declare i128 @llvm.sadd.sat.i128(i128, i128) #0 6343declare <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128>, <2 x i128>) #0 6344 6345attributes #0 = { nounwind readnone speculatable willreturn } 6346