/* SPDX-License-Identifier: MIT */
/*
 * Copyright 2024 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

// This shader is to clean LDS, SGPRs and VGPRs. It is the first 64 Dwords (256 bytes) of the 192-Dword cleaner shader.
// To turn this shader program on for compilation change this to main and lower shader main to main_1

// MI300 : Clear SGPRs, VGPRs and LDS
//   Uses two kernels launched separately:
//   1. Clean VGPRs, LDS, and lower SGPRs
//        Launches one workgroup per CU, each workgroup with 4x wave64 per SIMD in the CU
//        Waves are "wave64" and have 128 VGPRs each, which uses all 512 VGPRs per SIMD
//        Waves in the workgroup share the 64KB of LDS
//        Each wave clears SGPRs 0 - 95. Because there are 4 waves/SIMD, this is physical SGPRs 0-383
//        Each wave clears 128 VGPRs, so all 512 in the SIMD
//        The first wave of the workgroup clears its 64KB of LDS
//        The shader starts with "S_BARRIER" to ensure SPI has launched all waves of the workgroup
//          before any wave in the workgroup could end.  Without this, it is possible not all SGPRs get cleared.
//    2. Clean remaining SGPRs
//        Launches a workgroup with 24 waves per workgroup, yielding 6 waves per SIMD in each CU
//        Waves are allocating 96 SGPRs
//          CP sets up SPI_RESOURCE_RESERVE_* registers to prevent these waves from allocating SGPRs 0-223.
//          As such, these 6 waves per SIMD are allocated physical SGPRs 224-799
//        Barriers do not work for >16 waves per workgroup, so we cannot start with S_BARRIER
//          Instead, the shader starts with an S_SETHALT 1. Once all waves are launched CP will send unhalt command
//        The shader then clears all SGPRs allocated to it, cleaning out physical SGPRs 224-799

shader main
  asic(MI300)
  type(CS)
  wave_size(64)
// Note: original source code from SQ team

// (theoretical fastest = ~512clks vgpr + 1536 lds + ~128 sgpr = 2176 clks)

  //
  // KERNEL SELECT
  //   s0 == 1 : kernel 1 (VGPRs + LDS + SGPRs), falls through
  //   s0 != 1 : kernel 2 (SGPRs only), branches to label_0023
  //
  s_cmp_eq_u32      s0, 1                           // scc = (s0 == 1); per the original note FW sets COMPUTE_USER_DATA_3 for kernel 1
  s_cbranch_scc0    label_0023                      // scc == 0 -> take the SGPR-only path
  S_BARRIER                                         // wait until every wave of the workgroup has been launched

  s_movk_i32        m0, 0x0000
  s_mov_b32         s2, 0x00000078                  // Index base 120; loop 128/8=16 times (loop unrolled for performance)
  //
  // CLEAR VGPRs
  //   s2 is the VGPR index base: 120, 112, ..., 0. With dest indexing on,
  //   "v0..v7" below address v[s2+0]..v[s2+7].
  //
  s_set_gpr_idx_on  s2, 0x8                         // enable Dest VGPR indexing
label_0005:
  v_mov_b32         v0, 0
  v_mov_b32         v1, 0
  v_mov_b32         v2, 0
  v_mov_b32         v3, 0
  v_mov_b32         v4, 0
  v_mov_b32         v5, 0
  v_mov_b32         v6, 0
  v_mov_b32         v7, 0
  s_sub_u32         s2, s2, 8                       // scc = unsigned borrow, i.e. set once the base-0 pass is done
  s_set_gpr_idx_idx s2                              // load the next index base
  s_cbranch_scc0    label_0005
  s_set_gpr_idx_off

  //
  // FIRST-WAVE CHECK
  //   Only the first wave of the workgroup clears LDS; all others skip to the SGPR clear.
  //
  s_mov_b32         s2, 0x80000000                  // Bit31 is first_wave
  s_and_b32         s2, s2, s1                      // s1 has tg_size (first_wave) term as in ucode only COMPUTE_PGM_RSRC2.tg_size_en is set
  s_cbranch_scc0    label_clean_sgpr_1              // Result zero (not first wave) -> skip the LDS clear
  //
  // CLEAR LDS
  //
  s_mov_b32         exec_lo, 0xffffffff
  s_mov_b32         exec_hi, 0xffffffff
  v_mbcnt_lo_u32_b32 v1, exec_hi, 0                 // Set V1 to thread-ID (0..63)
  v_mbcnt_hi_u32_b32 v1, exec_lo, v1                // Set V1 to thread-ID (0..63)
  v_mul_u32_u24     v1, 0x00000008, v1              // * 8, so each thread is a double-dword address (8byte)
  s_mov_b32         s2, 0x0000003f                  // 64 loop iterations (63 down to 0)
  s_mov_b32         m0, 0xffffffff
  // Clear all of LDS space
  // Each FirstWave of WorkGroup clears 64kbyte block
  // v[2:3] and v[4:5] are the zero source data: they were cleared by the VGPR loop above.

label_001F:
  ds_write2_b64     v1, v[2:3], v[2:3] offset1:32
  ds_write2_b64     v1, v[4:5], v[4:5] offset0:64 offset1:96
  v_add_co_u32      v1, vcc, 0x00000400, v1         // advance every thread's address by 1KB
  s_sub_u32         s2, s2, 1                       // scc = unsigned borrow -> set after the 64th pass
  s_cbranch_scc0    label_001F
  //
  // CLEAR SGPRs
  //   m0 is the SGPR index base: 92, 88, ..., 0. s_movreld_b32 sN writes s[N + m0],
  //   so the four moves per pass cover s[m0+0]..s[m0+3], i.e. s0..s95 overall.
  //
label_clean_sgpr_1:
  s_mov_b32         m0, 0x0000005c                  // Loop 96/4=24 times (loop unrolled for performance)
  s_nop 0
label_sgpr_loop:
  s_movreld_b32     s0, 0
  s_movreld_b32     s1, 0
  s_movreld_b32     s2, 0
  s_movreld_b32     s3, 0
  s_sub_u32         m0, m0, 4                       // scc = unsigned borrow -> set once the base-0 pass is done
  s_cbranch_scc0    label_sgpr_loop

  // clear vcc, flat scratch, ttmp
  s_mov_b32         flat_scratch_lo, 0              // clear flat scratch lo SGPR
  s_mov_b32         flat_scratch_hi, 0              // clear flat scratch hi SGPR
  s_mov_b64         vcc, 0                          // clear vcc
  s_mov_b64         ttmp0, 0                        // Clear ttmp0 and ttmp1
  s_mov_b64         ttmp2, 0                        // Clear ttmp2 and ttmp3
  s_mov_b64         ttmp4, 0                        // Clear ttmp4 and ttmp5
  s_mov_b64         ttmp6, 0                        // Clear ttmp6 and ttmp7
  s_mov_b64         ttmp8, 0                        // Clear ttmp8 and ttmp9
  s_mov_b64         ttmp10, 0                       // Clear ttmp10 and ttmp11
  s_mov_b64         ttmp12, 0                       // Clear ttmp12 and ttmp13
  s_mov_b64         ttmp14, 0                       // Clear ttmp14 and ttmp15
s_endpgm

  //
  // KERNEL 2: CLEAR SGPRs ONLY
  //   Entered when s0 != 1. The wave halts itself first; per the header notes CP
  //   un-halts it once all waves have been launched.
  //
label_0023:

  s_sethalt 1

  s_mov_b32         m0, 0x0000005c                  // Loop 96/4=24 times (loop unrolled for performance)
  s_nop 0
label_sgpr_loop1:

  s_movreld_b32     s0, 0
  s_movreld_b32     s1, 0
  s_movreld_b32     s2, 0
  s_movreld_b32     s3, 0
  s_sub_u32         m0, m0, 4                       // scc = unsigned borrow -> set once the base-0 pass is done
  s_cbranch_scc0    label_sgpr_loop1

  // clear vcc, flat scratch
  // NOTE(review): unlike the kernel 1 path, ttmp0-ttmp15 are not cleared here - confirm this is intended.
  s_mov_b32         flat_scratch_lo, 0              // clear flat scratch lo SGPR
  s_mov_b32         flat_scratch_hi, 0              // clear flat scratch hi SGPR
  // This used to write 0xee, which contradicted the "clear vcc" intent and the
  // kernel 1 path above, and left a non-zero value behind in VCC.
  // NOTE(review): if 0xee was a deliberate marker, restore it and fix the comment
  // instead; any precompiled copy of this shader must be regenerated either way.
  s_mov_b64         vcc, 0                          // clear vcc

s_endpgm
end