// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2020-2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include <linux/types.h>
#include <linux/sched/task.h>
#include <linux/dynamic_debug.h>
#include <drm/ttm/ttm_tt.h>
#include <drm/drm_exec.h>

#include "amdgpu_sync.h"
#include "amdgpu_object.h"
#include "amdgpu_vm.h"
#include "amdgpu_hmm.h"
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
#include "kfd_priv.h"
#include "kfd_svm.h"
#include "kfd_migrate.h"
#include "kfd_smi_events.h"

#ifdef dev_fmt
#undef dev_fmt
#endif
#define dev_fmt(fmt) "kfd_svm: %s: " fmt, __func__

#define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1

/* Long enough to ensure no retry fault comes after svm range is restored and
 * page table is updated.
 */
#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING	(2UL * NSEC_PER_MSEC)
#if IS_ENABLED(CONFIG_DYNAMIC_DEBUG)
#define dynamic_svm_range_dump(svms) \
	_dynamic_func_call_no_desc("svm_range_dump", svm_range_debug_dump, svms)
#else
#define dynamic_svm_range_dump(svms) \
	do { if (0) svm_range_debug_dump(svms); } while (0)
#endif

/* A giant svm range is split into smaller ranges based on this value; it is
 * decided using the minimum of all dGPU/APU 1/32 VRAM sizes, between 2MB and
 * 1GB, and aligned to a power of 2MB.
 */
static uint64_t max_svm_range_pages;

struct criu_svm_metadata {
	struct list_head list;
	struct kfd_criu_svm_range_priv_data data;
};

static void svm_range_evict_svm_bo_worker(struct work_struct *work);
static bool
svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
				    const struct mmu_notifier_range *range,
				    unsigned long cur_seq);
static int
svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
		   uint64_t *bo_s, uint64_t *bo_l);
static const struct mmu_interval_notifier_ops svm_range_mn_ops = {
	.invalidate = svm_range_cpu_invalidate_pagetables,
};

/**
 * svm_range_unlink - unlink svm_range from lists and interval tree
 * @prange: svm range structure to be removed
 *
 * Remove the svm_range from the svms and svm_bo lists and the svms
 * interval tree.
 *
 * Context: The caller must hold svms->lock
 */
static void svm_range_unlink(struct svm_range *prange)
{
	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
		 prange, prange->start, prange->last);

	if (prange->svm_bo) {
		spin_lock(&prange->svm_bo->list_lock);
		list_del(&prange->svm_bo_list);
		spin_unlock(&prange->svm_bo->list_lock);
	}

	list_del(&prange->list);
	if (prange->it_node.start != 0 && prange->it_node.last != 0)
		interval_tree_remove(&prange->it_node, &prange->svms->objects);
}

static void
svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange)
{
	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
		 prange, prange->start, prange->last);

	mmu_interval_notifier_insert_locked(&prange->notifier, mm,
				     prange->start << PAGE_SHIFT,
				     prange->npages << PAGE_SHIFT,
				     &svm_range_mn_ops);
}

/**
 * svm_range_add_to_svms - add svm range to svms
 * @prange: svm range structure to be added
 *
 * Add the svm range to svms interval tree and link list
 *
 * Context: The caller must hold svms->lock
 */
static void svm_range_add_to_svms(struct svm_range *prange)
{
	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
		 prange, prange->start, prange->last);

	list_move_tail(&prange->list, &prange->svms->list);
	prange->it_node.start = prange->start;
	prange->it_node.last = prange->last;
	interval_tree_insert(&prange->it_node, &prange->svms->objects);
}

static void svm_range_remove_notifier(struct svm_range *prange)
{
	pr_debug("remove notifier svms 0x%p prange 0x%p [0x%lx 0x%lx]\n",
		 prange->svms, prange,
		 prange->notifier.interval_tree.start >> PAGE_SHIFT,
		 prange->notifier.interval_tree.last >> PAGE_SHIFT);

	if (prange->notifier.interval_tree.start != 0 &&
	    prange->notifier.interval_tree.last != 0)
		mmu_interval_notifier_remove(&prange->notifier);
}

static bool
svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr)
{
	return dma_addr && !dma_mapping_error(dev, dma_addr) &&
	       !(dma_addr & SVM_RANGE_VRAM_DOMAIN);
}

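/* DMA-map one GPU's view of a range of system pages.
 *
 * For regular system memory pages the page is dma_map_page()'d for the device
 * and the handle is stored in prange->dma_addr[gpuidx]. For zone device
 * (VRAM) pages the owning device's VRAM address is stored instead, tagged
 * with SVM_RANGE_VRAM_DOMAIN so later mapping code can tell the two cases
 * apart.
 */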
static int
svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange,
		      unsigned long offset, unsigned long npages,
		      unsigned long *hmm_pfns, uint32_t gpuidx)
{
	enum dma_data_direction dir = DMA_BIDIRECTIONAL;
	dma_addr_t *addr = prange->dma_addr[gpuidx];
	struct device *dev = adev->dev;
	struct page *page;
	int i, r;

	if (!addr) {
		addr = kvcalloc(prange->npages, sizeof(*addr), GFP_KERNEL);
		if (!addr)
			return -ENOMEM;
		prange->dma_addr[gpuidx] = addr;
	}

	addr += offset;
	for (i = 0; i < npages; i++) {
		if (svm_is_valid_dma_mapping_addr(dev, addr[i]))
			dma_unmap_page(dev, addr[i], PAGE_SIZE, dir);

		page = hmm_pfn_to_page(hmm_pfns[i]);
		if (is_zone_device_page(page)) {
			struct amdgpu_device *bo_adev = prange->svm_bo->node->adev;

			addr[i] = (hmm_pfns[i] << PAGE_SHIFT) +
				   bo_adev->vm_manager.vram_base_offset -
				   bo_adev->kfd.pgmap.range.start;
			addr[i] |= SVM_RANGE_VRAM_DOMAIN;
			pr_debug_ratelimited("vram address: 0x%llx\n", addr[i]);
			continue;
		}
		addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
		r = dma_mapping_error(dev, addr[i]);
		if (r) {
			dev_err(dev, "failed %d dma_map_page\n", r);
			return r;
		}
		pr_debug_ratelimited("dma mapping 0x%llx for page addr 0x%lx\n",
				     addr[i] >> PAGE_SHIFT, page_to_pfn(page));
	}
	return 0;
}

static int
svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap,
		  unsigned long offset, unsigned long npages,
		  unsigned long *hmm_pfns)
{
	struct kfd_process *p;
	uint32_t gpuidx;
	int r;

	p = container_of(prange->svms, struct kfd_process, svms);

	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
		struct kfd_process_device *pdd;

		pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			return -EINVAL;
		}

		r = svm_range_dma_map_dev(pdd->dev->adev, prange, offset, npages,
					  hmm_pfns, gpuidx);
		if (r)
			break;
	}

	return r;
}

void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
			 unsigned long offset, unsigned long npages)
{
	enum dma_data_direction dir = DMA_BIDIRECTIONAL;
	int i;

	if (!dma_addr)
		return;

	for (i = offset; i < offset + npages; i++) {
		if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i]))
			continue;
		pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> PAGE_SHIFT);
		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
		dma_addr[i] = 0;
	}
}

void svm_range_free_dma_mappings(struct svm_range *prange, bool unmap_dma)
{
	struct kfd_process_device *pdd;
	dma_addr_t *dma_addr;
	struct device *dev;
	struct kfd_process *p;
	uint32_t gpuidx;

	p = container_of(prange->svms, struct kfd_process, svms);

	for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) {
		dma_addr = prange->dma_addr[gpuidx];
		if (!dma_addr)
			continue;

		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			continue;
		}
		dev = &pdd->dev->adev->pdev->dev;
		if (unmap_dma)
			svm_range_dma_unmap(dev, dma_addr, 0, prange->npages);
		kvfree(dma_addr);
		prange->dma_addr[gpuidx] = NULL;
	}
}

static void svm_range_free(struct svm_range *prange, bool do_unmap)
{
	uint64_t size = (prange->last - prange->start + 1) << PAGE_SHIFT;
	struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);

	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange,
		 prange->start, prange->last);

	svm_range_vram_node_free(prange);
	svm_range_free_dma_mappings(prange, do_unmap);

	if (do_unmap && !p->xnack_enabled) {
		pr_debug("unreserve prange 0x%p size: 0x%llx\n", prange, size);
		amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
					KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
	}
	mutex_destroy(&prange->lock);
	mutex_destroy(&prange->migrate_mutex);
	kfree(prange);
}

static void
svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc,
				 uint8_t *granularity, uint32_t *flags)
{
	*location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
	*prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
	*granularity = 9;
	*flags =
		KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT;
}

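/* Allocate and initialize a new svm_range covering [start, last] (in pages).
 * When XNACK is disabled and update_mem_usage is set, the range is charged
 * against the resident system memory limit up front and the allocation fails
 * if that limit would be exceeded. Default attributes are applied and, with
 * XNACK enabled, access is granted on all supported GPUs.
 */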
static struct
svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
			 uint64_t last, bool update_mem_usage)
{
	uint64_t size = last - start + 1;
	struct svm_range *prange;
	struct kfd_process *p;

	prange = kzalloc(sizeof(*prange), GFP_KERNEL);
	if (!prange)
		return NULL;

	p = container_of(svms, struct kfd_process, svms);
	if (!p->xnack_enabled && update_mem_usage &&
	    amdgpu_amdkfd_reserve_mem_limit(NULL, size << PAGE_SHIFT,
					    KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0)) {
		pr_info("SVM mapping failed, exceeds resident system memory limit\n");
		kfree(prange);
		return NULL;
	}
	prange->npages = size;
	prange->svms = svms;
	prange->start = start;
	prange->last = last;
	INIT_LIST_HEAD(&prange->list);
	INIT_LIST_HEAD(&prange->update_list);
	INIT_LIST_HEAD(&prange->svm_bo_list);
	INIT_LIST_HEAD(&prange->deferred_list);
	INIT_LIST_HEAD(&prange->child_list);
	atomic_set(&prange->invalid, 0);
	prange->validate_timestamp = 0;
	mutex_init(&prange->migrate_mutex);
	mutex_init(&prange->lock);

	if (p->xnack_enabled)
		bitmap_copy(prange->bitmap_access, svms->bitmap_supported,
			    MAX_GPU_INSTANCE);

	svm_range_set_default_attributes(&prange->preferred_loc,
					 &prange->prefetch_loc,
					 &prange->granularity, &prange->flags);

	pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last);

	return prange;
}

static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo)
{
	if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref))
		return false;

	return true;
}

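/* Final kref release for a svm_range_bo: detach every prange still pointing
 * at this BO, drop the VRAM usage accounted to the owning process if that
 * process still exists, signal the eviction fence if it has not been signaled
 * already, then release the fence and free the BO.
 */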
static void svm_range_bo_release(struct kref *kref)
{
	struct svm_range_bo *svm_bo;

	svm_bo = container_of(kref, struct svm_range_bo, kref);
	pr_debug("svm_bo 0x%p\n", svm_bo);

	spin_lock(&svm_bo->list_lock);
	while (!list_empty(&svm_bo->range_list)) {
		struct svm_range *prange =
				list_first_entry(&svm_bo->range_list,
						 struct svm_range, svm_bo_list);
		/* list_del_init tells a concurrent svm_range_vram_node_new when
		 * it's safe to reuse the svm_bo pointer and svm_bo_list head.
		 */
		list_del_init(&prange->svm_bo_list);
		spin_unlock(&svm_bo->list_lock);

		pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
			 prange->start, prange->last);
		mutex_lock(&prange->lock);
		prange->svm_bo = NULL;
		mutex_unlock(&prange->lock);

		spin_lock(&svm_bo->list_lock);
	}
	spin_unlock(&svm_bo->list_lock);

	if (mmget_not_zero(svm_bo->eviction_fence->mm)) {
		struct kfd_process_device *pdd;
		struct kfd_process *p;
		struct mm_struct *mm;

		mm = svm_bo->eviction_fence->mm;
		/*
		 * The forked child process takes svm_bo device pages ref, svm_bo could be
		 * released after parent process is gone.
		 */
		p = kfd_lookup_process_by_mm(mm);
		if (p) {
			pdd = kfd_get_process_device_data(svm_bo->node, p);
			if (pdd)
				atomic64_sub(amdgpu_bo_size(svm_bo->bo), &pdd->vram_usage);
			kfd_unref_process(p);
		}
		mmput(mm);
	}

	if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base))
		/* We're not in the eviction worker. Signal the fence. */
		dma_fence_signal(&svm_bo->eviction_fence->base);
	dma_fence_put(&svm_bo->eviction_fence->base);
	amdgpu_bo_unref(&svm_bo->bo);
	kfree(svm_bo);
}

static void svm_range_bo_wq_release(struct work_struct *work)
{
	struct svm_range_bo *svm_bo;

	svm_bo = container_of(work, struct svm_range_bo, release_work);
	svm_range_bo_release(&svm_bo->kref);
}

static void svm_range_bo_release_async(struct kref *kref)
{
	struct svm_range_bo *svm_bo;

	svm_bo = container_of(kref, struct svm_range_bo, kref);
	pr_debug("svm_bo 0x%p\n", svm_bo);
	INIT_WORK(&svm_bo->release_work, svm_range_bo_wq_release);
	schedule_work(&svm_bo->release_work);
}

void svm_range_bo_unref_async(struct svm_range_bo *svm_bo)
{
	kref_put(&svm_bo->kref, svm_range_bo_release_async);
}

static void svm_range_bo_unref(struct svm_range_bo *svm_bo)
{
	if (svm_bo)
		kref_put(&svm_bo->kref, svm_range_bo_release);
}

static bool
svm_range_validate_svm_bo(struct kfd_node *node, struct svm_range *prange)
{
	mutex_lock(&prange->lock);
	if (!prange->svm_bo) {
		mutex_unlock(&prange->lock);
		return false;
	}
	if (prange->ttm_res) {
		/* We still have a reference, all is well */
		mutex_unlock(&prange->lock);
		return true;
	}
	if (svm_bo_ref_unless_zero(prange->svm_bo)) {
		/*
		 * Migrate from GPU to GPU, remove range from source svm_bo->node
		 * range list, and return false to allocate svm_bo from destination
		 * node.
		 */
		if (prange->svm_bo->node != node) {
			mutex_unlock(&prange->lock);

			spin_lock(&prange->svm_bo->list_lock);
			list_del_init(&prange->svm_bo_list);
			spin_unlock(&prange->svm_bo->list_lock);

			svm_range_bo_unref(prange->svm_bo);
			return false;
		}
		if (READ_ONCE(prange->svm_bo->evicting)) {
			struct dma_fence *f;
			struct svm_range_bo *svm_bo;
			/* The BO is getting evicted,
			 * we need to get a new one
			 */
			mutex_unlock(&prange->lock);
			svm_bo = prange->svm_bo;
			f = dma_fence_get(&svm_bo->eviction_fence->base);
			svm_range_bo_unref(prange->svm_bo);
			/* wait for the fence to avoid long spin-loop
			 * at list_empty_careful
			 */
			dma_fence_wait(f, false);
			dma_fence_put(f);
		} else {
			/* The BO was still around and we got
			 * a new reference to it
			 */
			mutex_unlock(&prange->lock);
			pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n",
				 prange->svms, prange->start, prange->last);

			prange->ttm_res = prange->svm_bo->bo->tbo.resource;
			return true;
		}

	} else {
		mutex_unlock(&prange->lock);
	}

	/* We need a new svm_bo. Spin-loop to wait for concurrent
	 * svm_range_bo_release to finish removing this range from
	 * its range list and set prange->svm_bo to null. After this,
	 * it is safe to reuse the svm_bo pointer and svm_bo_list head.
	 */
	while (!list_empty_careful(&prange->svm_bo_list) || prange->svm_bo)
		cond_resched();

	return false;
}

static struct svm_range_bo *svm_range_bo_new(void)
{
	struct svm_range_bo *svm_bo;

	svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL);
	if (!svm_bo)
		return NULL;

	kref_init(&svm_bo->kref);
	INIT_LIST_HEAD(&svm_bo->range_list);
	spin_lock_init(&svm_bo->list_lock);

	return svm_bo;
}

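/* Back prange with a VRAM buffer object on @node.
 *
 * If the range already has a usable svm_bo on this node it is reused (see
 * svm_range_validate_svm_bo). Otherwise a BO of prange->npages pages is
 * created in VRAM, protected by a fresh eviction fence, linked to the range
 * and charged to the process device's VRAM usage. @clear requests zeroed
 * VRAM. Returns 0 on success or a negative errno.
 */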
int
svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange,
			bool clear)
{
	struct kfd_process_device *pdd;
	struct amdgpu_bo_param bp;
	struct svm_range_bo *svm_bo;
	struct amdgpu_bo_user *ubo;
	struct amdgpu_bo *bo;
	struct kfd_process *p;
	struct mm_struct *mm;
	int r;

	p = container_of(prange->svms, struct kfd_process, svms);
	pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms,
		 prange->start, prange->last);

	if (svm_range_validate_svm_bo(node, prange))
		return 0;

	svm_bo = svm_range_bo_new();
	if (!svm_bo) {
		pr_debug("failed to alloc svm bo\n");
		return -ENOMEM;
	}
	mm = get_task_mm(p->lead_thread);
	if (!mm) {
		pr_debug("failed to get mm\n");
		kfree(svm_bo);
		return -ESRCH;
	}
	svm_bo->node = node;
	svm_bo->eviction_fence =
		amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
					   mm,
					   svm_bo);
	mmput(mm);
	INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker);
	svm_bo->evicting = 0;
	memset(&bp, 0, sizeof(bp));
	bp.size = prange->npages * PAGE_SIZE;
	bp.byte_align = PAGE_SIZE;
	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
	bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
	bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0;
	bp.flags |= AMDGPU_GEM_CREATE_DISCARDABLE;
	bp.type = ttm_bo_type_device;
	bp.resv = NULL;
	if (node->xcp)
		bp.xcp_id_plus1 = node->xcp->id + 1;

	r = amdgpu_bo_create_user(node->adev, &bp, &ubo);
	if (r) {
		pr_debug("failed %d to create bo\n", r);
		goto create_bo_failed;
	}
	bo = &ubo->bo;

	pr_debug("alloc bo at offset 0x%lx size 0x%lx on partition %d\n",
		 bo->tbo.resource->start << PAGE_SHIFT, bp.size,
		 bp.xcp_id_plus1 - 1);

	r = amdgpu_bo_reserve(bo, true);
	if (r) {
		pr_debug("failed %d to reserve bo\n", r);
		goto reserve_bo_failed;
	}

	if (clear) {
		r = amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
		if (r) {
			pr_debug("failed %d to sync bo\n", r);
			amdgpu_bo_unreserve(bo);
			goto reserve_bo_failed;
		}
	}

	r = dma_resv_reserve_fences(bo->tbo.base.resv, 1);
	if (r) {
		pr_debug("failed %d to reserve bo\n", r);
		amdgpu_bo_unreserve(bo);
		goto reserve_bo_failed;
	}
	amdgpu_bo_fence(bo, &svm_bo->eviction_fence->base, true);

	amdgpu_bo_unreserve(bo);

	svm_bo->bo = bo;
	prange->svm_bo = svm_bo;
	prange->ttm_res = bo->tbo.resource;
	prange->offset = 0;

	spin_lock(&svm_bo->list_lock);
	list_add(&prange->svm_bo_list, &svm_bo->range_list);
	spin_unlock(&svm_bo->list_lock);

	pdd = svm_range_get_pdd_by_node(prange, node);
	if (pdd)
		atomic64_add(amdgpu_bo_size(bo), &pdd->vram_usage);

	return 0;

reserve_bo_failed:
	amdgpu_bo_unref(&bo);
create_bo_failed:
	dma_fence_put(&svm_bo->eviction_fence->base);
	kfree(svm_bo);
	prange->ttm_res = NULL;

	return r;
}

void svm_range_vram_node_free(struct svm_range *prange)
{
	/* serialize prange->svm_bo unref */
	mutex_lock(&prange->lock);
	/* prange->svm_bo has not been unref */
	if (prange->ttm_res) {
		prange->ttm_res = NULL;
		mutex_unlock(&prange->lock);
		svm_range_bo_unref(prange->svm_bo);
	} else
		mutex_unlock(&prange->lock);
}

struct kfd_node *
svm_range_get_node_by_id(struct svm_range *prange, uint32_t gpu_id)
{
	struct kfd_process *p;
	struct kfd_process_device *pdd;

	p = container_of(prange->svms, struct kfd_process, svms);
	pdd = kfd_process_device_data_by_id(p, gpu_id);
	if (!pdd) {
		pr_debug("failed to get kfd process device by id 0x%x\n", gpu_id);
		return NULL;
	}

	return pdd->dev;
}

struct kfd_process_device *
svm_range_get_pdd_by_node(struct svm_range *prange, struct kfd_node *node)
{
	struct kfd_process *p;

	p = container_of(prange->svms, struct kfd_process, svms);

	return kfd_get_process_device_data(node, p);
}

static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo)
{
	struct ttm_operation_ctx ctx = { false, false };

	amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM);

	return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
}

static int
svm_range_check_attr(struct kfd_process *p,
		     uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
{
	uint32_t i;

	for (i = 0; i < nattr; i++) {
		uint32_t val = attrs[i].value;
		int gpuidx = MAX_GPU_INSTANCE;

		switch (attrs[i].type) {
		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
			if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM &&
			    val != KFD_IOCTL_SVM_LOCATION_UNDEFINED)
				gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
			break;
		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
			if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM)
				gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
			break;
		case KFD_IOCTL_SVM_ATTR_ACCESS:
		case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
		case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
			gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
			break;
		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
			break;
		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
			break;
		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
			break;
		default:
			pr_debug("unknown attr type 0x%x\n", attrs[i].type);
			return -EINVAL;
		}

		if (gpuidx < 0) {
			pr_debug("no GPU 0x%x found\n", val);
			return -EINVAL;
		} else if (gpuidx < MAX_GPU_INSTANCE &&
			   !test_bit(gpuidx, p->svms.bitmap_supported)) {
			pr_debug("GPU 0x%x not supported\n", val);
			return -EINVAL;
		}
	}

	return 0;
}

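/* Apply a validated attribute list to prange. Callers are expected to have
 * run svm_range_check_attr() first; unknown attribute types only trigger a
 * WARN here. *update_mapping is set when a change requires the existing GPU
 * mapping to be updated.
 */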
static void
svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange,
		      uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
		      bool *update_mapping)
{
	uint32_t i;
	int gpuidx;

	for (i = 0; i < nattr; i++) {
		switch (attrs[i].type) {
		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
			prange->preferred_loc = attrs[i].value;
			break;
		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
			prange->prefetch_loc = attrs[i].value;
			break;
		case KFD_IOCTL_SVM_ATTR_ACCESS:
		case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
		case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
			if (!p->xnack_enabled)
				*update_mapping = true;

			gpuidx = kfd_process_gpuidx_from_gpuid(p,
							       attrs[i].value);
			if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) {
				bitmap_clear(prange->bitmap_access, gpuidx, 1);
				bitmap_clear(prange->bitmap_aip, gpuidx, 1);
			} else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) {
				bitmap_set(prange->bitmap_access, gpuidx, 1);
				bitmap_clear(prange->bitmap_aip, gpuidx, 1);
			} else {
				bitmap_clear(prange->bitmap_access, gpuidx, 1);
				bitmap_set(prange->bitmap_aip, gpuidx, 1);
			}
			break;
		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
			*update_mapping = true;
			prange->flags |= attrs[i].value;
			break;
		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
			*update_mapping = true;
			prange->flags &= ~attrs[i].value;
			break;
		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
			prange->granularity = min_t(uint32_t, attrs[i].value, 0x3F);
			break;
		default:
			WARN_ONCE(1, "svm_range_check_attrs wasn't called?");
		}
	}
}

static bool
svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange,
			uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
{
	uint32_t i;
	int gpuidx;

	for (i = 0; i < nattr; i++) {
		switch (attrs[i].type) {
		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
			if (prange->preferred_loc != attrs[i].value)
				return false;
			break;
		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
			/* Prefetch should always trigger a migration even
			 * if the value of the attribute didn't change.
			 */
			return false;
		case KFD_IOCTL_SVM_ATTR_ACCESS:
		case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
		case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
			gpuidx = kfd_process_gpuidx_from_gpuid(p,
							       attrs[i].value);
			if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) {
				if (test_bit(gpuidx, prange->bitmap_access) ||
				    test_bit(gpuidx, prange->bitmap_aip))
					return false;
			} else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) {
				if (!test_bit(gpuidx, prange->bitmap_access))
					return false;
			} else {
				if (!test_bit(gpuidx, prange->bitmap_aip))
					return false;
			}
			break;
		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
			if ((prange->flags & attrs[i].value) != attrs[i].value)
				return false;
			break;
		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
			if ((prange->flags & attrs[i].value) != 0)
				return false;
			break;
		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
			if (prange->granularity != attrs[i].value)
				return false;
			break;
		default:
			WARN_ONCE(1, "svm_range_check_attrs wasn't called?");
		}
	}

	return true;
}

/**
 * svm_range_debug_dump - print all range information from svms
 * @svms: svm range list header
 *
 * debug output svm range start, end, prefetch location from svms
 * interval tree and link list
 *
 * Context: The caller must hold svms->lock
 */
static void svm_range_debug_dump(struct svm_range_list *svms)
{
	struct interval_tree_node *node;
	struct svm_range *prange;

	pr_debug("dump svms 0x%p list\n", svms);
	pr_debug("range\tstart\tpage\tend\t\tlocation\n");

	list_for_each_entry(prange, &svms->list, list) {
		pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
			 prange, prange->start, prange->npages,
			 prange->start + prange->npages - 1,
			 prange->actual_loc);
	}

	pr_debug("dump svms 0x%p interval tree\n", svms);
	pr_debug("range\tstart\tpage\tend\t\tlocation\n");
	node = interval_tree_iter_first(&svms->objects, 0, ~0ULL);
	while (node) {
		prange = container_of(node, struct svm_range, it_node);
		pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
			 prange, prange->start, prange->npages,
			 prange->start + prange->npages - 1,
			 prange->actual_loc);
		node = interval_tree_iter_next(node, 0, ~0ULL);
	}
}

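/* Duplicate num_elements entries of size bytes from psrc, starting at the
 * given byte offset, into a freshly allocated array. Used to clone or split
 * the per-GPU dma_addr arrays. Returns NULL on allocation failure.
 */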
static void *
svm_range_copy_array(void *psrc, size_t size, uint64_t num_elements,
		     uint64_t offset)
{
	unsigned char *dst;

	dst = kvmalloc_array(num_elements, size, GFP_KERNEL);
	if (!dst)
		return NULL;
	memcpy(dst, (unsigned char *)psrc + offset, num_elements * size);

	return (void *)dst;
}

static int
svm_range_copy_dma_addrs(struct svm_range *dst, struct svm_range *src)
{
	int i;

	for (i = 0; i < MAX_GPU_INSTANCE; i++) {
		if (!src->dma_addr[i])
			continue;
		dst->dma_addr[i] = svm_range_copy_array(src->dma_addr[i],
					sizeof(*src->dma_addr[i]), src->npages, 0);
		if (!dst->dma_addr[i])
			return -ENOMEM;
	}

	return 0;
}

static int
svm_range_split_array(void *ppnew, void *ppold, size_t size,
		      uint64_t old_start, uint64_t old_n,
		      uint64_t new_start, uint64_t new_n)
{
	unsigned char *new, *old, *pold;
	uint64_t d;

	if (!ppold)
		return 0;
	pold = *(unsigned char **)ppold;
	if (!pold)
		return 0;

	d = (new_start - old_start) * size;
	new = svm_range_copy_array(pold, size, new_n, d);
	if (!new)
		return -ENOMEM;
	d = (new_start == old_start) ? new_n * size : 0;
	old = svm_range_copy_array(pold, size, old_n, d);
	if (!old) {
		kvfree(new);
		return -ENOMEM;
	}
	kvfree(pold);
	*(void **)ppold = old;
	*(void **)ppnew = new;

	return 0;
}

static int
svm_range_split_pages(struct svm_range *new, struct svm_range *old,
		      uint64_t start, uint64_t last)
{
	uint64_t npages = last - start + 1;
	int i, r;

	for (i = 0; i < MAX_GPU_INSTANCE; i++) {
		r = svm_range_split_array(&new->dma_addr[i], &old->dma_addr[i],
					  sizeof(*old->dma_addr[i]), old->start,
					  npages, new->start, new->npages);
		if (r)
			return r;
	}

	return 0;
}

static int
svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
		      uint64_t start, uint64_t last)
{
	uint64_t npages = last - start + 1;

	pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n",
		 new->svms, new, new->start, start, last);

	if (new->start == old->start) {
		new->offset = old->offset;
		old->offset += new->npages;
	} else {
		new->offset = old->offset + npages;
	}

	new->svm_bo = svm_range_bo_ref(old->svm_bo);
	new->ttm_res = old->ttm_res;

	spin_lock(&new->svm_bo->list_lock);
	list_add(&new->svm_bo_list, &new->svm_bo->range_list);
	spin_unlock(&new->svm_bo->list_lock);

	return 0;
}

/**
 * svm_range_split_adjust - split range and adjust
 *
 * @new: new range
 * @old: the old range
 * @start: the old range adjust to start address in pages
 * @last: the old range adjust to last address in pages
 *
 * Copy system memory dma_addr or vram ttm_res in old range to new
 * range from new_start up to size new->npages, the remaining old range is from
 * start to last
 *
 * Return:
 * 0 - OK, -ENOMEM - out of memory
 */
static int
svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
		       uint64_t start, uint64_t last)
{
	int r;

	pr_debug("svms 0x%p new 0x%lx old [0x%lx 0x%lx] => [0x%llx 0x%llx]\n",
		 new->svms, new->start, old->start, old->last, start, last);

	if (new->start < old->start ||
	    new->last > old->last) {
		WARN_ONCE(1, "invalid new range start or last\n");
		return -EINVAL;
	}

	r = svm_range_split_pages(new, old, start, last);
	if (r)
		return r;

	if (old->actual_loc && old->ttm_res) {
		r = svm_range_split_nodes(new, old, start, last);
		if (r)
			return r;
	}

	old->npages = last - start + 1;
	old->start = start;
	old->last = last;
	new->flags = old->flags;
	new->preferred_loc = old->preferred_loc;
	new->prefetch_loc = old->prefetch_loc;
	new->actual_loc = old->actual_loc;
	new->granularity = old->granularity;
	new->mapped_to_gpu = old->mapped_to_gpu;
	bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
	bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);

	return 0;
}

/**
 * svm_range_split - split a range in 2 ranges
 *
 * @prange: the svm range to split
 * @start: the remaining range start address in pages
 * @last: the remaining range last address in pages
 * @new: the result new range generated
 *
 * Two cases only:
 * case 1: if start == prange->start
 *         prange ==> prange[start, last]
 *         new range [last + 1, prange->last]
 *
 * case 2: if last == prange->last
 *         prange ==> prange[start, last]
 *         new range [prange->start, start - 1]
 *
 * Return:
 * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last
 */
static int
svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last,
		struct svm_range **new)
{
	uint64_t old_start = prange->start;
	uint64_t old_last = prange->last;
	struct svm_range_list *svms;
	int r = 0;

	pr_debug("svms 0x%p [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", prange->svms,
		 old_start, old_last, start, last);

	if (old_start != start && old_last != last)
		return -EINVAL;
	if (start < old_start || last > old_last)
		return -EINVAL;

	svms = prange->svms;
	if (old_start == start)
		*new = svm_range_new(svms, last + 1, old_last, false);
	else
		*new = svm_range_new(svms, old_start, start - 1, false);
	if (!*new)
		return -ENOMEM;

	r = svm_range_split_adjust(*new, prange, start, last);
	if (r) {
		pr_debug("failed %d split [0x%llx 0x%llx] to [0x%llx 0x%llx]\n",
			 r, old_start, old_last, start, last);
		svm_range_free(*new, false);
		*new = NULL;
	}

	return r;
}

static int
svm_range_split_tail(struct svm_range *prange,
		     uint64_t new_last, struct list_head *insert_list)
{
	struct svm_range *tail;
	int r = svm_range_split(prange, prange->start, new_last, &tail);

	if (!r)
		list_add(&tail->list, insert_list);
	return r;
}

static int
svm_range_split_head(struct svm_range *prange,
		     uint64_t new_start, struct list_head *insert_list)
{
	struct svm_range *head;
	int r = svm_range_split(prange, new_start, prange->last, &head);

	if (!r)
		list_add(&head->list, insert_list);
	return r;
}

static void
svm_range_add_child(struct svm_range *prange, struct mm_struct *mm,
		    struct svm_range *pchild, enum svm_work_list_ops op)
{
	pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n",
		 pchild, pchild->start, pchild->last, prange, op);

	pchild->work_item.mm = mm;
	pchild->work_item.op = op;
	list_add_tail(&pchild->child_list, &prange->child_list);
}

/**
 * svm_range_split_by_granularity - collect ranges within granularity boundary
 *
 * @p: the process with svms list
 * @mm: mm structure
 * @addr: the vm fault address in pages, to split the prange
 * @parent: parent range if prange is from child list
 * @prange: prange to split
 *
 * Trims @prange to be a single aligned block of prange->granularity if
 * possible. The head and tail are added to the child_list in @parent.
 *
 * Context: caller must hold mmap_read_lock and prange->lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
int
svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm,
			       unsigned long addr, struct svm_range *parent,
			       struct svm_range *prange)
{
	struct svm_range *head, *tail;
	unsigned long start, last, size;
	int r;

	/* Align the split range start and size to the granularity size, then a
	 * single PTE will be used for the whole range. This reduces the number
	 * of PTEs updated and the L1 TLB space used for translation.
	 */
	size = 1UL << prange->granularity;
	start = ALIGN_DOWN(addr, size);
	last = ALIGN(addr + 1, size) - 1;

	pr_debug("svms 0x%p split [0x%lx 0x%lx] to [0x%lx 0x%lx] size 0x%lx\n",
		 prange->svms, prange->start, prange->last, start, last, size);

	if (start > prange->start) {
		r = svm_range_split(prange, start, prange->last, &head);
		if (r)
			return r;
		svm_range_add_child(parent, mm, head, SVM_OP_ADD_RANGE);
	}

	if (last < prange->last) {
		r = svm_range_split(prange, prange->start, last, &tail);
		if (r)
			return r;
		svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
	}

	/* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */
	if (p->xnack_enabled && prange->work_item.op == SVM_OP_ADD_RANGE) {
		prange->work_item.op = SVM_OP_ADD_RANGE_AND_MAP;
		pr_debug("change prange 0x%p [0x%lx 0x%lx] op %d\n",
			 prange, prange->start, prange->last,
			 SVM_OP_ADD_RANGE_AND_MAP);
	}
	return 0;
}

static bool
svm_nodes_in_same_hive(struct kfd_node *node_a, struct kfd_node *node_b)
{
	return (node_a->adev == node_b->adev ||
		amdgpu_xgmi_same_hive(node_a->adev, node_b->adev));
}

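/* Compute the GPU PTE flags for mapping prange on @node for the given domain
 * (VRAM or system memory). The cache/MTYPE and snoop settings depend on the
 * GC IP version and on where the backing memory lives relative to the mapping
 * GPU: local VRAM, VRAM of another GPU in the same XGMI hive, a PCIe peer, or
 * system memory.
 */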
static uint64_t
svm_range_get_pte_flags(struct kfd_node *node,
			struct svm_range *prange, int domain)
{
	struct kfd_node *bo_node;
	uint32_t flags = prange->flags;
	uint32_t mapping_flags = 0;
	uint64_t pte_flags;
	bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
	bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT;
	bool uncached = false; /*flags & KFD_IOCTL_SVM_FLAG_UNCACHED;*/
	unsigned int mtype_local;

	if (domain == SVM_RANGE_VRAM_DOMAIN)
		bo_node = prange->svm_bo->node;

	switch (node->adev->ip_versions[GC_HWIP][0]) {
	case IP_VERSION(9, 4, 1):
		if (domain == SVM_RANGE_VRAM_DOMAIN) {
			if (bo_node == node) {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
			} else {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
				if (svm_nodes_in_same_hive(node, bo_node))
					snoop = true;
			}
		} else {
			mapping_flags |= coherent ?
				AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
		}
		break;
	case IP_VERSION(9, 4, 2):
		if (domain == SVM_RANGE_VRAM_DOMAIN) {
			if (bo_node == node) {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
				if (node->adev->gmc.xgmi.connected_to_cpu)
					snoop = true;
			} else {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
				if (svm_nodes_in_same_hive(node, bo_node))
					snoop = true;
			}
		} else {
			mapping_flags |= coherent ?
				AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
		}
		break;
	case IP_VERSION(9, 4, 3):
		mtype_local = amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC :
			     (amdgpu_mtype_local == 2 ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW);
		snoop = true;
		if (uncached) {
			mapping_flags |= AMDGPU_VM_MTYPE_UC;
		} else if (domain == SVM_RANGE_VRAM_DOMAIN) {
			/* local HBM region close to partition */
			if (bo_node->adev == node->adev &&
			    (!bo_node->xcp || !node->xcp || bo_node->xcp->mem_id == node->xcp->mem_id))
				mapping_flags |= mtype_local;
			/* local HBM region far from partition or remote XGMI GPU */
			else if (svm_nodes_in_same_hive(bo_node, node))
				mapping_flags |= AMDGPU_VM_MTYPE_NC;
			/* PCIe P2P */
			else
				mapping_flags |= AMDGPU_VM_MTYPE_UC;
		/* system memory accessed by the APU */
		} else if (node->adev->flags & AMD_IS_APU) {
			/* On NUMA systems, locality is determined per-page
			 * in amdgpu_gmc_override_vm_pte_flags
			 */
			if (num_possible_nodes() <= 1)
				mapping_flags |= mtype_local;
			else
				mapping_flags |= AMDGPU_VM_MTYPE_NC;
		/* system memory accessed by the dGPU */
		} else {
			mapping_flags |= AMDGPU_VM_MTYPE_UC;
		}
		break;
	default:
		mapping_flags |= coherent ?
			AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
	}

	mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE;

	if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO)
		mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE;
	if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC)
		mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;

	pte_flags = AMDGPU_PTE_VALID;
	pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM;
	pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0;

	pte_flags |= amdgpu_gem_va_map_flags(node->adev, mapping_flags);
	return pte_flags;
}

static int
svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
			 uint64_t start, uint64_t last,
			 struct dma_fence **fence)
{
	uint64_t init_pte_value = 0;

	pr_debug("[0x%llx 0x%llx]\n", start, last);

	return amdgpu_vm_update_range(adev, vm, false, true, true, NULL, start,
				      last, init_pte_value, 0, 0, NULL, NULL,
				      fence);
}

static int
svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
			  unsigned long last, uint32_t trigger)
{
	DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
	struct kfd_process_device *pdd;
	struct dma_fence *fence = NULL;
	struct kfd_process *p;
	uint32_t gpuidx;
	int r = 0;

	if (!prange->mapped_to_gpu) {
		pr_debug("prange 0x%p [0x%lx 0x%lx] not mapped to GPU\n",
			 prange, prange->start, prange->last);
		return 0;
	}

	if (prange->start == start && prange->last == last) {
		pr_debug("unmap svms 0x%p prange 0x%p\n", prange->svms, prange);
		prange->mapped_to_gpu = false;
	}

	bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
		  MAX_GPU_INSTANCE);
	p = container_of(prange->svms, struct kfd_process, svms);

	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
		pr_debug("unmap from gpu idx 0x%x\n", gpuidx);
		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			return -EINVAL;
		}

		kfd_smi_event_unmap_from_gpu(pdd->dev, p->lead_thread->pid,
					     start, last, trigger);

		r = svm_range_unmap_from_gpu(pdd->dev->adev,
					     drm_priv_to_vm(pdd->drm_priv),
					     start, last, &fence);
		if (r)
			break;

		if (fence) {
			r = dma_fence_wait(fence, false);
			dma_fence_put(fence);
			fence = NULL;
			if (r)
				break;
		}
		kfd_flush_tlb(pdd, TLB_FLUSH_HEAVYWEIGHT);
	}

	return r;
}

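/* Map part of prange into a single GPU's page tables. Consecutive pages that
 * share the same memory domain (VRAM vs. system) are coalesced into one
 * amdgpu_vm_update_range() call. @readonly clears the writeable PTE bit, and
 * the caller may receive the resulting page-table update fence via @fence.
 */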
unsigned long last_start; 13935ca02815Sjsg int last_domain; 13945ca02815Sjsg int r = 0; 13955ca02815Sjsg int64_t i, j; 13965ca02815Sjsg 13975ca02815Sjsg last_start = prange->start + offset; 13985ca02815Sjsg 13995ca02815Sjsg pr_debug("svms 0x%p [0x%lx 0x%lx] readonly %d\n", prange->svms, 14005ca02815Sjsg last_start, last_start + npages - 1, readonly); 14015ca02815Sjsg 14025ca02815Sjsg for (i = offset; i < offset + npages; i++) { 14035ca02815Sjsg last_domain = dma_addr[i] & SVM_RANGE_VRAM_DOMAIN; 14045ca02815Sjsg dma_addr[i] &= ~SVM_RANGE_VRAM_DOMAIN; 14055ca02815Sjsg 14065ca02815Sjsg /* Collect all pages in the same address range and memory domain 14075ca02815Sjsg * that can be mapped with a single call to update mapping. 14085ca02815Sjsg */ 14095ca02815Sjsg if (i < offset + npages - 1 && 14105ca02815Sjsg last_domain == (dma_addr[i + 1] & SVM_RANGE_VRAM_DOMAIN)) 14115ca02815Sjsg continue; 14125ca02815Sjsg 14135ca02815Sjsg pr_debug("Mapping range [0x%lx 0x%llx] on domain: %s\n", 14145ca02815Sjsg last_start, prange->start + i, last_domain ? "GPU" : "CPU"); 14155ca02815Sjsg 1416f005ef32Sjsg pte_flags = svm_range_get_pte_flags(pdd->dev, prange, last_domain); 14175ca02815Sjsg if (readonly) 14185ca02815Sjsg pte_flags &= ~AMDGPU_PTE_WRITEABLE; 14195ca02815Sjsg 14205ca02815Sjsg pr_debug("svms 0x%p map [0x%lx 0x%llx] vram %d PTE 0x%llx\n", 14215ca02815Sjsg prange->svms, last_start, prange->start + i, 14225ca02815Sjsg (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0, 14235ca02815Sjsg pte_flags); 14245ca02815Sjsg 1425f005ef32Sjsg /* For dGPU mode, we use same vm_manager to allocate VRAM for 1426f005ef32Sjsg * different memory partition based on fpfn/lpfn, we should use 1427f005ef32Sjsg * same vm_manager.vram_base_offset regardless memory partition. 1428f005ef32Sjsg */ 14291bb76ff1Sjsg r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, NULL, 14301bb76ff1Sjsg last_start, prange->start + i, 14311bb76ff1Sjsg pte_flags, 14321bb76ff1Sjsg (last_start - prange->start) << PAGE_SHIFT, 14331bb76ff1Sjsg bo_adev ? 
bo_adev->vm_manager.vram_base_offset : 0, 14341bb76ff1Sjsg NULL, dma_addr, &vm->last_update); 14355ca02815Sjsg 14365ca02815Sjsg for (j = last_start - prange->start; j <= i; j++) 14375ca02815Sjsg dma_addr[j] |= last_domain; 14385ca02815Sjsg 14395ca02815Sjsg if (r) { 14405ca02815Sjsg pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start); 14415ca02815Sjsg goto out; 14425ca02815Sjsg } 14435ca02815Sjsg last_start = prange->start + i + 1; 14445ca02815Sjsg } 14455ca02815Sjsg 14465ca02815Sjsg r = amdgpu_vm_update_pdes(adev, vm, false); 14475ca02815Sjsg if (r) { 14485ca02815Sjsg pr_debug("failed %d to update directories 0x%lx\n", r, 14495ca02815Sjsg prange->start); 14505ca02815Sjsg goto out; 14515ca02815Sjsg } 14525ca02815Sjsg 14535ca02815Sjsg if (fence) 14545ca02815Sjsg *fence = dma_fence_get(vm->last_update); 14555ca02815Sjsg 14565ca02815Sjsg out: 14575ca02815Sjsg return r; 14585ca02815Sjsg } 14595ca02815Sjsg 14605ca02815Sjsg static int 14615ca02815Sjsg svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset, 14625ca02815Sjsg unsigned long npages, bool readonly, 14631bb76ff1Sjsg unsigned long *bitmap, bool wait, bool flush_tlb) 14645ca02815Sjsg { 14655ca02815Sjsg struct kfd_process_device *pdd; 1466f005ef32Sjsg struct amdgpu_device *bo_adev = NULL; 14675ca02815Sjsg struct kfd_process *p; 14685ca02815Sjsg struct dma_fence *fence = NULL; 14695ca02815Sjsg uint32_t gpuidx; 14705ca02815Sjsg int r = 0; 14715ca02815Sjsg 14725ca02815Sjsg if (prange->svm_bo && prange->ttm_res) 1473f005ef32Sjsg bo_adev = prange->svm_bo->node->adev; 14745ca02815Sjsg 14755ca02815Sjsg p = container_of(prange->svms, struct kfd_process, svms); 14765ca02815Sjsg for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 14775ca02815Sjsg pr_debug("mapping to gpu idx 0x%x\n", gpuidx); 14785ca02815Sjsg pdd = kfd_process_device_from_gpuidx(p, gpuidx); 14795ca02815Sjsg if (!pdd) { 14805ca02815Sjsg pr_debug("failed to find device idx %d\n", gpuidx); 14815ca02815Sjsg return -EINVAL; 14825ca02815Sjsg } 14835ca02815Sjsg 14845ca02815Sjsg pdd = kfd_bind_process_to_device(pdd->dev, p); 14855ca02815Sjsg if (IS_ERR(pdd)) 14865ca02815Sjsg return -EINVAL; 14875ca02815Sjsg 14881bb76ff1Sjsg if (bo_adev && pdd->dev->adev != bo_adev && 14891bb76ff1Sjsg !amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) { 14905ca02815Sjsg pr_debug("cannot map to device idx %d\n", gpuidx); 14915ca02815Sjsg continue; 14925ca02815Sjsg } 14935ca02815Sjsg 14941bb76ff1Sjsg r = svm_range_map_to_gpu(pdd, prange, offset, npages, readonly, 14955ca02815Sjsg prange->dma_addr[gpuidx], 14961bb76ff1Sjsg bo_adev, wait ? 
&fence : NULL, 14971bb76ff1Sjsg flush_tlb); 14985ca02815Sjsg if (r) 14995ca02815Sjsg break; 15005ca02815Sjsg 15015ca02815Sjsg if (fence) { 15025ca02815Sjsg r = dma_fence_wait(fence, false); 15035ca02815Sjsg dma_fence_put(fence); 15045ca02815Sjsg fence = NULL; 15055ca02815Sjsg if (r) { 15065ca02815Sjsg pr_debug("failed %d to dma fence wait\n", r); 15075ca02815Sjsg break; 15085ca02815Sjsg } 15095ca02815Sjsg } 15101bb76ff1Sjsg 15111bb76ff1Sjsg kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY); 15125ca02815Sjsg } 15135ca02815Sjsg 15145ca02815Sjsg return r; 15155ca02815Sjsg } 15165ca02815Sjsg 15175ca02815Sjsg struct svm_validate_context { 15185ca02815Sjsg struct kfd_process *process; 15195ca02815Sjsg struct svm_range *prange; 15205ca02815Sjsg bool intr; 15211bb76ff1Sjsg DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); 1522f005ef32Sjsg struct drm_exec exec; 15235ca02815Sjsg }; 15245ca02815Sjsg 1525f005ef32Sjsg static int svm_range_reserve_bos(struct svm_validate_context *ctx, bool intr) 15265ca02815Sjsg { 15275ca02815Sjsg struct kfd_process_device *pdd; 15285ca02815Sjsg struct amdgpu_vm *vm; 15295ca02815Sjsg uint32_t gpuidx; 15305ca02815Sjsg int r; 15315ca02815Sjsg 1532f005ef32Sjsg drm_exec_init(&ctx->exec, intr ? DRM_EXEC_INTERRUPTIBLE_WAIT: 0); 1533f005ef32Sjsg drm_exec_until_all_locked(&ctx->exec) { 15345ca02815Sjsg for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) { 15355ca02815Sjsg pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx); 15365ca02815Sjsg if (!pdd) { 15375ca02815Sjsg pr_debug("failed to find device idx %d\n", gpuidx); 1538f005ef32Sjsg r = -EINVAL; 1539f005ef32Sjsg goto unreserve_out; 15405ca02815Sjsg } 15415ca02815Sjsg vm = drm_priv_to_vm(pdd->drm_priv); 15425ca02815Sjsg 1543f005ef32Sjsg r = amdgpu_vm_lock_pd(vm, &ctx->exec, 2); 1544f005ef32Sjsg drm_exec_retry_on_contention(&ctx->exec); 1545f005ef32Sjsg if (unlikely(r)) { 15465ca02815Sjsg pr_debug("failed %d to reserve bo\n", r); 1547f005ef32Sjsg goto unreserve_out; 1548f005ef32Sjsg } 1549f005ef32Sjsg } 15505ca02815Sjsg } 15515ca02815Sjsg 15525ca02815Sjsg for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) { 15535ca02815Sjsg pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx); 15545ca02815Sjsg if (!pdd) { 15555ca02815Sjsg pr_debug("failed to find device idx %d\n", gpuidx); 15565ca02815Sjsg r = -EINVAL; 15575ca02815Sjsg goto unreserve_out; 15585ca02815Sjsg } 15595ca02815Sjsg 15601bb76ff1Sjsg r = amdgpu_vm_validate_pt_bos(pdd->dev->adev, 15611bb76ff1Sjsg drm_priv_to_vm(pdd->drm_priv), 15625ca02815Sjsg svm_range_bo_validate, NULL); 15635ca02815Sjsg if (r) { 15645ca02815Sjsg pr_debug("failed %d validate pt bos\n", r); 15655ca02815Sjsg goto unreserve_out; 15665ca02815Sjsg } 15675ca02815Sjsg } 15685ca02815Sjsg 15695ca02815Sjsg return 0; 15705ca02815Sjsg 15715ca02815Sjsg unreserve_out: 1572f005ef32Sjsg drm_exec_fini(&ctx->exec); 15735ca02815Sjsg return r; 15745ca02815Sjsg } 15755ca02815Sjsg 15765ca02815Sjsg static void svm_range_unreserve_bos(struct svm_validate_context *ctx) 15775ca02815Sjsg { 1578f005ef32Sjsg drm_exec_fini(&ctx->exec); 15795ca02815Sjsg } 15805ca02815Sjsg 15815ca02815Sjsg static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx) 15825ca02815Sjsg { 15835ca02815Sjsg struct kfd_process_device *pdd; 15845ca02815Sjsg 15855ca02815Sjsg pdd = kfd_process_device_from_gpuidx(p, gpuidx); 1586f005ef32Sjsg if (!pdd) 1587f005ef32Sjsg return NULL; 15885ca02815Sjsg 15891bb76ff1Sjsg return SVM_ADEV_PGMAP_OWNER(pdd->dev->adev); 15905ca02815Sjsg } 15915ca02815Sjsg 15925ca02815Sjsg /* 15935ca02815Sjsg * Validation+GPU 
mapping with concurrent invalidation (MMU notifiers) 15945ca02815Sjsg * 15955ca02815Sjsg * To prevent concurrent destruction or change of range attributes, the 15965ca02815Sjsg * svm_read_lock must be held. The caller must not hold the svm_write_lock 15975ca02815Sjsg * because that would block concurrent evictions and lead to deadlocks. To 15985ca02815Sjsg * serialize concurrent migrations or validations of the same range, the 15995ca02815Sjsg * prange->migrate_mutex must be held. 16005ca02815Sjsg * 16015ca02815Sjsg * For VRAM ranges, the SVM BO must be allocated and valid (protected by its 16025ca02815Sjsg * eviction fence). 16035ca02815Sjsg * 16045ca02815Sjsg * The following sequence ensures race-free validation and GPU mapping: 16055ca02815Sjsg * 16065ca02815Sjsg * 1. Reserve page table (and SVM BO if range is in VRAM) 16075ca02815Sjsg * 2. hmm_range_fault to get page addresses (if system memory) 16085ca02815Sjsg * 3. DMA-map pages (if system memory) 16095ca02815Sjsg * 4-a. Take notifier lock 16105ca02815Sjsg * 4-b. Check that pages still valid (mmu_interval_read_retry) 16115ca02815Sjsg * 4-c. Check that the range was not split or otherwise invalidated 16125ca02815Sjsg * 4-d. Update GPU page table 16135ca02815Sjsg * 4-e. Release notifier lock 16145ca02815Sjsg * 5. Release page table (and SVM BO) reservation 16155ca02815Sjsg */ 16165ca02815Sjsg static int svm_range_validate_and_map(struct mm_struct *mm, 16171bb76ff1Sjsg struct svm_range *prange, int32_t gpuidx, 16181bb76ff1Sjsg bool intr, bool wait, bool flush_tlb) 16195ca02815Sjsg { 1620f005ef32Sjsg struct svm_validate_context *ctx; 16215ca02815Sjsg unsigned long start, end, addr; 16225ca02815Sjsg struct kfd_process *p; 16235ca02815Sjsg void *owner; 16245ca02815Sjsg int32_t idx; 16255ca02815Sjsg int r = 0; 16265ca02815Sjsg 1627f005ef32Sjsg ctx = kzalloc(sizeof(struct svm_validate_context), GFP_KERNEL); 1628f005ef32Sjsg if (!ctx) 1629f005ef32Sjsg return -ENOMEM; 1630f005ef32Sjsg ctx->process = container_of(prange->svms, struct kfd_process, svms); 1631f005ef32Sjsg ctx->prange = prange; 1632f005ef32Sjsg ctx->intr = intr; 16335ca02815Sjsg 16345ca02815Sjsg if (gpuidx < MAX_GPU_INSTANCE) { 1635f005ef32Sjsg bitmap_zero(ctx->bitmap, MAX_GPU_INSTANCE); 1636f005ef32Sjsg bitmap_set(ctx->bitmap, gpuidx, 1); 1637f005ef32Sjsg } else if (ctx->process->xnack_enabled) { 1638f005ef32Sjsg bitmap_copy(ctx->bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE); 16395ca02815Sjsg 16405ca02815Sjsg /* If the range was prefetched to a GPU, or a GPU retry fault 16415ca02815Sjsg * migrated it to a GPU that has the ACCESS attribute for the 16425ca02815Sjsg * range, create the mapping on that GPU.
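		 * For example, after svm_migrate_ram_to_vram() has set
		 * prange->actual_loc to a GPU id, that GPU is looked up with
		 * kfd_process_gpuidx_from_gpuid() and added to ctx->bitmap
		 * only if it is also set in prange->bitmap_access.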
16435ca02815Sjsg */ 16445ca02815Sjsg if (prange->actual_loc) { 1645f005ef32Sjsg gpuidx = kfd_process_gpuidx_from_gpuid(ctx->process, 16465ca02815Sjsg prange->actual_loc); 16475ca02815Sjsg if (gpuidx < 0) { 16485ca02815Sjsg WARN_ONCE(1, "failed get device by id 0x%x\n", 16495ca02815Sjsg prange->actual_loc); 1650f005ef32Sjsg r = -EINVAL; 1651f005ef32Sjsg goto free_ctx; 16525ca02815Sjsg } 16535ca02815Sjsg if (test_bit(gpuidx, prange->bitmap_access)) 1654f005ef32Sjsg bitmap_set(ctx->bitmap, gpuidx, 1); 16555ca02815Sjsg } 1656d29bf3deSjsg 1657d29bf3deSjsg /* 1658d29bf3deSjsg * If prange is already mapped or with always mapped flag, 1659d29bf3deSjsg * update mapping on GPUs with ACCESS attribute 1660d29bf3deSjsg */ 1661d29bf3deSjsg if (bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) { 1662d29bf3deSjsg if (prange->mapped_to_gpu || 1663d29bf3deSjsg prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED) 1664d29bf3deSjsg bitmap_copy(ctx->bitmap, prange->bitmap_access, MAX_GPU_INSTANCE); 1665d29bf3deSjsg } 16665ca02815Sjsg } else { 1667f005ef32Sjsg bitmap_or(ctx->bitmap, prange->bitmap_access, 16685ca02815Sjsg prange->bitmap_aip, MAX_GPU_INSTANCE); 16695ca02815Sjsg } 16705ca02815Sjsg 1671f005ef32Sjsg if (bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) { 1672f005ef32Sjsg r = 0; 1673f005ef32Sjsg goto free_ctx; 1674f005ef32Sjsg } 16751bb76ff1Sjsg 16765ca02815Sjsg if (prange->actual_loc && !prange->ttm_res) { 16775ca02815Sjsg /* This should never happen. actual_loc gets set by 16785ca02815Sjsg * svm_migrate_ram_to_vram after allocating a BO. 16795ca02815Sjsg */ 16801bb76ff1Sjsg WARN_ONCE(1, "VRAM BO missing during validation\n"); 1681f005ef32Sjsg r = -EINVAL; 1682f005ef32Sjsg goto free_ctx; 16835ca02815Sjsg } 16845ca02815Sjsg 1685f005ef32Sjsg svm_range_reserve_bos(ctx, intr); 16865ca02815Sjsg 16875ca02815Sjsg p = container_of(prange->svms, struct kfd_process, svms); 1688f005ef32Sjsg owner = kfd_svm_page_owner(p, find_first_bit(ctx->bitmap, 16895ca02815Sjsg MAX_GPU_INSTANCE)); 1690f005ef32Sjsg for_each_set_bit(idx, ctx->bitmap, MAX_GPU_INSTANCE) { 16915ca02815Sjsg if (kfd_svm_page_owner(p, idx) != owner) { 16925ca02815Sjsg owner = NULL; 16935ca02815Sjsg break; 16945ca02815Sjsg } 16955ca02815Sjsg } 16965ca02815Sjsg 16975ca02815Sjsg start = prange->start << PAGE_SHIFT; 16985ca02815Sjsg end = (prange->last + 1) << PAGE_SHIFT; 1699f005ef32Sjsg for (addr = start; !r && addr < end; ) { 17005ca02815Sjsg struct hmm_range *hmm_range; 17015ca02815Sjsg struct vm_area_struct *vma; 1702f005ef32Sjsg unsigned long next = 0; 17035ca02815Sjsg unsigned long offset; 17045ca02815Sjsg unsigned long npages; 17055ca02815Sjsg bool readonly; 17065ca02815Sjsg 1707f005ef32Sjsg vma = vma_lookup(mm, addr); 1708f005ef32Sjsg if (vma) { 17095ca02815Sjsg readonly = !(vma->vm_flags & VM_WRITE); 17105ca02815Sjsg 17115ca02815Sjsg next = min(vma->vm_end, end); 17125ca02815Sjsg npages = (next - addr) >> PAGE_SHIFT; 17131bb76ff1Sjsg WRITE_ONCE(p->svms.faulting_task, current); 1714f005ef32Sjsg r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages, 1715f005ef32Sjsg readonly, owner, NULL, 1716f005ef32Sjsg &hmm_range); 17171bb76ff1Sjsg WRITE_ONCE(p->svms.faulting_task, NULL); 17185ca02815Sjsg if (r) { 17195ca02815Sjsg pr_debug("failed %d to get svm range pages\n", r); 1720f005ef32Sjsg if (r == -EBUSY) 1721f005ef32Sjsg r = -EAGAIN; 1722f005ef32Sjsg } 1723f005ef32Sjsg } else { 1724f005ef32Sjsg r = -EFAULT; 17255ca02815Sjsg } 17265ca02815Sjsg 1727f005ef32Sjsg if (!r) { 17285ca02815Sjsg offset = (addr - start) >> PAGE_SHIFT; 1729f005ef32Sjsg r = 
svm_range_dma_map(prange, ctx->bitmap, offset, npages, 17305ca02815Sjsg hmm_range->hmm_pfns); 1731f005ef32Sjsg if (r) 17325ca02815Sjsg pr_debug("failed %d to dma map range\n", r); 17335ca02815Sjsg } 17345ca02815Sjsg 17355ca02815Sjsg svm_range_lock(prange); 1736f005ef32Sjsg if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) { 17375ca02815Sjsg pr_debug("hmm update the range, need validate again\n"); 17385ca02815Sjsg r = -EAGAIN; 17395ca02815Sjsg } 1740f005ef32Sjsg 1741f005ef32Sjsg if (!r && !list_empty(&prange->child_list)) { 17425ca02815Sjsg pr_debug("range split by unmap in parallel, validate again\n"); 17435ca02815Sjsg r = -EAGAIN; 17445ca02815Sjsg } 17455ca02815Sjsg 1746f005ef32Sjsg if (!r) 17475ca02815Sjsg r = svm_range_map_to_gpus(prange, offset, npages, readonly, 1748f005ef32Sjsg ctx->bitmap, wait, flush_tlb); 17495ca02815Sjsg 1750f005ef32Sjsg if (!r && next == end) 1751f005ef32Sjsg prange->mapped_to_gpu = true; 1752f005ef32Sjsg 17535ca02815Sjsg svm_range_unlock(prange); 17545ca02815Sjsg 17555ca02815Sjsg addr = next; 17565ca02815Sjsg } 17575ca02815Sjsg 1758f005ef32Sjsg svm_range_unreserve_bos(ctx); 17595ca02815Sjsg if (!r) 17601bb76ff1Sjsg prange->validate_timestamp = ktime_get_boottime(); 17615ca02815Sjsg 1762f005ef32Sjsg free_ctx: 1763f005ef32Sjsg kfree(ctx); 1764f005ef32Sjsg 17655ca02815Sjsg return r; 17665ca02815Sjsg } 17675ca02815Sjsg 17685ca02815Sjsg /** 17695ca02815Sjsg * svm_range_list_lock_and_flush_work - flush pending deferred work 17705ca02815Sjsg * 17715ca02815Sjsg * @svms: the svm range list 17725ca02815Sjsg * @mm: the mm structure 17735ca02815Sjsg * 17745ca02815Sjsg * Context: Returns with mmap write lock held, pending deferred work flushed 17755ca02815Sjsg * 17765ca02815Sjsg */ 17771bb76ff1Sjsg void 17785ca02815Sjsg svm_range_list_lock_and_flush_work(struct svm_range_list *svms, 17795ca02815Sjsg struct mm_struct *mm) 17805ca02815Sjsg { 17815ca02815Sjsg retry_flush_work: 17825ca02815Sjsg flush_work(&svms->deferred_list_work); 17835ca02815Sjsg mmap_write_lock(mm); 17845ca02815Sjsg 17855ca02815Sjsg if (list_empty(&svms->deferred_range_list)) 17865ca02815Sjsg return; 17875ca02815Sjsg mmap_write_unlock(mm); 17885ca02815Sjsg pr_debug("retry flush\n"); 17895ca02815Sjsg goto retry_flush_work; 17905ca02815Sjsg } 17915ca02815Sjsg 17925ca02815Sjsg static void svm_range_restore_work(struct work_struct *work) 17935ca02815Sjsg { 17945ca02815Sjsg struct delayed_work *dwork = to_delayed_work(work); 17951bb76ff1Sjsg struct amdkfd_process_info *process_info; 17965ca02815Sjsg struct svm_range_list *svms; 17975ca02815Sjsg struct svm_range *prange; 17985ca02815Sjsg struct kfd_process *p; 17995ca02815Sjsg struct mm_struct *mm; 18005ca02815Sjsg int evicted_ranges; 18015ca02815Sjsg int invalid; 18025ca02815Sjsg int r; 18035ca02815Sjsg 18045ca02815Sjsg svms = container_of(dwork, struct svm_range_list, restore_work); 18055ca02815Sjsg evicted_ranges = atomic_read(&svms->evicted_ranges); 18065ca02815Sjsg if (!evicted_ranges) 18075ca02815Sjsg return; 18085ca02815Sjsg 18095ca02815Sjsg pr_debug("restore svm ranges\n"); 18105ca02815Sjsg 18115ca02815Sjsg p = container_of(svms, struct kfd_process, svms); 18121bb76ff1Sjsg process_info = p->kgd_process_info; 18135ca02815Sjsg 18141bb76ff1Sjsg /* Keep mm reference when svm_range_validate_and_map ranges */ 18151bb76ff1Sjsg mm = get_task_mm(p->lead_thread); 18161bb76ff1Sjsg if (!mm) { 18171bb76ff1Sjsg pr_debug("svms 0x%p process mm gone\n", svms); 18181bb76ff1Sjsg return; 18191bb76ff1Sjsg } 18201bb76ff1Sjsg 18211bb76ff1Sjsg 
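	/* Lock order below: process_info->lock first, then the mmap write
	 * lock (taken inside svm_range_list_lock_and_flush_work once the
	 * deferred work has been flushed), and finally svms->lock.
	 */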
mutex_lock(&process_info->lock); 18225ca02815Sjsg svm_range_list_lock_and_flush_work(svms, mm); 18235ca02815Sjsg mutex_lock(&svms->lock); 18245ca02815Sjsg 18255ca02815Sjsg evicted_ranges = atomic_read(&svms->evicted_ranges); 18265ca02815Sjsg 18275ca02815Sjsg list_for_each_entry(prange, &svms->list, list) { 18285ca02815Sjsg invalid = atomic_read(&prange->invalid); 18295ca02815Sjsg if (!invalid) 18305ca02815Sjsg continue; 18315ca02815Sjsg 18325ca02815Sjsg pr_debug("restoring svms 0x%p prange 0x%p [0x%lx %lx] inv %d\n", 18335ca02815Sjsg prange->svms, prange, prange->start, prange->last, 18345ca02815Sjsg invalid); 18355ca02815Sjsg 18365ca02815Sjsg /* 18375ca02815Sjsg * If the range is migrating, wait until the migration is done. 18385ca02815Sjsg */ 18395ca02815Sjsg mutex_lock(&prange->migrate_mutex); 18405ca02815Sjsg 18415ca02815Sjsg r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, 18421bb76ff1Sjsg false, true, false); 18435ca02815Sjsg if (r) 18445ca02815Sjsg pr_debug("failed %d to map 0x%lx to gpus\n", r, 18455ca02815Sjsg prange->start); 18465ca02815Sjsg 18475ca02815Sjsg mutex_unlock(&prange->migrate_mutex); 18485ca02815Sjsg if (r) 18495ca02815Sjsg goto out_reschedule; 18505ca02815Sjsg 18515ca02815Sjsg if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid) 18525ca02815Sjsg goto out_reschedule; 18535ca02815Sjsg } 18545ca02815Sjsg 18555ca02815Sjsg if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) != 18565ca02815Sjsg evicted_ranges) 18575ca02815Sjsg goto out_reschedule; 18585ca02815Sjsg 18595ca02815Sjsg evicted_ranges = 0; 18605ca02815Sjsg 18615ca02815Sjsg r = kgd2kfd_resume_mm(mm); 18625ca02815Sjsg if (r) { 18635ca02815Sjsg /* No recovery from this failure. Probably the CP is 18645ca02815Sjsg * hanging. No point trying again. 18655ca02815Sjsg */ 18665ca02815Sjsg pr_debug("failed %d to resume KFD\n", r); 18675ca02815Sjsg } 18685ca02815Sjsg 18695ca02815Sjsg pr_debug("restore svm ranges successfully\n"); 18705ca02815Sjsg 18715ca02815Sjsg out_reschedule: 18725ca02815Sjsg mutex_unlock(&svms->lock); 18735ca02815Sjsg mmap_write_unlock(mm); 18741bb76ff1Sjsg mutex_unlock(&process_info->lock); 18755ca02815Sjsg 18765ca02815Sjsg /* If validation failed, reschedule another attempt */ 18775ca02815Sjsg if (evicted_ranges) { 18785ca02815Sjsg pr_debug("reschedule to restore svm range\n"); 18795ca02815Sjsg schedule_delayed_work(&svms->restore_work, 18805ca02815Sjsg msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS)); 18811bb76ff1Sjsg 18821bb76ff1Sjsg kfd_smi_event_queue_restore_rescheduled(mm); 18835ca02815Sjsg } 18841bb76ff1Sjsg mmput(mm); 18855ca02815Sjsg } 18865ca02815Sjsg 18875ca02815Sjsg /** 18885ca02815Sjsg * svm_range_evict - evict svm range 18891bb76ff1Sjsg * @prange: svm range structure 18901bb76ff1Sjsg * @mm: current process mm_struct 18911bb76ff1Sjsg * @start: first page of the affected range, in pages 18921bb76ff1Sjsg * @last: last page of the affected range, in pages 1893f005ef32Sjsg * @event: mmu notifier event when range is evicted or migrated 18945ca02815Sjsg * 18955ca02815Sjsg * Stop all queues of the process to ensure GPU doesn't access the memory, then 18965ca02815Sjsg * return to let the CPU evict the buffer and proceed with the CPU page table update. 18975ca02815Sjsg * 18985ca02815Sjsg * No lock is needed to sync CPU page table invalidation with GPU execution. 18995ca02815Sjsg * If invalidation happens while restore work is running, restore work will 19005ca02815Sjsg * restart to ensure it maps the latest CPU pages to the GPU, then start 19015ca02815Sjsg * the queues.
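 *
 * With XNACK off, or when the range has the KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED
 * flag, the affected ranges are marked invalid, the queues are quiesced on the
 * first eviction and the restore work is scheduled. With XNACK on, the range is
 * simply unmapped from the GPUs and the next retry fault re-establishes the
 * mapping.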
19025ca02815Sjsg */ 19035ca02815Sjsg static int 19045ca02815Sjsg svm_range_evict(struct svm_range *prange, struct mm_struct *mm, 19051bb76ff1Sjsg unsigned long start, unsigned long last, 19061bb76ff1Sjsg enum mmu_notifier_event event) 19075ca02815Sjsg { 19085ca02815Sjsg struct svm_range_list *svms = prange->svms; 19095ca02815Sjsg struct svm_range *pchild; 19105ca02815Sjsg struct kfd_process *p; 19115ca02815Sjsg int r = 0; 19125ca02815Sjsg 19135ca02815Sjsg p = container_of(svms, struct kfd_process, svms); 19145ca02815Sjsg 19155ca02815Sjsg pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n", 19165ca02815Sjsg svms, prange->start, prange->last, start, last); 19175ca02815Sjsg 19181bb76ff1Sjsg if (!p->xnack_enabled || 19191bb76ff1Sjsg (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) { 19205ca02815Sjsg int evicted_ranges; 19211bb76ff1Sjsg bool mapped = prange->mapped_to_gpu; 19225ca02815Sjsg 19235ca02815Sjsg list_for_each_entry(pchild, &prange->child_list, child_list) { 19241bb76ff1Sjsg if (!pchild->mapped_to_gpu) 19251bb76ff1Sjsg continue; 19261bb76ff1Sjsg mapped = true; 19275ca02815Sjsg mutex_lock_nested(&pchild->lock, 1); 19285ca02815Sjsg if (pchild->start <= last && pchild->last >= start) { 19295ca02815Sjsg pr_debug("increment pchild invalid [0x%lx 0x%lx]\n", 19305ca02815Sjsg pchild->start, pchild->last); 19315ca02815Sjsg atomic_inc(&pchild->invalid); 19325ca02815Sjsg } 19335ca02815Sjsg mutex_unlock(&pchild->lock); 19345ca02815Sjsg } 19355ca02815Sjsg 19361bb76ff1Sjsg if (!mapped) 19371bb76ff1Sjsg return r; 19381bb76ff1Sjsg 19395ca02815Sjsg if (prange->start <= last && prange->last >= start) 19405ca02815Sjsg atomic_inc(&prange->invalid); 19415ca02815Sjsg 19425ca02815Sjsg evicted_ranges = atomic_inc_return(&svms->evicted_ranges); 19435ca02815Sjsg if (evicted_ranges != 1) 19445ca02815Sjsg return r; 19455ca02815Sjsg 19465ca02815Sjsg pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n", 19475ca02815Sjsg prange->svms, prange->start, prange->last); 19485ca02815Sjsg 19495ca02815Sjsg /* First eviction, stop the queues */ 19501bb76ff1Sjsg r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_SVM); 19515ca02815Sjsg if (r) 19525ca02815Sjsg pr_debug("failed to quiesce KFD\n"); 19535ca02815Sjsg 19545ca02815Sjsg pr_debug("schedule to restore svm %p ranges\n", svms); 19555ca02815Sjsg schedule_delayed_work(&svms->restore_work, 19565ca02815Sjsg msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS)); 19575ca02815Sjsg } else { 19585ca02815Sjsg unsigned long s, l; 19591bb76ff1Sjsg uint32_t trigger; 19601bb76ff1Sjsg 19611bb76ff1Sjsg if (event == MMU_NOTIFY_MIGRATE) 19621bb76ff1Sjsg trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE; 19631bb76ff1Sjsg else 19641bb76ff1Sjsg trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY; 19655ca02815Sjsg 19665ca02815Sjsg pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n", 19675ca02815Sjsg prange->svms, start, last); 19685ca02815Sjsg list_for_each_entry(pchild, &prange->child_list, child_list) { 19695ca02815Sjsg mutex_lock_nested(&pchild->lock, 1); 19705ca02815Sjsg s = max(start, pchild->start); 19715ca02815Sjsg l = min(last, pchild->last); 19725ca02815Sjsg if (l >= s) 19731bb76ff1Sjsg svm_range_unmap_from_gpus(pchild, s, l, trigger); 19745ca02815Sjsg mutex_unlock(&pchild->lock); 19755ca02815Sjsg } 19765ca02815Sjsg s = max(start, prange->start); 19775ca02815Sjsg l = min(last, prange->last); 19785ca02815Sjsg if (l >= s) 19791bb76ff1Sjsg svm_range_unmap_from_gpus(prange, s, l, trigger); 19805ca02815Sjsg } 19815ca02815Sjsg 19825ca02815Sjsg return r; 
19835ca02815Sjsg } 19845ca02815Sjsg 19855ca02815Sjsg static struct svm_range *svm_range_clone(struct svm_range *old) 19865ca02815Sjsg { 19875ca02815Sjsg struct svm_range *new; 19885ca02815Sjsg 19891bb76ff1Sjsg new = svm_range_new(old->svms, old->start, old->last, false); 19905ca02815Sjsg if (!new) 19915ca02815Sjsg return NULL; 1992f005ef32Sjsg if (svm_range_copy_dma_addrs(new, old)) { 1993f005ef32Sjsg svm_range_free(new, false); 1994f005ef32Sjsg return NULL; 1995f005ef32Sjsg } 19965ca02815Sjsg if (old->svm_bo) { 19975ca02815Sjsg new->ttm_res = old->ttm_res; 19985ca02815Sjsg new->offset = old->offset; 19995ca02815Sjsg new->svm_bo = svm_range_bo_ref(old->svm_bo); 20005ca02815Sjsg spin_lock(&new->svm_bo->list_lock); 20015ca02815Sjsg list_add(&new->svm_bo_list, &new->svm_bo->range_list); 20025ca02815Sjsg spin_unlock(&new->svm_bo->list_lock); 20035ca02815Sjsg } 20045ca02815Sjsg new->flags = old->flags; 20055ca02815Sjsg new->preferred_loc = old->preferred_loc; 20065ca02815Sjsg new->prefetch_loc = old->prefetch_loc; 20075ca02815Sjsg new->actual_loc = old->actual_loc; 20085ca02815Sjsg new->granularity = old->granularity; 20091bb76ff1Sjsg new->mapped_to_gpu = old->mapped_to_gpu; 20105ca02815Sjsg bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); 20115ca02815Sjsg bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); 20125ca02815Sjsg 20135ca02815Sjsg return new; 20145ca02815Sjsg } 20155ca02815Sjsg 20161bb76ff1Sjsg void svm_range_set_max_pages(struct amdgpu_device *adev) 20171bb76ff1Sjsg { 20181bb76ff1Sjsg uint64_t max_pages; 20191bb76ff1Sjsg uint64_t pages, _pages; 2020f005ef32Sjsg uint64_t min_pages = 0; 2021f005ef32Sjsg int i, id; 20221bb76ff1Sjsg 2023f005ef32Sjsg for (i = 0; i < adev->kfd.dev->num_nodes; i++) { 2024f005ef32Sjsg if (adev->kfd.dev->nodes[i]->xcp) 2025f005ef32Sjsg id = adev->kfd.dev->nodes[i]->xcp->id; 2026f005ef32Sjsg else 2027f005ef32Sjsg id = -1; 2028f005ef32Sjsg pages = KFD_XCP_MEMORY_SIZE(adev, id) >> 17; 20291bb76ff1Sjsg pages = clamp(pages, 1ULL << 9, 1ULL << 18); 20301bb76ff1Sjsg pages = rounddown_pow_of_two(pages); 2031f005ef32Sjsg min_pages = min_not_zero(min_pages, pages); 2032f005ef32Sjsg } 2033f005ef32Sjsg 20341bb76ff1Sjsg do { 20351bb76ff1Sjsg max_pages = READ_ONCE(max_svm_range_pages); 2036f005ef32Sjsg _pages = min_not_zero(max_pages, min_pages); 20371bb76ff1Sjsg } while (cmpxchg(&max_svm_range_pages, max_pages, _pages) != max_pages); 20381bb76ff1Sjsg } 20391bb76ff1Sjsg 20401bb76ff1Sjsg static int 20411bb76ff1Sjsg svm_range_split_new(struct svm_range_list *svms, uint64_t start, uint64_t last, 20421bb76ff1Sjsg uint64_t max_pages, struct list_head *insert_list, 20431bb76ff1Sjsg struct list_head *update_list) 20441bb76ff1Sjsg { 20451bb76ff1Sjsg struct svm_range *prange; 20461bb76ff1Sjsg uint64_t l; 20471bb76ff1Sjsg 20481bb76ff1Sjsg pr_debug("max_svm_range_pages 0x%llx adding [0x%llx 0x%llx]\n", 20491bb76ff1Sjsg max_pages, start, last); 20501bb76ff1Sjsg 20511bb76ff1Sjsg while (last >= start) { 20521bb76ff1Sjsg l = min(last, ALIGN_DOWN(start + max_pages, max_pages) - 1); 20531bb76ff1Sjsg 20541bb76ff1Sjsg prange = svm_range_new(svms, start, l, true); 20551bb76ff1Sjsg if (!prange) 20561bb76ff1Sjsg return -ENOMEM; 20571bb76ff1Sjsg list_add(&prange->list, insert_list); 20581bb76ff1Sjsg list_add(&prange->update_list, update_list); 20591bb76ff1Sjsg 20601bb76ff1Sjsg start = l + 1; 20611bb76ff1Sjsg } 20621bb76ff1Sjsg return 0; 20631bb76ff1Sjsg } 20641bb76ff1Sjsg 20655ca02815Sjsg /** 206602f671b6Sjsg * svm_range_add - add svm range and handle overlap 
206702f671b6Sjsg * @p: the range add to this process svms 206802f671b6Sjsg * @start: page size aligned 206902f671b6Sjsg * @size: page size aligned 207002f671b6Sjsg * @nattr: number of attributes 207102f671b6Sjsg * @attrs: array of attributes 207202f671b6Sjsg * @update_list: output, the ranges need validate and update GPU mapping 207302f671b6Sjsg * @insert_list: output, the ranges need insert to svms 207402f671b6Sjsg * @remove_list: output, the ranges are replaced and need remove from svms 20755ca02815Sjsg * 207602f671b6Sjsg * Check if the virtual address range has overlap with any existing ranges, 207702f671b6Sjsg * split partly overlapping ranges and add new ranges in the gaps. All changes 207802f671b6Sjsg * should be applied to the range_list and interval tree transactionally. If 207902f671b6Sjsg * any range split or allocation fails, the entire update fails. Therefore any 208002f671b6Sjsg * existing overlapping svm_ranges are cloned and the original svm_ranges left 208102f671b6Sjsg * unchanged. 20825ca02815Sjsg * 208302f671b6Sjsg * If the transaction succeeds, the caller can update and insert clones and 208402f671b6Sjsg * new ranges, then free the originals. 20855ca02815Sjsg * 208602f671b6Sjsg * Otherwise the caller can free the clones and new ranges, while the old 208702f671b6Sjsg * svm_ranges remain unchanged. 208802f671b6Sjsg * 208902f671b6Sjsg * Context: Process context, caller must hold svms->lock 209002f671b6Sjsg * 209102f671b6Sjsg * Return: 209202f671b6Sjsg * 0 - OK, otherwise error code 20935ca02815Sjsg */ 20945ca02815Sjsg static int 209502f671b6Sjsg svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size, 209602f671b6Sjsg uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs, 209702f671b6Sjsg struct list_head *update_list, struct list_head *insert_list, 209802f671b6Sjsg struct list_head *remove_list) 20995ca02815Sjsg { 210002f671b6Sjsg unsigned long last = start + size - 1UL; 210102f671b6Sjsg struct svm_range_list *svms = &p->svms; 21025ca02815Sjsg struct interval_tree_node *node; 21035ca02815Sjsg struct svm_range *prange; 21045ca02815Sjsg struct svm_range *tmp; 21051bb76ff1Sjsg struct list_head new_list; 21065ca02815Sjsg int r = 0; 21075ca02815Sjsg 210802f671b6Sjsg pr_debug("svms 0x%p [0x%llx 0x%lx]\n", &p->svms, start, last); 210902f671b6Sjsg 21105ca02815Sjsg INIT_LIST_HEAD(update_list); 21115ca02815Sjsg INIT_LIST_HEAD(insert_list); 21125ca02815Sjsg INIT_LIST_HEAD(remove_list); 21131bb76ff1Sjsg INIT_LIST_HEAD(&new_list); 21145ca02815Sjsg 21155ca02815Sjsg node = interval_tree_iter_first(&svms->objects, start, last); 21165ca02815Sjsg while (node) { 21175ca02815Sjsg struct interval_tree_node *next; 21185ca02815Sjsg unsigned long next_start; 21195ca02815Sjsg 21205ca02815Sjsg pr_debug("found overlap node [0x%lx 0x%lx]\n", node->start, 21215ca02815Sjsg node->last); 21225ca02815Sjsg 21231bb76ff1Sjsg prange = container_of(node, struct svm_range, it_node); 21245ca02815Sjsg next = interval_tree_iter_next(node, start, last); 21255ca02815Sjsg next_start = min(node->last, last) + 1; 21265ca02815Sjsg 2127f005ef32Sjsg if (svm_range_is_same_attrs(p, prange, nattr, attrs) && 2128f005ef32Sjsg prange->mapped_to_gpu) { 21291bb76ff1Sjsg /* nothing to do */ 21301bb76ff1Sjsg } else if (node->start < start || node->last > last) { 21311bb76ff1Sjsg /* node intersects the update range and its attributes 21321bb76ff1Sjsg * will change. 
Clone and split it, apply updates only 21331bb76ff1Sjsg * to the overlapping part 21341bb76ff1Sjsg */ 21351bb76ff1Sjsg struct svm_range *old = prange; 21361bb76ff1Sjsg 21375ca02815Sjsg prange = svm_range_clone(old); 21385ca02815Sjsg if (!prange) { 21395ca02815Sjsg r = -ENOMEM; 21405ca02815Sjsg goto out; 21415ca02815Sjsg } 21425ca02815Sjsg 21431bb76ff1Sjsg list_add(&old->update_list, remove_list); 21441bb76ff1Sjsg list_add(&prange->list, insert_list); 21451bb76ff1Sjsg list_add(&prange->update_list, update_list); 21465ca02815Sjsg 21475ca02815Sjsg if (node->start < start) { 21485ca02815Sjsg pr_debug("change old range start\n"); 214902f671b6Sjsg r = svm_range_split_head(prange, start, 21505ca02815Sjsg insert_list); 21515ca02815Sjsg if (r) 21525ca02815Sjsg goto out; 21535ca02815Sjsg } 21545ca02815Sjsg if (node->last > last) { 21555ca02815Sjsg pr_debug("change old range last\n"); 215602f671b6Sjsg r = svm_range_split_tail(prange, last, 21575ca02815Sjsg insert_list); 21585ca02815Sjsg if (r) 21595ca02815Sjsg goto out; 21605ca02815Sjsg } 21615ca02815Sjsg } else { 21625ca02815Sjsg /* The node is contained within start..last, 21635ca02815Sjsg * just update it 21645ca02815Sjsg */ 21655ca02815Sjsg list_add(&prange->update_list, update_list); 21661bb76ff1Sjsg } 21675ca02815Sjsg 21685ca02815Sjsg /* insert a new node if needed */ 21695ca02815Sjsg if (node->start > start) { 21701bb76ff1Sjsg r = svm_range_split_new(svms, start, node->start - 1, 21711bb76ff1Sjsg READ_ONCE(max_svm_range_pages), 21721bb76ff1Sjsg &new_list, update_list); 21731bb76ff1Sjsg if (r) 21745ca02815Sjsg goto out; 21755ca02815Sjsg } 21765ca02815Sjsg 21775ca02815Sjsg node = next; 21785ca02815Sjsg start = next_start; 21795ca02815Sjsg } 21805ca02815Sjsg 218102f671b6Sjsg /* add a final range at the end if needed */ 21821bb76ff1Sjsg if (start <= last) 21831bb76ff1Sjsg r = svm_range_split_new(svms, start, last, 21841bb76ff1Sjsg READ_ONCE(max_svm_range_pages), 21851bb76ff1Sjsg &new_list, update_list); 21865ca02815Sjsg 21875ca02815Sjsg out: 21881bb76ff1Sjsg if (r) { 21891bb76ff1Sjsg list_for_each_entry_safe(prange, tmp, insert_list, list) 21901bb76ff1Sjsg svm_range_free(prange, false); 21911bb76ff1Sjsg list_for_each_entry_safe(prange, tmp, &new_list, list) 21921bb76ff1Sjsg svm_range_free(prange, true); 21931bb76ff1Sjsg } else { 21941bb76ff1Sjsg list_splice(&new_list, insert_list); 21951bb76ff1Sjsg } 21965ca02815Sjsg 21975ca02815Sjsg return r; 21985ca02815Sjsg } 21995ca02815Sjsg 22005ca02815Sjsg static void 22015ca02815Sjsg svm_range_update_notifier_and_interval_tree(struct mm_struct *mm, 22025ca02815Sjsg struct svm_range *prange) 22035ca02815Sjsg { 22045ca02815Sjsg unsigned long start; 22055ca02815Sjsg unsigned long last; 22065ca02815Sjsg 22075ca02815Sjsg start = prange->notifier.interval_tree.start >> PAGE_SHIFT; 22085ca02815Sjsg last = prange->notifier.interval_tree.last >> PAGE_SHIFT; 22095ca02815Sjsg 22105ca02815Sjsg if (prange->start == start && prange->last == last) 22115ca02815Sjsg return; 22125ca02815Sjsg 22135ca02815Sjsg pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", 22145ca02815Sjsg prange->svms, prange, start, last, prange->start, 22155ca02815Sjsg prange->last); 22165ca02815Sjsg 22175ca02815Sjsg if (start != 0 && last != 0) { 22185ca02815Sjsg interval_tree_remove(&prange->it_node, &prange->svms->objects); 22195ca02815Sjsg svm_range_remove_notifier(prange); 22205ca02815Sjsg } 22215ca02815Sjsg prange->it_node.start = prange->start; 22225ca02815Sjsg prange->it_node.last = prange->last; 22235ca02815Sjsg 
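	/* Re-insert the node with its new bounds and re-register the MMU
	 * notifier for the updated address range.
	 */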
22245ca02815Sjsg interval_tree_insert(&prange->it_node, &prange->svms->objects); 22255ca02815Sjsg svm_range_add_notifier_locked(mm, prange); 22265ca02815Sjsg } 22275ca02815Sjsg 22285ca02815Sjsg static void 22291bb76ff1Sjsg svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange, 22301bb76ff1Sjsg struct mm_struct *mm) 22315ca02815Sjsg { 22325ca02815Sjsg switch (prange->work_item.op) { 22335ca02815Sjsg case SVM_OP_NULL: 22345ca02815Sjsg pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n", 22355ca02815Sjsg svms, prange, prange->start, prange->last); 22365ca02815Sjsg break; 22375ca02815Sjsg case SVM_OP_UNMAP_RANGE: 22385ca02815Sjsg pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n", 22395ca02815Sjsg svms, prange, prange->start, prange->last); 22405ca02815Sjsg svm_range_unlink(prange); 22415ca02815Sjsg svm_range_remove_notifier(prange); 22421bb76ff1Sjsg svm_range_free(prange, true); 22435ca02815Sjsg break; 22445ca02815Sjsg case SVM_OP_UPDATE_RANGE_NOTIFIER: 22455ca02815Sjsg pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n", 22465ca02815Sjsg svms, prange, prange->start, prange->last); 22475ca02815Sjsg svm_range_update_notifier_and_interval_tree(mm, prange); 22485ca02815Sjsg break; 22495ca02815Sjsg case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP: 22505ca02815Sjsg pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", 22515ca02815Sjsg svms, prange, prange->start, prange->last); 22525ca02815Sjsg svm_range_update_notifier_and_interval_tree(mm, prange); 22535ca02815Sjsg /* TODO: implement deferred validation and mapping */ 22545ca02815Sjsg break; 22555ca02815Sjsg case SVM_OP_ADD_RANGE: 22565ca02815Sjsg pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange, 22575ca02815Sjsg prange->start, prange->last); 22585ca02815Sjsg svm_range_add_to_svms(prange); 22595ca02815Sjsg svm_range_add_notifier_locked(mm, prange); 22605ca02815Sjsg break; 22615ca02815Sjsg case SVM_OP_ADD_RANGE_AND_MAP: 22625ca02815Sjsg pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, 22635ca02815Sjsg prange, prange->start, prange->last); 22645ca02815Sjsg svm_range_add_to_svms(prange); 22655ca02815Sjsg svm_range_add_notifier_locked(mm, prange); 22665ca02815Sjsg /* TODO: implement deferred validation and mapping */ 22675ca02815Sjsg break; 22685ca02815Sjsg default: 22695ca02815Sjsg WARN_ONCE(1, "Unknown prange 0x%p work op %d\n", prange, 22705ca02815Sjsg prange->work_item.op); 22715ca02815Sjsg } 22725ca02815Sjsg } 22735ca02815Sjsg 22745ca02815Sjsg static void svm_range_drain_retry_fault(struct svm_range_list *svms) 22755ca02815Sjsg { 22765ca02815Sjsg struct kfd_process_device *pdd; 22775ca02815Sjsg struct kfd_process *p; 22781bb76ff1Sjsg int drain; 22795ca02815Sjsg uint32_t i; 22805ca02815Sjsg 22815ca02815Sjsg p = container_of(svms, struct kfd_process, svms); 22825ca02815Sjsg 22831bb76ff1Sjsg restart: 22841bb76ff1Sjsg drain = atomic_read(&svms->drain_pagefaults); 22851bb76ff1Sjsg if (!drain) 22861bb76ff1Sjsg return; 22871bb76ff1Sjsg 22885ca02815Sjsg for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) { 22895ca02815Sjsg pdd = p->pdds[i]; 22905ca02815Sjsg if (!pdd) 22915ca02815Sjsg continue; 22925ca02815Sjsg 22935ca02815Sjsg pr_debug("drain retry fault gpu %d svms %p\n", i, svms); 22945ca02815Sjsg 22951bb76ff1Sjsg amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev, 2296f005ef32Sjsg pdd->dev->adev->irq.retry_cam_enabled ? 
2297f005ef32Sjsg &pdd->dev->adev->irq.ih : 22981bb76ff1Sjsg &pdd->dev->adev->irq.ih1); 2299f005ef32Sjsg 2300f005ef32Sjsg if (pdd->dev->adev->irq.retry_cam_enabled) 2301f005ef32Sjsg amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev, 2302f005ef32Sjsg &pdd->dev->adev->irq.ih_soft); 2303f005ef32Sjsg 2304f005ef32Sjsg 23055ca02815Sjsg pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms); 23065ca02815Sjsg } 23071bb76ff1Sjsg if (atomic_cmpxchg(&svms->drain_pagefaults, drain, 0) != drain) 23081bb76ff1Sjsg goto restart; 23095ca02815Sjsg } 23105ca02815Sjsg 23115ca02815Sjsg static void svm_range_deferred_list_work(struct work_struct *work) 23125ca02815Sjsg { 23135ca02815Sjsg struct svm_range_list *svms; 23145ca02815Sjsg struct svm_range *prange; 23155ca02815Sjsg struct mm_struct *mm; 23165ca02815Sjsg 23175ca02815Sjsg svms = container_of(work, struct svm_range_list, deferred_list_work); 23185ca02815Sjsg pr_debug("enter svms 0x%p\n", svms); 23195ca02815Sjsg 23205ca02815Sjsg spin_lock(&svms->deferred_list_lock); 23215ca02815Sjsg while (!list_empty(&svms->deferred_range_list)) { 23225ca02815Sjsg prange = list_first_entry(&svms->deferred_range_list, 23235ca02815Sjsg struct svm_range, deferred_list); 23245ca02815Sjsg spin_unlock(&svms->deferred_list_lock); 23251bb76ff1Sjsg 23265ca02815Sjsg pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange, 23275ca02815Sjsg prange->start, prange->last, prange->work_item.op); 23285ca02815Sjsg 23295ca02815Sjsg mm = prange->work_item.mm; 23301bb76ff1Sjsg retry: 23315ca02815Sjsg mmap_write_lock(mm); 23325ca02815Sjsg 23331bb76ff1Sjsg /* Checking for the need to drain retry faults must be inside 23341bb76ff1Sjsg * mmap write lock to serialize with munmap notifiers. 23351bb76ff1Sjsg */ 23361bb76ff1Sjsg if (unlikely(atomic_read(&svms->drain_pagefaults))) { 23371bb76ff1Sjsg mmap_write_unlock(mm); 23381bb76ff1Sjsg svm_range_drain_retry_fault(svms); 23391bb76ff1Sjsg goto retry; 23401bb76ff1Sjsg } 23411bb76ff1Sjsg 23421bb76ff1Sjsg /* Remove from deferred_list must be inside mmap write lock, for 23431bb76ff1Sjsg * two race cases: 23441bb76ff1Sjsg * 1. unmap_from_cpu may change work_item.op and add the range 23451bb76ff1Sjsg * to deferred_list again, cause use after free bug. 23461bb76ff1Sjsg * 2. svm_range_list_lock_and_flush_work may hold mmap write 23471bb76ff1Sjsg * lock and continue because deferred_list is empty, but 23481bb76ff1Sjsg * deferred_list work is actually waiting for mmap lock. 23495ca02815Sjsg */ 23505ca02815Sjsg spin_lock(&svms->deferred_list_lock); 23515ca02815Sjsg list_del_init(&prange->deferred_list); 23525ca02815Sjsg spin_unlock(&svms->deferred_list_lock); 23535ca02815Sjsg 23541bb76ff1Sjsg mutex_lock(&svms->lock); 23555ca02815Sjsg mutex_lock(&prange->migrate_mutex); 23565ca02815Sjsg while (!list_empty(&prange->child_list)) { 23575ca02815Sjsg struct svm_range *pchild; 23585ca02815Sjsg 23595ca02815Sjsg pchild = list_first_entry(&prange->child_list, 23605ca02815Sjsg struct svm_range, child_list); 23615ca02815Sjsg pr_debug("child prange 0x%p op %d\n", pchild, 23625ca02815Sjsg pchild->work_item.op); 23635ca02815Sjsg list_del_init(&pchild->child_list); 23641bb76ff1Sjsg svm_range_handle_list_op(svms, pchild, mm); 23655ca02815Sjsg } 23665ca02815Sjsg mutex_unlock(&prange->migrate_mutex); 23675ca02815Sjsg 23681bb76ff1Sjsg svm_range_handle_list_op(svms, prange, mm); 23695ca02815Sjsg mutex_unlock(&svms->lock); 23705ca02815Sjsg mmap_write_unlock(mm); 23715ca02815Sjsg 23726ba23f6eSjsg /* Pairs with mmget in svm_range_add_list_work. 
If dropping the 23736ba23f6eSjsg * last mm refcount, schedule release work to avoid circular locking 23746ba23f6eSjsg */ 23756ba23f6eSjsg mmput_async(mm); 23761bb76ff1Sjsg 23775ca02815Sjsg spin_lock(&svms->deferred_list_lock); 23785ca02815Sjsg } 23795ca02815Sjsg spin_unlock(&svms->deferred_list_lock); 23805ca02815Sjsg pr_debug("exit svms 0x%p\n", svms); 23815ca02815Sjsg } 23825ca02815Sjsg 23835ca02815Sjsg void 23845ca02815Sjsg svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange, 23855ca02815Sjsg struct mm_struct *mm, enum svm_work_list_ops op) 23865ca02815Sjsg { 23875ca02815Sjsg spin_lock(&svms->deferred_list_lock); 23885ca02815Sjsg /* if prange is on the deferred list */ 23895ca02815Sjsg if (!list_empty(&prange->deferred_list)) { 23905ca02815Sjsg pr_debug("update exist prange 0x%p work op %d\n", prange, op); 23915ca02815Sjsg WARN_ONCE(prange->work_item.mm != mm, "unmatch mm\n"); 23925ca02815Sjsg if (op != SVM_OP_NULL && 23935ca02815Sjsg prange->work_item.op != SVM_OP_UNMAP_RANGE) 23945ca02815Sjsg prange->work_item.op = op; 23955ca02815Sjsg } else { 23965ca02815Sjsg prange->work_item.op = op; 23971bb76ff1Sjsg 23981bb76ff1Sjsg /* Pairs with mmput in deferred_list_work */ 23991bb76ff1Sjsg mmget(mm); 24005ca02815Sjsg prange->work_item.mm = mm; 24015ca02815Sjsg list_add_tail(&prange->deferred_list, 24025ca02815Sjsg &prange->svms->deferred_range_list); 24035ca02815Sjsg pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n", 24045ca02815Sjsg prange, prange->start, prange->last, op); 24055ca02815Sjsg } 24065ca02815Sjsg spin_unlock(&svms->deferred_list_lock); 24075ca02815Sjsg } 24085ca02815Sjsg 24095ca02815Sjsg void schedule_deferred_list_work(struct svm_range_list *svms) 24105ca02815Sjsg { 24115ca02815Sjsg spin_lock(&svms->deferred_list_lock); 24125ca02815Sjsg if (!list_empty(&svms->deferred_range_list)) 24135ca02815Sjsg schedule_work(&svms->deferred_list_work); 24145ca02815Sjsg spin_unlock(&svms->deferred_list_lock); 24155ca02815Sjsg } 24165ca02815Sjsg 24175ca02815Sjsg static void 24185ca02815Sjsg svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent, 24195ca02815Sjsg struct svm_range *prange, unsigned long start, 24205ca02815Sjsg unsigned long last) 24215ca02815Sjsg { 24225ca02815Sjsg struct svm_range *head; 24235ca02815Sjsg struct svm_range *tail; 24245ca02815Sjsg 24255ca02815Sjsg if (prange->work_item.op == SVM_OP_UNMAP_RANGE) { 24265ca02815Sjsg pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n", prange, 24275ca02815Sjsg prange->start, prange->last); 24285ca02815Sjsg return; 24295ca02815Sjsg } 24305ca02815Sjsg if (start > prange->last || last < prange->start) 24315ca02815Sjsg return; 24325ca02815Sjsg 24335ca02815Sjsg head = tail = prange; 24345ca02815Sjsg if (start > prange->start) 24355ca02815Sjsg svm_range_split(prange, prange->start, start - 1, &tail); 24365ca02815Sjsg if (last < tail->last) 24375ca02815Sjsg svm_range_split(tail, last + 1, tail->last, &head); 24385ca02815Sjsg 24395ca02815Sjsg if (head != prange && tail != prange) { 24405ca02815Sjsg svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); 24415ca02815Sjsg svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE); 24425ca02815Sjsg } else if (tail != prange) { 24435ca02815Sjsg svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE); 24445ca02815Sjsg } else if (head != prange) { 24455ca02815Sjsg svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); 24465ca02815Sjsg } else if (parent != prange) { 24475ca02815Sjsg prange->work_item.op = SVM_OP_UNMAP_RANGE; 
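		/* prange is a child range covered completely by the unmap; it
		 * is already on the parent's child list, so only its work item
		 * op needs updating and the deferred worker will unlink and
		 * free it.
		 */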
24485ca02815Sjsg } 24495ca02815Sjsg } 24505ca02815Sjsg 24515ca02815Sjsg static void 24525ca02815Sjsg svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, 24535ca02815Sjsg unsigned long start, unsigned long last) 24545ca02815Sjsg { 24551bb76ff1Sjsg uint32_t trigger = KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU; 24565ca02815Sjsg struct svm_range_list *svms; 24575ca02815Sjsg struct svm_range *pchild; 24585ca02815Sjsg struct kfd_process *p; 24595ca02815Sjsg unsigned long s, l; 24605ca02815Sjsg bool unmap_parent; 24615ca02815Sjsg 24625ca02815Sjsg p = kfd_lookup_process_by_mm(mm); 24635ca02815Sjsg if (!p) 24645ca02815Sjsg return; 24655ca02815Sjsg svms = &p->svms; 24665ca02815Sjsg 24675ca02815Sjsg pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms, 24685ca02815Sjsg prange, prange->start, prange->last, start, last); 24695ca02815Sjsg 24701bb76ff1Sjsg /* Make sure pending page faults are drained in the deferred worker 24711bb76ff1Sjsg * before the range is freed to avoid straggler interrupts on 24721bb76ff1Sjsg * unmapped memory causing "phantom faults". 24731bb76ff1Sjsg */ 24741bb76ff1Sjsg atomic_inc(&svms->drain_pagefaults); 24751bb76ff1Sjsg 24765ca02815Sjsg unmap_parent = start <= prange->start && last >= prange->last; 24775ca02815Sjsg 24785ca02815Sjsg list_for_each_entry(pchild, &prange->child_list, child_list) { 24795ca02815Sjsg mutex_lock_nested(&pchild->lock, 1); 24805ca02815Sjsg s = max(start, pchild->start); 24815ca02815Sjsg l = min(last, pchild->last); 24825ca02815Sjsg if (l >= s) 24831bb76ff1Sjsg svm_range_unmap_from_gpus(pchild, s, l, trigger); 24845ca02815Sjsg svm_range_unmap_split(mm, prange, pchild, start, last); 24855ca02815Sjsg mutex_unlock(&pchild->lock); 24865ca02815Sjsg } 24875ca02815Sjsg s = max(start, prange->start); 24885ca02815Sjsg l = min(last, prange->last); 24895ca02815Sjsg if (l >= s) 24901bb76ff1Sjsg svm_range_unmap_from_gpus(prange, s, l, trigger); 24915ca02815Sjsg svm_range_unmap_split(mm, prange, prange, start, last); 24925ca02815Sjsg 24935ca02815Sjsg if (unmap_parent) 24945ca02815Sjsg svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE); 24955ca02815Sjsg else 24965ca02815Sjsg svm_range_add_list_work(svms, prange, mm, 24975ca02815Sjsg SVM_OP_UPDATE_RANGE_NOTIFIER); 24985ca02815Sjsg schedule_deferred_list_work(svms); 24995ca02815Sjsg 25005ca02815Sjsg kfd_unref_process(p); 25015ca02815Sjsg } 25025ca02815Sjsg 25035ca02815Sjsg /** 25045ca02815Sjsg * svm_range_cpu_invalidate_pagetables - interval notifier callback 25051bb76ff1Sjsg * @mni: mmu_interval_notifier struct 25061bb76ff1Sjsg * @range: mmu_notifier_range struct 25071bb76ff1Sjsg * @cur_seq: value to pass to mmu_interval_set_seq() 25085ca02815Sjsg * 25095ca02815Sjsg * If event is MMU_NOTIFY_UNMAP, this is from CPU unmap range, otherwise, it 25105ca02815Sjsg * is from migration, or CPU page invalidation callback. 25115ca02815Sjsg * 25125ca02815Sjsg * For unmap event, unmap range from GPUs, remove prange from svms in a delayed 25135ca02815Sjsg * work thread, and split prange if only part of prange is unmapped. 25145ca02815Sjsg * 25155ca02815Sjsg * For invalidation event, if GPU retry fault is not enabled, evict the queues, 25165ca02815Sjsg * then schedule svm_range_restore_work to update GPU mapping and resume queues. 25175ca02815Sjsg * If GPU retry fault is enabled, unmap the svm range from GPU, retry fault will 25185ca02815Sjsg * update GPU mapping to recover. 
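 *
 * MMU_NOTIFY_RELEASE events are ignored here and the notifier returns
 * immediately, as does the case where the process mm already has no users
 * (mmget_not_zero() fails). For all other events the invalidated span is
 * clamped to this notifier's interval before the unmap or evict path runs.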
25195ca02815Sjsg * 25205ca02815Sjsg * Context: mmap lock, notifier_invalidate_start lock are held 25215ca02815Sjsg * for invalidate event, prange lock is held if this is from migration 25225ca02815Sjsg */ 25235ca02815Sjsg static bool 25245ca02815Sjsg svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, 25255ca02815Sjsg const struct mmu_notifier_range *range, 25265ca02815Sjsg unsigned long cur_seq) 25275ca02815Sjsg { 25285ca02815Sjsg struct svm_range *prange; 25295ca02815Sjsg unsigned long start; 25305ca02815Sjsg unsigned long last; 25315ca02815Sjsg 25325ca02815Sjsg if (range->event == MMU_NOTIFY_RELEASE) 25335ca02815Sjsg return true; 253440cf3bc8Sjsg if (!mmget_not_zero(mni->mm)) 253540cf3bc8Sjsg return true; 25365ca02815Sjsg 25375ca02815Sjsg start = mni->interval_tree.start; 25385ca02815Sjsg last = mni->interval_tree.last; 25391bb76ff1Sjsg start = max(start, range->start) >> PAGE_SHIFT; 25401bb76ff1Sjsg last = min(last, range->end - 1) >> PAGE_SHIFT; 25415ca02815Sjsg pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n", 25425ca02815Sjsg start, last, range->start >> PAGE_SHIFT, 25435ca02815Sjsg (range->end - 1) >> PAGE_SHIFT, 25445ca02815Sjsg mni->interval_tree.start >> PAGE_SHIFT, 25455ca02815Sjsg mni->interval_tree.last >> PAGE_SHIFT, range->event); 25465ca02815Sjsg 25475ca02815Sjsg prange = container_of(mni, struct svm_range, notifier); 25485ca02815Sjsg 25495ca02815Sjsg svm_range_lock(prange); 25505ca02815Sjsg mmu_interval_set_seq(mni, cur_seq); 25515ca02815Sjsg 25525ca02815Sjsg switch (range->event) { 25535ca02815Sjsg case MMU_NOTIFY_UNMAP: 25545ca02815Sjsg svm_range_unmap_from_cpu(mni->mm, prange, start, last); 25555ca02815Sjsg break; 25565ca02815Sjsg default: 25571bb76ff1Sjsg svm_range_evict(prange, mni->mm, start, last, range->event); 25585ca02815Sjsg break; 25595ca02815Sjsg } 25605ca02815Sjsg 25615ca02815Sjsg svm_range_unlock(prange); 256240cf3bc8Sjsg mmput(mni->mm); 25635ca02815Sjsg 25645ca02815Sjsg return true; 25655ca02815Sjsg } 25665ca02815Sjsg 25675ca02815Sjsg /** 25685ca02815Sjsg * svm_range_from_addr - find svm range from fault address 25695ca02815Sjsg * @svms: svm range list header 25705ca02815Sjsg * @addr: address to search range interval tree, in pages 25715ca02815Sjsg * @parent: parent range if range is on child list 25725ca02815Sjsg * 25735ca02815Sjsg * Context: The caller must hold svms->lock 25745ca02815Sjsg * 25755ca02815Sjsg * Return: the svm_range found or NULL 25765ca02815Sjsg */ 25775ca02815Sjsg struct svm_range * 25785ca02815Sjsg svm_range_from_addr(struct svm_range_list *svms, unsigned long addr, 25795ca02815Sjsg struct svm_range **parent) 25805ca02815Sjsg { 25815ca02815Sjsg struct interval_tree_node *node; 25825ca02815Sjsg struct svm_range *prange; 25835ca02815Sjsg struct svm_range *pchild; 25845ca02815Sjsg 25855ca02815Sjsg node = interval_tree_iter_first(&svms->objects, addr, addr); 25865ca02815Sjsg if (!node) 25875ca02815Sjsg return NULL; 25885ca02815Sjsg 25895ca02815Sjsg prange = container_of(node, struct svm_range, it_node); 25905ca02815Sjsg pr_debug("address 0x%lx prange [0x%lx 0x%lx] node [0x%lx 0x%lx]\n", 25915ca02815Sjsg addr, prange->start, prange->last, node->start, node->last); 25925ca02815Sjsg 25935ca02815Sjsg if (addr >= prange->start && addr <= prange->last) { 25945ca02815Sjsg if (parent) 25955ca02815Sjsg *parent = prange; 25965ca02815Sjsg return prange; 25975ca02815Sjsg } 25985ca02815Sjsg list_for_each_entry(pchild, &prange->child_list, child_list) 25995ca02815Sjsg if (addr >= pchild->start && addr <= 
pchild->last) { 26005ca02815Sjsg pr_debug("found address 0x%lx pchild [0x%lx 0x%lx]\n", 26015ca02815Sjsg addr, pchild->start, pchild->last); 26025ca02815Sjsg if (parent) 26035ca02815Sjsg *parent = prange; 26045ca02815Sjsg return pchild; 26055ca02815Sjsg } 26065ca02815Sjsg 26075ca02815Sjsg return NULL; 26085ca02815Sjsg } 26095ca02815Sjsg 26105ca02815Sjsg /* svm_range_best_restore_location - decide the best fault restore location 26115ca02815Sjsg * @prange: svm range structure 26125ca02815Sjsg * @adev: the GPU on which vm fault happened 26135ca02815Sjsg * 26145ca02815Sjsg * This is only called when xnack is on, to decide the best location to restore 26155ca02815Sjsg * the range mapping after GPU vm fault. Caller uses the best location to do 26165ca02815Sjsg * migration if actual loc is not best location, then update GPU page table 26175ca02815Sjsg * mapping to the best location. 26185ca02815Sjsg * 26191bb76ff1Sjsg * If the preferred loc is accessible by faulting GPU, use preferred loc. 26205ca02815Sjsg * If vm fault gpu idx is on range ACCESSIBLE bitmap, best_loc is vm fault gpu 26215ca02815Sjsg * If vm fault gpu idx is on range ACCESSIBLE_IN_PLACE bitmap, then 26225ca02815Sjsg * if range actual loc is cpu, best_loc is cpu 26235ca02815Sjsg * if vm fault gpu is on xgmi same hive of range actual loc gpu, best_loc is 26245ca02815Sjsg * range actual loc. 26255ca02815Sjsg * Otherwise, GPU no access, best_loc is -1. 26265ca02815Sjsg * 26275ca02815Sjsg * Return: 26285ca02815Sjsg * -1 means vm fault GPU no access 26295ca02815Sjsg * 0 for CPU or GPU id 26305ca02815Sjsg */ 26315ca02815Sjsg static int32_t 26325ca02815Sjsg svm_range_best_restore_location(struct svm_range *prange, 2633f005ef32Sjsg struct kfd_node *node, 26345ca02815Sjsg int32_t *gpuidx) 26355ca02815Sjsg { 2636f005ef32Sjsg struct kfd_node *bo_node, *preferred_node; 26375ca02815Sjsg struct kfd_process *p; 26385ca02815Sjsg uint32_t gpuid; 26395ca02815Sjsg int r; 26405ca02815Sjsg 26415ca02815Sjsg p = container_of(prange->svms, struct kfd_process, svms); 26425ca02815Sjsg 2643f005ef32Sjsg r = kfd_process_gpuid_from_node(p, node, &gpuid, gpuidx); 26445ca02815Sjsg if (r < 0) { 26455ca02815Sjsg pr_debug("failed to get gpuid from kgd\n"); 26465ca02815Sjsg return -1; 26475ca02815Sjsg } 26485ca02815Sjsg 2649f005ef32Sjsg if (node->adev->gmc.is_app_apu) 2650f005ef32Sjsg return 0; 2651f005ef32Sjsg 26521bb76ff1Sjsg if (prange->preferred_loc == gpuid || 26531bb76ff1Sjsg prange->preferred_loc == KFD_IOCTL_SVM_LOCATION_SYSMEM) { 26545ca02815Sjsg return prange->preferred_loc; 26551bb76ff1Sjsg } else if (prange->preferred_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED) { 2656f005ef32Sjsg preferred_node = svm_range_get_node_by_id(prange, prange->preferred_loc); 2657f005ef32Sjsg if (preferred_node && svm_nodes_in_same_hive(node, preferred_node)) 26581bb76ff1Sjsg return prange->preferred_loc; 26591bb76ff1Sjsg /* fall through */ 26601bb76ff1Sjsg } 26615ca02815Sjsg 26625ca02815Sjsg if (test_bit(*gpuidx, prange->bitmap_access)) 26635ca02815Sjsg return gpuid; 26645ca02815Sjsg 26655ca02815Sjsg if (test_bit(*gpuidx, prange->bitmap_aip)) { 26665ca02815Sjsg if (!prange->actual_loc) 26675ca02815Sjsg return 0; 26685ca02815Sjsg 2669f005ef32Sjsg bo_node = svm_range_get_node_by_id(prange, prange->actual_loc); 2670f005ef32Sjsg if (bo_node && svm_nodes_in_same_hive(node, bo_node)) 26715ca02815Sjsg return prange->actual_loc; 26725ca02815Sjsg else 26735ca02815Sjsg return 0; 26745ca02815Sjsg } 26755ca02815Sjsg 26765ca02815Sjsg return -1; 26775ca02815Sjsg } 26781bb76ff1Sjsg 
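/*
 * Worked example for svm_range_get_range_boundaries() below, using a
 * hypothetical fault address: for addr 0x1234 (in pages) the initial window
 * is aligned to 512 pages (2MB with 4KB pages, the 2UL << 8 below):
 *
 *	start_limit = ALIGN_DOWN(0x1234, 512) = 0x1200
 *	end_limit   = ALIGN(0x1234 + 1, 512)  = 0x1400, i.e. last = 0x13ff
 *
 * The window is then clipped to the faulting VMA and shrunk further so that
 * it does not overlap the registered ranges immediately before and after the
 * fault address.
 */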
26795ca02815Sjsg static int 26805ca02815Sjsg svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, 26811bb76ff1Sjsg unsigned long *start, unsigned long *last, 26821bb76ff1Sjsg bool *is_heap_stack) 26835ca02815Sjsg { 26845ca02815Sjsg struct vm_area_struct *vma; 26855ca02815Sjsg struct interval_tree_node *node; 26865cfb71bbSjsg struct rb_node *rb_node; 26875ca02815Sjsg unsigned long start_limit, end_limit; 26885ca02815Sjsg 2689f005ef32Sjsg vma = vma_lookup(p->mm, addr << PAGE_SHIFT); 2690f005ef32Sjsg if (!vma) { 26915ca02815Sjsg pr_debug("VMA does not exist in address [0x%llx]\n", addr); 26925ca02815Sjsg return -EFAULT; 26935ca02815Sjsg } 26941bb76ff1Sjsg 2695f005ef32Sjsg *is_heap_stack = vma_is_initial_heap(vma) || vma_is_initial_stack(vma); 26961bb76ff1Sjsg 26975ca02815Sjsg start_limit = max(vma->vm_start >> PAGE_SHIFT, 26985ca02815Sjsg (unsigned long)ALIGN_DOWN(addr, 2UL << 8)); 26995ca02815Sjsg end_limit = min(vma->vm_end >> PAGE_SHIFT, 27005ca02815Sjsg (unsigned long)ALIGN(addr + 1, 2UL << 8)); 27015ca02815Sjsg /* First range that starts after the fault address */ 27025ca02815Sjsg node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX); 27035ca02815Sjsg if (node) { 27045ca02815Sjsg end_limit = min(end_limit, node->start); 27055ca02815Sjsg /* Last range that ends before the fault address */ 27065cfb71bbSjsg rb_node = rb_prev(&node->rb); 27075ca02815Sjsg } else { 27085ca02815Sjsg /* Last range must end before addr because 27095ca02815Sjsg * there was no range after addr 27105ca02815Sjsg */ 27115cfb71bbSjsg rb_node = rb_last(&p->svms.objects.rb_root); 27125ca02815Sjsg } 27135cfb71bbSjsg if (rb_node) { 27145cfb71bbSjsg node = container_of(rb_node, struct interval_tree_node, rb); 27155ca02815Sjsg if (node->last >= addr) { 27165ca02815Sjsg WARN(1, "Overlap with prev node and page fault addr\n"); 27175ca02815Sjsg return -EFAULT; 27185ca02815Sjsg } 27195ca02815Sjsg start_limit = max(start_limit, node->last + 1); 27205ca02815Sjsg } 27215ca02815Sjsg 27225ca02815Sjsg *start = start_limit; 27235ca02815Sjsg *last = end_limit - 1; 27245ca02815Sjsg 27251bb76ff1Sjsg pr_debug("vma [0x%lx 0x%lx] range [0x%lx 0x%lx] is_heap_stack %d\n", 27261bb76ff1Sjsg vma->vm_start >> PAGE_SHIFT, vma->vm_end >> PAGE_SHIFT, 27271bb76ff1Sjsg *start, *last, *is_heap_stack); 27285ca02815Sjsg 27295ca02815Sjsg return 0; 27305ca02815Sjsg } 27311bb76ff1Sjsg 27321bb76ff1Sjsg static int 27331bb76ff1Sjsg svm_range_check_vm_userptr(struct kfd_process *p, uint64_t start, uint64_t last, 27341bb76ff1Sjsg uint64_t *bo_s, uint64_t *bo_l) 27351bb76ff1Sjsg { 27361bb76ff1Sjsg struct amdgpu_bo_va_mapping *mapping; 27371bb76ff1Sjsg struct interval_tree_node *node; 27381bb76ff1Sjsg struct amdgpu_bo *bo = NULL; 27391bb76ff1Sjsg unsigned long userptr; 27401bb76ff1Sjsg uint32_t i; 27411bb76ff1Sjsg int r; 27421bb76ff1Sjsg 27431bb76ff1Sjsg for (i = 0; i < p->n_pdds; i++) { 27441bb76ff1Sjsg struct amdgpu_vm *vm; 27451bb76ff1Sjsg 27461bb76ff1Sjsg if (!p->pdds[i]->drm_priv) 27471bb76ff1Sjsg continue; 27481bb76ff1Sjsg 27491bb76ff1Sjsg vm = drm_priv_to_vm(p->pdds[i]->drm_priv); 27501bb76ff1Sjsg r = amdgpu_bo_reserve(vm->root.bo, false); 27511bb76ff1Sjsg if (r) 27521bb76ff1Sjsg return r; 27531bb76ff1Sjsg 27541bb76ff1Sjsg /* Check userptr by searching entire vm->va interval tree */ 27551bb76ff1Sjsg node = interval_tree_iter_first(&vm->va, 0, ~0ULL); 27561bb76ff1Sjsg while (node) { 27571bb76ff1Sjsg mapping = container_of((struct rb_node *)node, 27581bb76ff1Sjsg struct amdgpu_bo_va_mapping, rb); 27591bb76ff1Sjsg bo = 
mapping->bo_va->base.bo; 27601bb76ff1Sjsg 27611bb76ff1Sjsg if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, 27621bb76ff1Sjsg start << PAGE_SHIFT, 27631bb76ff1Sjsg last << PAGE_SHIFT, 27641bb76ff1Sjsg &userptr)) { 27651bb76ff1Sjsg node = interval_tree_iter_next(node, 0, ~0ULL); 27661bb76ff1Sjsg continue; 27671bb76ff1Sjsg } 27681bb76ff1Sjsg 27691bb76ff1Sjsg pr_debug("[0x%llx 0x%llx] already userptr mapped\n", 27701bb76ff1Sjsg start, last); 27711bb76ff1Sjsg if (bo_s && bo_l) { 27721bb76ff1Sjsg *bo_s = userptr >> PAGE_SHIFT; 27731bb76ff1Sjsg *bo_l = *bo_s + bo->tbo.ttm->num_pages - 1; 27741bb76ff1Sjsg } 27751bb76ff1Sjsg amdgpu_bo_unreserve(vm->root.bo); 27761bb76ff1Sjsg return -EADDRINUSE; 27771bb76ff1Sjsg } 27781bb76ff1Sjsg amdgpu_bo_unreserve(vm->root.bo); 27791bb76ff1Sjsg } 27801bb76ff1Sjsg return 0; 27811bb76ff1Sjsg } 27821bb76ff1Sjsg 27835ca02815Sjsg static struct 2784f005ef32Sjsg svm_range *svm_range_create_unregistered_range(struct kfd_node *node, 27855ca02815Sjsg struct kfd_process *p, 27865ca02815Sjsg struct mm_struct *mm, 27875ca02815Sjsg int64_t addr) 27885ca02815Sjsg { 27895ca02815Sjsg struct svm_range *prange = NULL; 27905ca02815Sjsg unsigned long start, last; 27915ca02815Sjsg uint32_t gpuid, gpuidx; 27921bb76ff1Sjsg bool is_heap_stack; 27931bb76ff1Sjsg uint64_t bo_s = 0; 27941bb76ff1Sjsg uint64_t bo_l = 0; 27951bb76ff1Sjsg int r; 27965ca02815Sjsg 27971bb76ff1Sjsg if (svm_range_get_range_boundaries(p, addr, &start, &last, 27981bb76ff1Sjsg &is_heap_stack)) 27995ca02815Sjsg return NULL; 28005ca02815Sjsg 28011bb76ff1Sjsg r = svm_range_check_vm(p, start, last, &bo_s, &bo_l); 28021bb76ff1Sjsg if (r != -EADDRINUSE) 28031bb76ff1Sjsg r = svm_range_check_vm_userptr(p, start, last, &bo_s, &bo_l); 28041bb76ff1Sjsg 28051bb76ff1Sjsg if (r == -EADDRINUSE) { 28061bb76ff1Sjsg if (addr >= bo_s && addr <= bo_l) 28071bb76ff1Sjsg return NULL; 28081bb76ff1Sjsg 28091bb76ff1Sjsg /* Create one page svm range if 2MB range overlapping */ 28101bb76ff1Sjsg start = addr; 28111bb76ff1Sjsg last = addr; 28121bb76ff1Sjsg } 28131bb76ff1Sjsg 28141bb76ff1Sjsg prange = svm_range_new(&p->svms, start, last, true); 28155ca02815Sjsg if (!prange) { 28165ca02815Sjsg pr_debug("Failed to create prange in address [0x%llx]\n", addr); 28175ca02815Sjsg return NULL; 28185ca02815Sjsg } 2819f005ef32Sjsg if (kfd_process_gpuid_from_node(p, node, &gpuid, &gpuidx)) { 28205ca02815Sjsg pr_debug("failed to get gpuid from kgd\n"); 28211bb76ff1Sjsg svm_range_free(prange, true); 28225ca02815Sjsg return NULL; 28235ca02815Sjsg } 28245ca02815Sjsg 28251bb76ff1Sjsg if (is_heap_stack) 28261bb76ff1Sjsg prange->preferred_loc = KFD_IOCTL_SVM_LOCATION_SYSMEM; 28271bb76ff1Sjsg 28285ca02815Sjsg svm_range_add_to_svms(prange); 28295ca02815Sjsg svm_range_add_notifier_locked(mm, prange); 28305ca02815Sjsg 28315ca02815Sjsg return prange; 28325ca02815Sjsg } 28335ca02815Sjsg 28345ca02815Sjsg /* svm_range_skip_recover - decide if prange can be recovered 28355ca02815Sjsg * @prange: svm range structure 28365ca02815Sjsg * 28375ca02815Sjsg * GPU vm retry fault handle skip recover the range for cases: 28385ca02815Sjsg * 1. prange is on deferred list to be removed after unmap, it is stale fault, 28395ca02815Sjsg * deferred list work will drain the stale fault before free the prange. 28405ca02815Sjsg * 2. prange is on deferred list to add interval notifier after split, or 28415ca02815Sjsg * 3. prange is child range, it is split from parent prange, recover later 28425ca02815Sjsg * after interval notifier is added. 
28435ca02815Sjsg * 28445ca02815Sjsg * Return: true to skip recover, false to recover 28455ca02815Sjsg */ 28465ca02815Sjsg static bool svm_range_skip_recover(struct svm_range *prange) 28475ca02815Sjsg { 28485ca02815Sjsg struct svm_range_list *svms = prange->svms; 28495ca02815Sjsg 28505ca02815Sjsg spin_lock(&svms->deferred_list_lock); 28515ca02815Sjsg if (list_empty(&prange->deferred_list) && 28525ca02815Sjsg list_empty(&prange->child_list)) { 28535ca02815Sjsg spin_unlock(&svms->deferred_list_lock); 28545ca02815Sjsg return false; 28555ca02815Sjsg } 28565ca02815Sjsg spin_unlock(&svms->deferred_list_lock); 28575ca02815Sjsg 28585ca02815Sjsg if (prange->work_item.op == SVM_OP_UNMAP_RANGE) { 28595ca02815Sjsg pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] unmapped\n", 28605ca02815Sjsg svms, prange, prange->start, prange->last); 28615ca02815Sjsg return true; 28625ca02815Sjsg } 28635ca02815Sjsg if (prange->work_item.op == SVM_OP_ADD_RANGE_AND_MAP || 28645ca02815Sjsg prange->work_item.op == SVM_OP_ADD_RANGE) { 28655ca02815Sjsg pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not added yet\n", 28665ca02815Sjsg svms, prange, prange->start, prange->last); 28675ca02815Sjsg return true; 28685ca02815Sjsg } 28695ca02815Sjsg return false; 28705ca02815Sjsg } 28715ca02815Sjsg 28725ca02815Sjsg static void 2873f005ef32Sjsg svm_range_count_fault(struct kfd_node *node, struct kfd_process *p, 28745ca02815Sjsg int32_t gpuidx) 28755ca02815Sjsg { 28765ca02815Sjsg struct kfd_process_device *pdd; 28775ca02815Sjsg 28785ca02815Sjsg /* fault is on different page of same range 28795ca02815Sjsg * or fault is skipped to recover later 28805ca02815Sjsg * or fault is on invalid virtual address 28815ca02815Sjsg */ 28825ca02815Sjsg if (gpuidx == MAX_GPU_INSTANCE) { 28835ca02815Sjsg uint32_t gpuid; 28845ca02815Sjsg int r; 28855ca02815Sjsg 2886f005ef32Sjsg r = kfd_process_gpuid_from_node(p, node, &gpuid, &gpuidx); 28875ca02815Sjsg if (r < 0) 28885ca02815Sjsg return; 28895ca02815Sjsg } 28905ca02815Sjsg 28915ca02815Sjsg /* fault is recovered 28925ca02815Sjsg * or fault cannot recover because GPU no access on the range 28935ca02815Sjsg */ 28945ca02815Sjsg pdd = kfd_process_device_from_gpuidx(p, gpuidx); 28955ca02815Sjsg if (pdd) 28965ca02815Sjsg WRITE_ONCE(pdd->faults, pdd->faults + 1); 28975ca02815Sjsg } 28985ca02815Sjsg 28995ca02815Sjsg static bool 29001bb76ff1Sjsg svm_fault_allowed(struct vm_area_struct *vma, bool write_fault) 29015ca02815Sjsg { 29025ca02815Sjsg unsigned long requested = VM_READ; 29035ca02815Sjsg 29045ca02815Sjsg if (write_fault) 29055ca02815Sjsg requested |= VM_WRITE; 29065ca02815Sjsg 29075ca02815Sjsg pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested, 29085ca02815Sjsg vma->vm_flags); 29095ca02815Sjsg return (vma->vm_flags & requested) == requested; 29105ca02815Sjsg } 29115ca02815Sjsg 29125ca02815Sjsg int 29135ca02815Sjsg svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, 2914f005ef32Sjsg uint32_t vmid, uint32_t node_id, 29155ca02815Sjsg uint64_t addr, bool write_fault) 29165ca02815Sjsg { 29175ca02815Sjsg struct mm_struct *mm = NULL; 29185ca02815Sjsg struct svm_range_list *svms; 29195ca02815Sjsg struct svm_range *prange; 29205ca02815Sjsg struct kfd_process *p; 29211bb76ff1Sjsg ktime_t timestamp = ktime_get_boottime(); 2922f005ef32Sjsg struct kfd_node *node; 29235ca02815Sjsg int32_t best_loc; 29245ca02815Sjsg int32_t gpuidx = MAX_GPU_INSTANCE; 29255ca02815Sjsg bool write_locked = false; 29261bb76ff1Sjsg struct vm_area_struct *vma; 29271bb76ff1Sjsg bool migration = false; 
29285ca02815Sjsg 	int r = 0;
29295ca02815Sjsg
2930f005ef32Sjsg 	if (!KFD_IS_SVM_API_SUPPORTED(adev)) {
29315ca02815Sjsg 		pr_debug("device does not support SVM\n");
29325ca02815Sjsg 		return -EFAULT;
29335ca02815Sjsg 	}
29345ca02815Sjsg
29355ca02815Sjsg 	p = kfd_lookup_process_by_pasid(pasid);
29365ca02815Sjsg 	if (!p) {
29375ca02815Sjsg 		pr_debug("kfd process not found, pasid 0x%x\n", pasid);
29381bb76ff1Sjsg 		return 0;
29395ca02815Sjsg 	}
29405ca02815Sjsg 	svms = &p->svms;
29415ca02815Sjsg
29425ca02815Sjsg 	pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);
29435ca02815Sjsg
29441bb76ff1Sjsg 	if (atomic_read(&svms->drain_pagefaults)) {
29451bb76ff1Sjsg 		pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
29461bb76ff1Sjsg 		r = 0;
29471bb76ff1Sjsg 		goto out;
29481bb76ff1Sjsg 	}
29491bb76ff1Sjsg
29501bb76ff1Sjsg 	if (!p->xnack_enabled) {
29511bb76ff1Sjsg 		pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
29521bb76ff1Sjsg 		r = -EFAULT;
29531bb76ff1Sjsg 		goto out;
29541bb76ff1Sjsg 	}
29551bb76ff1Sjsg
29561bb76ff1Sjsg 	/* p->lead_thread is available as kfd_process_wq_release flushes the work
29571bb76ff1Sjsg 	 * before releasing the task reference.
29581bb76ff1Sjsg 	 */
29595ca02815Sjsg 	mm = get_task_mm(p->lead_thread);
29605ca02815Sjsg 	if (!mm) {
29615ca02815Sjsg 		pr_debug("svms 0x%p failed to get mm\n", svms);
29621bb76ff1Sjsg 		r = 0;
29635ca02815Sjsg 		goto out;
29645ca02815Sjsg 	}
29655ca02815Sjsg
2966f005ef32Sjsg 	node = kfd_node_by_irq_ids(adev, node_id, vmid);
2967f005ef32Sjsg 	if (!node) {
2968f005ef32Sjsg 		pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id,
2969f005ef32Sjsg 			 vmid);
2970f005ef32Sjsg 		r = -EFAULT;
2971f005ef32Sjsg 		goto out;
2972f005ef32Sjsg 	}
29735ca02815Sjsg 	mmap_read_lock(mm);
29745ca02815Sjsg retry_write_locked:
29755ca02815Sjsg 	mutex_lock(&svms->lock);
29765ca02815Sjsg 	prange = svm_range_from_addr(svms, addr, NULL);
29775ca02815Sjsg 	if (!prange) {
29785ca02815Sjsg 		pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
29795ca02815Sjsg 			 svms, addr);
29805ca02815Sjsg 		if (!write_locked) {
29815ca02815Sjsg 			/* Need the write lock to create new range with MMU notifier.
29825ca02815Sjsg * Also flush pending deferred work to make sure the interval 29835ca02815Sjsg * tree is up to date before we add a new range 29845ca02815Sjsg */ 29855ca02815Sjsg mutex_unlock(&svms->lock); 29865ca02815Sjsg mmap_read_unlock(mm); 29875ca02815Sjsg mmap_write_lock(mm); 29885ca02815Sjsg write_locked = true; 29895ca02815Sjsg goto retry_write_locked; 29905ca02815Sjsg } 2991f005ef32Sjsg prange = svm_range_create_unregistered_range(node, p, mm, addr); 29925ca02815Sjsg if (!prange) { 29935ca02815Sjsg pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n", 29945ca02815Sjsg svms, addr); 29955ca02815Sjsg mmap_write_downgrade(mm); 29965ca02815Sjsg r = -EFAULT; 29975ca02815Sjsg goto out_unlock_svms; 29985ca02815Sjsg } 29995ca02815Sjsg } 30005ca02815Sjsg if (write_locked) 30015ca02815Sjsg mmap_write_downgrade(mm); 30025ca02815Sjsg 30035ca02815Sjsg mutex_lock(&prange->migrate_mutex); 30045ca02815Sjsg 30055ca02815Sjsg if (svm_range_skip_recover(prange)) { 3006f005ef32Sjsg amdgpu_gmc_filter_faults_remove(node->adev, addr, pasid); 30071bb76ff1Sjsg r = 0; 30085ca02815Sjsg goto out_unlock_range; 30095ca02815Sjsg } 30105ca02815Sjsg 30115ca02815Sjsg /* skip duplicate vm fault on different pages of same range */ 30121bb76ff1Sjsg if (ktime_before(timestamp, ktime_add_ns(prange->validate_timestamp, 30131bb76ff1Sjsg AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) { 30145ca02815Sjsg pr_debug("svms 0x%p [0x%lx %lx] already restored\n", 30155ca02815Sjsg svms, prange->start, prange->last); 30161bb76ff1Sjsg r = 0; 30175ca02815Sjsg goto out_unlock_range; 30185ca02815Sjsg } 30195ca02815Sjsg 30201bb76ff1Sjsg /* __do_munmap removed VMA, return success as we are handling stale 30211bb76ff1Sjsg * retry fault. 30221bb76ff1Sjsg */ 3023f005ef32Sjsg vma = vma_lookup(mm, addr << PAGE_SHIFT); 3024f005ef32Sjsg if (!vma) { 30251bb76ff1Sjsg pr_debug("address 0x%llx VMA is removed\n", addr); 30261bb76ff1Sjsg r = 0; 30271bb76ff1Sjsg goto out_unlock_range; 30281bb76ff1Sjsg } 30291bb76ff1Sjsg 30301bb76ff1Sjsg if (!svm_fault_allowed(vma, write_fault)) { 30315ca02815Sjsg pr_debug("fault addr 0x%llx no %s permission\n", addr, 30325ca02815Sjsg write_fault ? 
"write" : "read"); 30335ca02815Sjsg r = -EPERM; 30345ca02815Sjsg goto out_unlock_range; 30355ca02815Sjsg } 30365ca02815Sjsg 3037f005ef32Sjsg best_loc = svm_range_best_restore_location(prange, node, &gpuidx); 30385ca02815Sjsg if (best_loc == -1) { 30395ca02815Sjsg pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n", 30405ca02815Sjsg svms, prange->start, prange->last); 30415ca02815Sjsg r = -EACCES; 30425ca02815Sjsg goto out_unlock_range; 30435ca02815Sjsg } 30445ca02815Sjsg 30455ca02815Sjsg pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n", 30465ca02815Sjsg svms, prange->start, prange->last, best_loc, 30475ca02815Sjsg prange->actual_loc); 30485ca02815Sjsg 3049f005ef32Sjsg kfd_smi_event_page_fault_start(node, p->lead_thread->pid, addr, 30501bb76ff1Sjsg write_fault, timestamp); 30511bb76ff1Sjsg 30525ca02815Sjsg if (prange->actual_loc != best_loc) { 30531bb76ff1Sjsg migration = true; 30545ca02815Sjsg if (best_loc) { 30551bb76ff1Sjsg r = svm_migrate_to_vram(prange, best_loc, mm, 30561bb76ff1Sjsg KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); 30575ca02815Sjsg if (r) { 30585ca02815Sjsg pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n", 30595ca02815Sjsg r, addr); 30605ca02815Sjsg /* Fallback to system memory if migration to 30615ca02815Sjsg * VRAM failed 30625ca02815Sjsg */ 30635ca02815Sjsg if (prange->actual_loc) 30641bb76ff1Sjsg r = svm_migrate_vram_to_ram(prange, mm, 30651bb76ff1Sjsg KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, 30661bb76ff1Sjsg NULL); 30675ca02815Sjsg else 30685ca02815Sjsg r = 0; 30695ca02815Sjsg } 30705ca02815Sjsg } else { 30711bb76ff1Sjsg r = svm_migrate_vram_to_ram(prange, mm, 30721bb76ff1Sjsg KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, 30731bb76ff1Sjsg NULL); 30745ca02815Sjsg } 30755ca02815Sjsg if (r) { 30765ca02815Sjsg pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", 30775ca02815Sjsg r, svms, prange->start, prange->last); 30785ca02815Sjsg goto out_unlock_range; 30795ca02815Sjsg } 30805ca02815Sjsg } 30815ca02815Sjsg 30821bb76ff1Sjsg r = svm_range_validate_and_map(mm, prange, gpuidx, false, false, false); 30835ca02815Sjsg if (r) 30845ca02815Sjsg pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n", 30855ca02815Sjsg r, svms, prange->start, prange->last); 30865ca02815Sjsg 3087f005ef32Sjsg kfd_smi_event_page_fault_end(node, p->lead_thread->pid, addr, 30881bb76ff1Sjsg migration); 30891bb76ff1Sjsg 30905ca02815Sjsg out_unlock_range: 30915ca02815Sjsg mutex_unlock(&prange->migrate_mutex); 30925ca02815Sjsg out_unlock_svms: 30935ca02815Sjsg mutex_unlock(&svms->lock); 30945ca02815Sjsg mmap_read_unlock(mm); 30955ca02815Sjsg 3096f005ef32Sjsg svm_range_count_fault(node, p, gpuidx); 30975ca02815Sjsg 30985ca02815Sjsg mmput(mm); 30995ca02815Sjsg out: 31005ca02815Sjsg kfd_unref_process(p); 31015ca02815Sjsg 31025ca02815Sjsg if (r == -EAGAIN) { 31035ca02815Sjsg pr_debug("recover vm fault later\n"); 3104f005ef32Sjsg amdgpu_gmc_filter_faults_remove(node->adev, addr, pasid); 31055ca02815Sjsg r = 0; 31065ca02815Sjsg } 31075ca02815Sjsg return r; 31085ca02815Sjsg } 31095ca02815Sjsg 31101bb76ff1Sjsg int 31111bb76ff1Sjsg svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool xnack_enabled) 31121bb76ff1Sjsg { 31131bb76ff1Sjsg struct svm_range *prange, *pchild; 31141bb76ff1Sjsg uint64_t reserved_size = 0; 31151bb76ff1Sjsg uint64_t size; 31161bb76ff1Sjsg int r = 0; 31171bb76ff1Sjsg 31181bb76ff1Sjsg pr_debug("switching xnack from %d to %d\n", p->xnack_enabled, xnack_enabled); 31191bb76ff1Sjsg 31201bb76ff1Sjsg mutex_lock(&p->svms.lock); 
31211bb76ff1Sjsg 31221bb76ff1Sjsg list_for_each_entry(prange, &p->svms.list, list) { 31231bb76ff1Sjsg svm_range_lock(prange); 31241bb76ff1Sjsg list_for_each_entry(pchild, &prange->child_list, child_list) { 31251bb76ff1Sjsg size = (pchild->last - pchild->start + 1) << PAGE_SHIFT; 31261bb76ff1Sjsg if (xnack_enabled) { 31271bb76ff1Sjsg amdgpu_amdkfd_unreserve_mem_limit(NULL, size, 3128f005ef32Sjsg KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0); 31291bb76ff1Sjsg } else { 31301bb76ff1Sjsg r = amdgpu_amdkfd_reserve_mem_limit(NULL, size, 3131f005ef32Sjsg KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0); 31321bb76ff1Sjsg if (r) 31331bb76ff1Sjsg goto out_unlock; 31341bb76ff1Sjsg reserved_size += size; 31351bb76ff1Sjsg } 31361bb76ff1Sjsg } 31371bb76ff1Sjsg 31381bb76ff1Sjsg size = (prange->last - prange->start + 1) << PAGE_SHIFT; 31391bb76ff1Sjsg if (xnack_enabled) { 31401bb76ff1Sjsg amdgpu_amdkfd_unreserve_mem_limit(NULL, size, 3141f005ef32Sjsg KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0); 31421bb76ff1Sjsg } else { 31431bb76ff1Sjsg r = amdgpu_amdkfd_reserve_mem_limit(NULL, size, 3144f005ef32Sjsg KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0); 31451bb76ff1Sjsg if (r) 31461bb76ff1Sjsg goto out_unlock; 31471bb76ff1Sjsg reserved_size += size; 31481bb76ff1Sjsg } 31491bb76ff1Sjsg out_unlock: 31501bb76ff1Sjsg svm_range_unlock(prange); 31511bb76ff1Sjsg if (r) 31521bb76ff1Sjsg break; 31531bb76ff1Sjsg } 31541bb76ff1Sjsg 31551bb76ff1Sjsg if (r) 31561bb76ff1Sjsg amdgpu_amdkfd_unreserve_mem_limit(NULL, reserved_size, 3157f005ef32Sjsg KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0); 31581bb76ff1Sjsg else 31591bb76ff1Sjsg /* Change xnack mode must be inside svms lock, to avoid race with 31601bb76ff1Sjsg * svm_range_deferred_list_work unreserve memory in parallel. 31611bb76ff1Sjsg */ 31621bb76ff1Sjsg p->xnack_enabled = xnack_enabled; 31631bb76ff1Sjsg 31641bb76ff1Sjsg mutex_unlock(&p->svms.lock); 31651bb76ff1Sjsg return r; 31661bb76ff1Sjsg } 31671bb76ff1Sjsg 31685ca02815Sjsg void svm_range_list_fini(struct kfd_process *p) 31695ca02815Sjsg { 31705ca02815Sjsg struct svm_range *prange; 31715ca02815Sjsg struct svm_range *next; 31725ca02815Sjsg 31735ca02815Sjsg pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms); 31745ca02815Sjsg 31751bb76ff1Sjsg cancel_delayed_work_sync(&p->svms.restore_work); 31761bb76ff1Sjsg 31775ca02815Sjsg /* Ensure list work is finished before process is destroyed */ 31785ca02815Sjsg flush_work(&p->svms.deferred_list_work); 31795ca02815Sjsg 31801bb76ff1Sjsg /* 31811bb76ff1Sjsg * Ensure no retry fault comes in afterwards, as page fault handler will 31821bb76ff1Sjsg * not find kfd process and take mm lock to recover fault. 
31831bb76ff1Sjsg */ 31841bb76ff1Sjsg atomic_inc(&p->svms.drain_pagefaults); 31851bb76ff1Sjsg svm_range_drain_retry_fault(&p->svms); 31861bb76ff1Sjsg 31875ca02815Sjsg list_for_each_entry_safe(prange, next, &p->svms.list, list) { 31885ca02815Sjsg svm_range_unlink(prange); 31895ca02815Sjsg svm_range_remove_notifier(prange); 31901bb76ff1Sjsg svm_range_free(prange, true); 31915ca02815Sjsg } 31925ca02815Sjsg 31935ca02815Sjsg mutex_destroy(&p->svms.lock); 31945ca02815Sjsg 31955ca02815Sjsg pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms); 31965ca02815Sjsg } 31975ca02815Sjsg 31985ca02815Sjsg int svm_range_list_init(struct kfd_process *p) 31995ca02815Sjsg { 32005ca02815Sjsg struct svm_range_list *svms = &p->svms; 32015ca02815Sjsg int i; 32025ca02815Sjsg 32035ca02815Sjsg svms->objects = RB_ROOT_CACHED; 32045ca02815Sjsg mutex_init(&svms->lock); 32055ca02815Sjsg INIT_LIST_HEAD(&svms->list); 32065ca02815Sjsg atomic_set(&svms->evicted_ranges, 0); 32071bb76ff1Sjsg atomic_set(&svms->drain_pagefaults, 0); 32085ca02815Sjsg INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work); 32095ca02815Sjsg INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work); 32105ca02815Sjsg INIT_LIST_HEAD(&svms->deferred_range_list); 32111bb76ff1Sjsg INIT_LIST_HEAD(&svms->criu_svm_metadata_list); 32125ca02815Sjsg spin_lock_init(&svms->deferred_list_lock); 32135ca02815Sjsg 32145ca02815Sjsg for (i = 0; i < p->n_pdds; i++) 3215f005ef32Sjsg if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev->adev)) 32165ca02815Sjsg bitmap_set(svms->bitmap_supported, i, 1); 32175ca02815Sjsg 32185ca02815Sjsg return 0; 32195ca02815Sjsg } 32205ca02815Sjsg 32215ca02815Sjsg /** 32221bb76ff1Sjsg * svm_range_check_vm - check if virtual address range mapped already 32231bb76ff1Sjsg * @p: current kfd_process 32241bb76ff1Sjsg * @start: range start address, in pages 32251bb76ff1Sjsg * @last: range last address, in pages 32261bb76ff1Sjsg * @bo_s: mapping start address in pages if address range already mapped 32271bb76ff1Sjsg * @bo_l: mapping last address in pages if address range already mapped 32281bb76ff1Sjsg * 32291bb76ff1Sjsg * The purpose is to avoid virtual address ranges already allocated by 32301bb76ff1Sjsg * kfd_ioctl_alloc_memory_of_gpu ioctl. 32311bb76ff1Sjsg * It looks for each pdd in the kfd_process. 32321bb76ff1Sjsg * 32331bb76ff1Sjsg * Context: Process context 32341bb76ff1Sjsg * 32351bb76ff1Sjsg * Return 0 - OK, if the range is not mapped. 32361bb76ff1Sjsg * Otherwise error code: 32371bb76ff1Sjsg * -EADDRINUSE - if address is mapped already by kfd_ioctl_alloc_memory_of_gpu 32381bb76ff1Sjsg * -ERESTARTSYS - A wait for the buffer to become unreserved was interrupted by 32391bb76ff1Sjsg * a signal. Release all buffer reservations and return to user-space. 
32401bb76ff1Sjsg */ 32411bb76ff1Sjsg static int 32421bb76ff1Sjsg svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last, 32431bb76ff1Sjsg uint64_t *bo_s, uint64_t *bo_l) 32441bb76ff1Sjsg { 32451bb76ff1Sjsg struct amdgpu_bo_va_mapping *mapping; 32461bb76ff1Sjsg struct interval_tree_node *node; 32471bb76ff1Sjsg uint32_t i; 32481bb76ff1Sjsg int r; 32491bb76ff1Sjsg 32501bb76ff1Sjsg for (i = 0; i < p->n_pdds; i++) { 32511bb76ff1Sjsg struct amdgpu_vm *vm; 32521bb76ff1Sjsg 32531bb76ff1Sjsg if (!p->pdds[i]->drm_priv) 32541bb76ff1Sjsg continue; 32551bb76ff1Sjsg 32561bb76ff1Sjsg vm = drm_priv_to_vm(p->pdds[i]->drm_priv); 32571bb76ff1Sjsg r = amdgpu_bo_reserve(vm->root.bo, false); 32581bb76ff1Sjsg if (r) 32591bb76ff1Sjsg return r; 32601bb76ff1Sjsg 32611bb76ff1Sjsg node = interval_tree_iter_first(&vm->va, start, last); 32621bb76ff1Sjsg if (node) { 32631bb76ff1Sjsg pr_debug("range [0x%llx 0x%llx] already TTM mapped\n", 32641bb76ff1Sjsg start, last); 32651bb76ff1Sjsg mapping = container_of((struct rb_node *)node, 32661bb76ff1Sjsg struct amdgpu_bo_va_mapping, rb); 32671bb76ff1Sjsg if (bo_s && bo_l) { 32681bb76ff1Sjsg *bo_s = mapping->start; 32691bb76ff1Sjsg *bo_l = mapping->last; 32701bb76ff1Sjsg } 32711bb76ff1Sjsg amdgpu_bo_unreserve(vm->root.bo); 32721bb76ff1Sjsg return -EADDRINUSE; 32731bb76ff1Sjsg } 32741bb76ff1Sjsg amdgpu_bo_unreserve(vm->root.bo); 32751bb76ff1Sjsg } 32761bb76ff1Sjsg 32771bb76ff1Sjsg return 0; 32781bb76ff1Sjsg } 32791bb76ff1Sjsg 32801bb76ff1Sjsg /** 32815ca02815Sjsg * svm_range_is_valid - check if virtual address range is valid 32821bb76ff1Sjsg * @p: current kfd_process 32835ca02815Sjsg * @start: range start address, in pages 32845ca02815Sjsg * @size: range size, in pages 32855ca02815Sjsg * 32865ca02815Sjsg * Valid virtual address range means it belongs to one or more VMAs 32875ca02815Sjsg * 32885ca02815Sjsg * Context: Process context 32895ca02815Sjsg * 32905ca02815Sjsg * Return: 32911bb76ff1Sjsg * 0 - OK, otherwise error code 32925ca02815Sjsg */ 32931bb76ff1Sjsg static int 32941bb76ff1Sjsg svm_range_is_valid(struct kfd_process *p, uint64_t start, uint64_t size) 32955ca02815Sjsg { 32965ca02815Sjsg const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; 32975ca02815Sjsg struct vm_area_struct *vma; 32985ca02815Sjsg unsigned long end; 32991bb76ff1Sjsg unsigned long start_unchg = start; 33005ca02815Sjsg 33015ca02815Sjsg start <<= PAGE_SHIFT; 33025ca02815Sjsg end = start + (size << PAGE_SHIFT); 33035ca02815Sjsg do { 3304f005ef32Sjsg vma = vma_lookup(p->mm, start); 3305f005ef32Sjsg if (!vma || (vma->vm_flags & device_vma)) 33061bb76ff1Sjsg return -EFAULT; 33075ca02815Sjsg start = min(end, vma->vm_end); 33085ca02815Sjsg } while (start < end); 33095ca02815Sjsg 33101bb76ff1Sjsg return svm_range_check_vm(p, start_unchg, (end - 1) >> PAGE_SHIFT, NULL, 33111bb76ff1Sjsg NULL); 33125ca02815Sjsg } 33135ca02815Sjsg 33145ca02815Sjsg /** 33155ca02815Sjsg * svm_range_best_prefetch_location - decide the best prefetch location 33165ca02815Sjsg * @prange: svm range structure 33175ca02815Sjsg * 33185ca02815Sjsg * For xnack off: 33195ca02815Sjsg * If range map to single GPU, the best prefetch location is prefetch_loc, which 33205ca02815Sjsg * can be CPU or GPU. 
33215ca02815Sjsg * 33225ca02815Sjsg * If range is ACCESS or ACCESS_IN_PLACE by mGPUs, only if mGPU connection on 33235ca02815Sjsg * XGMI same hive, the best prefetch location is prefetch_loc GPU, othervise 33245ca02815Sjsg * the best prefetch location is always CPU, because GPU can not have coherent 33255ca02815Sjsg * mapping VRAM of other GPUs even with large-BAR PCIe connection. 33265ca02815Sjsg * 33275ca02815Sjsg * For xnack on: 33285ca02815Sjsg * If range is not ACCESS_IN_PLACE by mGPUs, the best prefetch location is 33295ca02815Sjsg * prefetch_loc, other GPU access will generate vm fault and trigger migration. 33305ca02815Sjsg * 33315ca02815Sjsg * If range is ACCESS_IN_PLACE by mGPUs, only if mGPU connection on XGMI same 33325ca02815Sjsg * hive, the best prefetch location is prefetch_loc GPU, otherwise the best 33335ca02815Sjsg * prefetch location is always CPU. 33345ca02815Sjsg * 33355ca02815Sjsg * Context: Process context 33365ca02815Sjsg * 33375ca02815Sjsg * Return: 33385ca02815Sjsg * 0 for CPU or GPU id 33395ca02815Sjsg */ 33405ca02815Sjsg static uint32_t 33415ca02815Sjsg svm_range_best_prefetch_location(struct svm_range *prange) 33425ca02815Sjsg { 33435ca02815Sjsg DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); 33445ca02815Sjsg uint32_t best_loc = prange->prefetch_loc; 33455ca02815Sjsg struct kfd_process_device *pdd; 3346f005ef32Sjsg struct kfd_node *bo_node; 33475ca02815Sjsg struct kfd_process *p; 33485ca02815Sjsg uint32_t gpuidx; 33495ca02815Sjsg 33505ca02815Sjsg p = container_of(prange->svms, struct kfd_process, svms); 33515ca02815Sjsg 33525ca02815Sjsg if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED) 33535ca02815Sjsg goto out; 33545ca02815Sjsg 3355f005ef32Sjsg bo_node = svm_range_get_node_by_id(prange, best_loc); 3356f005ef32Sjsg if (!bo_node) { 3357f005ef32Sjsg WARN_ONCE(1, "failed to get valid kfd node at id%x\n", best_loc); 3358f005ef32Sjsg best_loc = 0; 3359f005ef32Sjsg goto out; 3360f005ef32Sjsg } 3361f005ef32Sjsg 3362f005ef32Sjsg if (bo_node->adev->gmc.is_app_apu) { 33635ca02815Sjsg best_loc = 0; 33645ca02815Sjsg goto out; 33655ca02815Sjsg } 33665ca02815Sjsg 33675ca02815Sjsg if (p->xnack_enabled) 33685ca02815Sjsg bitmap_copy(bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE); 33695ca02815Sjsg else 33705ca02815Sjsg bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip, 33715ca02815Sjsg MAX_GPU_INSTANCE); 33725ca02815Sjsg 33735ca02815Sjsg for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { 33745ca02815Sjsg pdd = kfd_process_device_from_gpuidx(p, gpuidx); 33755ca02815Sjsg if (!pdd) { 33765ca02815Sjsg pr_debug("failed to get device by idx 0x%x\n", gpuidx); 33775ca02815Sjsg continue; 33785ca02815Sjsg } 33795ca02815Sjsg 3380f005ef32Sjsg if (pdd->dev->adev == bo_node->adev) 33815ca02815Sjsg continue; 33825ca02815Sjsg 3383f005ef32Sjsg if (!svm_nodes_in_same_hive(pdd->dev, bo_node)) { 33845ca02815Sjsg best_loc = 0; 33855ca02815Sjsg break; 33865ca02815Sjsg } 33875ca02815Sjsg } 33885ca02815Sjsg 33895ca02815Sjsg out: 33905ca02815Sjsg pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n", 33915ca02815Sjsg p->xnack_enabled, &p->svms, prange->start, prange->last, 33925ca02815Sjsg best_loc); 33935ca02815Sjsg 33945ca02815Sjsg return best_loc; 33955ca02815Sjsg } 33965ca02815Sjsg 33975ca02815Sjsg /* svm_range_trigger_migration - start page migration if prefetch loc changed 33985ca02815Sjsg * @mm: current process mm_struct 33995ca02815Sjsg * @prange: svm range structure 34005ca02815Sjsg * @migrated: output, true if migration is triggered 34015ca02815Sjsg * 34025ca02815Sjsg * 
If range prefetch_loc is GPU, actual loc is cpu 0, then migrate the range
34035ca02815Sjsg  * from ram to vram.
34045ca02815Sjsg  * If range prefetch_loc is cpu 0, actual loc is GPU, then migrate the range
34055ca02815Sjsg  * from vram to ram.
34065ca02815Sjsg  *
34075ca02815Sjsg  * If GPU vm fault retry is not enabled, migration interacts with the MMU notifier
34085ca02815Sjsg  * and restore work:
34095ca02815Sjsg  * 1. migrate_vma_setup invalidates pages, MMU notifier callback svm_range_evict
34105ca02815Sjsg  *    stops all queues, schedules restore work
34115ca02815Sjsg  * 2. svm_range_restore_work waits for the migration to finish by
34125ca02815Sjsg  *    a. svm_range_validate_vram takes prange->migrate_mutex
34135ca02815Sjsg  *    b. svm_range_validate_ram HMM get pages waits for the CPU fault handler to return
34145ca02815Sjsg  * 3. restore work updates GPU mappings and resumes all queues.
34155ca02815Sjsg  *
34165ca02815Sjsg  * Context: Process context
34175ca02815Sjsg  *
34185ca02815Sjsg  * Return:
34195ca02815Sjsg  * 0 - OK, otherwise - error code of migration
34205ca02815Sjsg  */
34215ca02815Sjsg static int
34225ca02815Sjsg svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
34235ca02815Sjsg 			    bool *migrated)
34245ca02815Sjsg {
34255ca02815Sjsg 	uint32_t best_loc;
34265ca02815Sjsg 	int r = 0;
34275ca02815Sjsg
34285ca02815Sjsg 	*migrated = false;
34295ca02815Sjsg 	best_loc = svm_range_best_prefetch_location(prange);
34305ca02815Sjsg
34315ca02815Sjsg 	if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
34325ca02815Sjsg 	    best_loc == prange->actual_loc)
34335ca02815Sjsg 		return 0;
34345ca02815Sjsg
34355ca02815Sjsg 	if (!best_loc) {
34361bb76ff1Sjsg 		r = svm_migrate_vram_to_ram(prange, mm,
34371bb76ff1Sjsg 					KFD_MIGRATE_TRIGGER_PREFETCH, NULL);
34385ca02815Sjsg 		*migrated = !r;
34395ca02815Sjsg 		return r;
34405ca02815Sjsg 	}
34415ca02815Sjsg
34421bb76ff1Sjsg 	r = svm_migrate_to_vram(prange, best_loc, mm, KFD_MIGRATE_TRIGGER_PREFETCH);
34435ca02815Sjsg 	*migrated = !r;
34445ca02815Sjsg
3445b6128eb2Sjsg 	return 0;
34465ca02815Sjsg }
34475ca02815Sjsg
34485ca02815Sjsg int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence)
34495ca02815Sjsg {
34506982f73fSjsg 	/* Dereferencing fence->svm_bo is safe here because the fence hasn't
34516982f73fSjsg 	 * signaled yet and we're under the protection of the fence->lock.
34526982f73fSjsg 	 * After the fence is signaled in svm_range_bo_release, we cannot get
34536982f73fSjsg 	 * here any more.
34546982f73fSjsg 	 *
34556982f73fSjsg 	 * Reference is dropped in svm_range_evict_svm_bo_worker.
34566982f73fSjsg */ 34576982f73fSjsg if (svm_bo_ref_unless_zero(fence->svm_bo)) { 34585ca02815Sjsg WRITE_ONCE(fence->svm_bo->evicting, 1); 34595ca02815Sjsg schedule_work(&fence->svm_bo->eviction_work); 34605ca02815Sjsg } 34615ca02815Sjsg 34625ca02815Sjsg return 0; 34635ca02815Sjsg } 34645ca02815Sjsg 34655ca02815Sjsg static void svm_range_evict_svm_bo_worker(struct work_struct *work) 34665ca02815Sjsg { 34675ca02815Sjsg struct svm_range_bo *svm_bo; 34685ca02815Sjsg struct mm_struct *mm; 34691bb76ff1Sjsg int r = 0; 34705ca02815Sjsg 34715ca02815Sjsg svm_bo = container_of(work, struct svm_range_bo, eviction_work); 34725ca02815Sjsg 34731bb76ff1Sjsg if (mmget_not_zero(svm_bo->eviction_fence->mm)) { 34741bb76ff1Sjsg mm = svm_bo->eviction_fence->mm; 34751bb76ff1Sjsg } else { 34761bb76ff1Sjsg svm_range_bo_unref(svm_bo); 34775ca02815Sjsg return; 34781bb76ff1Sjsg } 34795ca02815Sjsg 34805ca02815Sjsg mmap_read_lock(mm); 34815ca02815Sjsg spin_lock(&svm_bo->list_lock); 34821bb76ff1Sjsg while (!list_empty(&svm_bo->range_list) && !r) { 34835ca02815Sjsg struct svm_range *prange = 34845ca02815Sjsg list_first_entry(&svm_bo->range_list, 34855ca02815Sjsg struct svm_range, svm_bo_list); 34861bb76ff1Sjsg int retries = 3; 34871bb76ff1Sjsg 34885ca02815Sjsg list_del_init(&prange->svm_bo_list); 34895ca02815Sjsg spin_unlock(&svm_bo->list_lock); 34905ca02815Sjsg 34915ca02815Sjsg pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, 34925ca02815Sjsg prange->start, prange->last); 34935ca02815Sjsg 34945ca02815Sjsg mutex_lock(&prange->migrate_mutex); 34951bb76ff1Sjsg do { 34961bb76ff1Sjsg r = svm_migrate_vram_to_ram(prange, mm, 34971bb76ff1Sjsg KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL); 34981bb76ff1Sjsg } while (!r && prange->actual_loc && --retries); 34995ca02815Sjsg 35001bb76ff1Sjsg if (!r && prange->actual_loc) 35011bb76ff1Sjsg pr_info_once("Migration failed during eviction"); 35021bb76ff1Sjsg 35031bb76ff1Sjsg if (!prange->actual_loc) { 35045ca02815Sjsg mutex_lock(&prange->lock); 35055ca02815Sjsg prange->svm_bo = NULL; 35065ca02815Sjsg mutex_unlock(&prange->lock); 35071bb76ff1Sjsg } 35085ca02815Sjsg mutex_unlock(&prange->migrate_mutex); 35095ca02815Sjsg 35105ca02815Sjsg spin_lock(&svm_bo->list_lock); 35115ca02815Sjsg } 35125ca02815Sjsg spin_unlock(&svm_bo->list_lock); 35135ca02815Sjsg mmap_read_unlock(mm); 35141bb76ff1Sjsg mmput(mm); 35155ca02815Sjsg 35165ca02815Sjsg dma_fence_signal(&svm_bo->eviction_fence->base); 35171bb76ff1Sjsg 35185ca02815Sjsg /* This is the last reference to svm_bo, after svm_range_vram_node_free 35195ca02815Sjsg * has been called in svm_migrate_vram_to_ram 35205ca02815Sjsg */ 35211bb76ff1Sjsg WARN_ONCE(!r && kref_read(&svm_bo->kref) != 1, "This was not the last reference\n"); 35225ca02815Sjsg svm_range_bo_unref(svm_bo); 35235ca02815Sjsg } 35245ca02815Sjsg 35255ca02815Sjsg static int 35261bb76ff1Sjsg svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm, 35271bb76ff1Sjsg uint64_t start, uint64_t size, uint32_t nattr, 35281bb76ff1Sjsg struct kfd_ioctl_svm_attribute *attrs) 35295ca02815Sjsg { 35301bb76ff1Sjsg struct amdkfd_process_info *process_info = p->kgd_process_info; 35315ca02815Sjsg struct list_head update_list; 35325ca02815Sjsg struct list_head insert_list; 35335ca02815Sjsg struct list_head remove_list; 35345ca02815Sjsg struct svm_range_list *svms; 35355ca02815Sjsg struct svm_range *prange; 35365ca02815Sjsg struct svm_range *next; 35371bb76ff1Sjsg bool update_mapping = false; 35381bb76ff1Sjsg bool flush_tlb; 3539f005ef32Sjsg int r, ret = 0; 35405ca02815Sjsg 35415ca02815Sjsg 
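	/* Overall flow: validate the attributes and the address range, apply
	 * the range splits and additions as a transaction under the mmap
	 * write lock and svms->lock, then downgrade to the mmap read lock to
	 * trigger migrations and revalidate/map the updated ranges.
	 */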
pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n", 35425ca02815Sjsg p->pasid, &p->svms, start, start + size - 1, size); 35435ca02815Sjsg 35445ca02815Sjsg r = svm_range_check_attr(p, nattr, attrs); 35455ca02815Sjsg if (r) 35465ca02815Sjsg return r; 35475ca02815Sjsg 35485ca02815Sjsg svms = &p->svms; 35495ca02815Sjsg 35501bb76ff1Sjsg mutex_lock(&process_info->lock); 35511bb76ff1Sjsg 35525ca02815Sjsg svm_range_list_lock_and_flush_work(svms, mm); 35535ca02815Sjsg 35541bb76ff1Sjsg r = svm_range_is_valid(p, start, size); 35551bb76ff1Sjsg if (r) { 35561bb76ff1Sjsg pr_debug("invalid range r=%d\n", r); 35575ca02815Sjsg mmap_write_unlock(mm); 35585ca02815Sjsg goto out; 35595ca02815Sjsg } 35605ca02815Sjsg 35615ca02815Sjsg mutex_lock(&svms->lock); 35625ca02815Sjsg 35635ca02815Sjsg /* Add new range and split existing ranges as needed */ 35645ca02815Sjsg r = svm_range_add(p, start, size, nattr, attrs, &update_list, 35655ca02815Sjsg &insert_list, &remove_list); 35665ca02815Sjsg if (r) { 35675ca02815Sjsg mutex_unlock(&svms->lock); 35685ca02815Sjsg mmap_write_unlock(mm); 35695ca02815Sjsg goto out; 35705ca02815Sjsg } 35715ca02815Sjsg /* Apply changes as a transaction */ 35721bb76ff1Sjsg list_for_each_entry_safe(prange, next, &insert_list, list) { 35735ca02815Sjsg svm_range_add_to_svms(prange); 35745ca02815Sjsg svm_range_add_notifier_locked(mm, prange); 35755ca02815Sjsg } 35765ca02815Sjsg list_for_each_entry(prange, &update_list, update_list) { 35771bb76ff1Sjsg svm_range_apply_attrs(p, prange, nattr, attrs, &update_mapping); 35785ca02815Sjsg /* TODO: unmap ranges from GPU that lost access */ 35795ca02815Sjsg } 35801bb76ff1Sjsg list_for_each_entry_safe(prange, next, &remove_list, update_list) { 35815ca02815Sjsg pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n", 35825ca02815Sjsg prange->svms, prange, prange->start, 35835ca02815Sjsg prange->last); 35845ca02815Sjsg svm_range_unlink(prange); 35855ca02815Sjsg svm_range_remove_notifier(prange); 35861bb76ff1Sjsg svm_range_free(prange, false); 35875ca02815Sjsg } 35885ca02815Sjsg 35895ca02815Sjsg mmap_write_downgrade(mm); 35905ca02815Sjsg /* Trigger migrations and revalidate and map to GPUs as needed. If 35915ca02815Sjsg * this fails we may be left with partially completed actions. There 35925ca02815Sjsg * is no clean way of rolling back to the previous state in such a 35935ca02815Sjsg * case because the rollback wouldn't be guaranteed to work either. 
35945ca02815Sjsg */ 35955ca02815Sjsg list_for_each_entry(prange, &update_list, update_list) { 35965ca02815Sjsg bool migrated; 35975ca02815Sjsg 35985ca02815Sjsg mutex_lock(&prange->migrate_mutex); 35995ca02815Sjsg 36005ca02815Sjsg r = svm_range_trigger_migration(mm, prange, &migrated); 36015ca02815Sjsg if (r) 36025ca02815Sjsg goto out_unlock_range; 36035ca02815Sjsg 36041bb76ff1Sjsg if (migrated && (!p->xnack_enabled || 36051bb76ff1Sjsg (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) && 36061bb76ff1Sjsg prange->mapped_to_gpu) { 36075ca02815Sjsg pr_debug("restore_work will update mappings of GPUs\n"); 36085ca02815Sjsg mutex_unlock(&prange->migrate_mutex); 36095ca02815Sjsg continue; 36105ca02815Sjsg } 36115ca02815Sjsg 36121bb76ff1Sjsg if (!migrated && !update_mapping) { 36131bb76ff1Sjsg mutex_unlock(&prange->migrate_mutex); 36141bb76ff1Sjsg continue; 36151bb76ff1Sjsg } 36161bb76ff1Sjsg 36171bb76ff1Sjsg flush_tlb = !migrated && update_mapping && prange->mapped_to_gpu; 36181bb76ff1Sjsg 36195ca02815Sjsg r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, 36201bb76ff1Sjsg true, true, flush_tlb); 36215ca02815Sjsg if (r) 36225ca02815Sjsg pr_debug("failed %d to map svm range\n", r); 36235ca02815Sjsg 36245ca02815Sjsg out_unlock_range: 36255ca02815Sjsg mutex_unlock(&prange->migrate_mutex); 36265ca02815Sjsg if (r) 3627f005ef32Sjsg ret = r; 36285ca02815Sjsg } 36295ca02815Sjsg 3630f005ef32Sjsg dynamic_svm_range_dump(svms); 36315ca02815Sjsg 36325ca02815Sjsg mutex_unlock(&svms->lock); 36335ca02815Sjsg mmap_read_unlock(mm); 36345ca02815Sjsg out: 36351bb76ff1Sjsg mutex_unlock(&process_info->lock); 36361bb76ff1Sjsg 36375ca02815Sjsg pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid, 36385ca02815Sjsg &p->svms, start, start + size - 1, r); 36395ca02815Sjsg 3640f005ef32Sjsg return ret ? ret : r; 36415ca02815Sjsg } 36425ca02815Sjsg 36435ca02815Sjsg static int 36441bb76ff1Sjsg svm_range_get_attr(struct kfd_process *p, struct mm_struct *mm, 36451bb76ff1Sjsg uint64_t start, uint64_t size, uint32_t nattr, 36461bb76ff1Sjsg struct kfd_ioctl_svm_attribute *attrs) 36475ca02815Sjsg { 36485ca02815Sjsg DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE); 36495ca02815Sjsg DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE); 36505ca02815Sjsg bool get_preferred_loc = false; 36515ca02815Sjsg bool get_prefetch_loc = false; 36525ca02815Sjsg bool get_granularity = false; 36535ca02815Sjsg bool get_accessible = false; 36545ca02815Sjsg bool get_flags = false; 36555ca02815Sjsg uint64_t last = start + size - 1UL; 36565ca02815Sjsg uint8_t granularity = 0xff; 36575ca02815Sjsg struct interval_tree_node *node; 36585ca02815Sjsg struct svm_range_list *svms; 36595ca02815Sjsg struct svm_range *prange; 36605ca02815Sjsg uint32_t prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 36615ca02815Sjsg uint32_t location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 36625ca02815Sjsg uint32_t flags_and = 0xffffffff; 36635ca02815Sjsg uint32_t flags_or = 0; 36645ca02815Sjsg int gpuidx; 36655ca02815Sjsg uint32_t i; 36661bb76ff1Sjsg int r = 0; 36675ca02815Sjsg 36685ca02815Sjsg pr_debug("svms 0x%p [0x%llx 0x%llx] nattr 0x%x\n", &p->svms, start, 36695ca02815Sjsg start + size - 1, nattr); 36705ca02815Sjsg 36715ca02815Sjsg /* Flush pending deferred work to avoid racing with deferred actions from 36725ca02815Sjsg * previous memory map changes (e.g. munmap). Concurrent memory map changes 36735ca02815Sjsg * can still race with get_attr because we don't hold the mmap lock. 
But that 36745ca02815Sjsg * would be a race condition in the application anyway, and undefined 36755ca02815Sjsg * behaviour is acceptable in that case. 36765ca02815Sjsg */ 36775ca02815Sjsg flush_work(&p->svms.deferred_list_work); 36785ca02815Sjsg 36795ca02815Sjsg mmap_read_lock(mm); 36801bb76ff1Sjsg r = svm_range_is_valid(p, start, size); 36815ca02815Sjsg mmap_read_unlock(mm); 36821bb76ff1Sjsg if (r) { 36831bb76ff1Sjsg pr_debug("invalid range r=%d\n", r); 36841bb76ff1Sjsg return r; 36855ca02815Sjsg } 36865ca02815Sjsg 36875ca02815Sjsg for (i = 0; i < nattr; i++) { 36885ca02815Sjsg switch (attrs[i].type) { 36895ca02815Sjsg case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 36905ca02815Sjsg get_preferred_loc = true; 36915ca02815Sjsg break; 36925ca02815Sjsg case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 36935ca02815Sjsg get_prefetch_loc = true; 36945ca02815Sjsg break; 36955ca02815Sjsg case KFD_IOCTL_SVM_ATTR_ACCESS: 36965ca02815Sjsg get_accessible = true; 36975ca02815Sjsg break; 36985ca02815Sjsg case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 36995ca02815Sjsg case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 37005ca02815Sjsg get_flags = true; 37015ca02815Sjsg break; 37025ca02815Sjsg case KFD_IOCTL_SVM_ATTR_GRANULARITY: 37035ca02815Sjsg get_granularity = true; 37045ca02815Sjsg break; 37055ca02815Sjsg case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 37065ca02815Sjsg case KFD_IOCTL_SVM_ATTR_NO_ACCESS: 37075ca02815Sjsg fallthrough; 37085ca02815Sjsg default: 37095ca02815Sjsg pr_debug("get invalid attr type 0x%x\n", attrs[i].type); 37105ca02815Sjsg return -EINVAL; 37115ca02815Sjsg } 37125ca02815Sjsg } 37135ca02815Sjsg 37145ca02815Sjsg svms = &p->svms; 37155ca02815Sjsg 37165ca02815Sjsg mutex_lock(&svms->lock); 37175ca02815Sjsg 37185ca02815Sjsg node = interval_tree_iter_first(&svms->objects, start, last); 37195ca02815Sjsg if (!node) { 37205ca02815Sjsg pr_debug("range attrs not found return default values\n"); 37215ca02815Sjsg svm_range_set_default_attributes(&location, &prefetch_loc, 37225ca02815Sjsg &granularity, &flags_and); 37235ca02815Sjsg flags_or = flags_and; 37245ca02815Sjsg if (p->xnack_enabled) 37255ca02815Sjsg bitmap_copy(bitmap_access, svms->bitmap_supported, 37265ca02815Sjsg MAX_GPU_INSTANCE); 37275ca02815Sjsg else 37285ca02815Sjsg bitmap_zero(bitmap_access, MAX_GPU_INSTANCE); 37295ca02815Sjsg bitmap_zero(bitmap_aip, MAX_GPU_INSTANCE); 37305ca02815Sjsg goto fill_values; 37315ca02815Sjsg } 37325ca02815Sjsg bitmap_copy(bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE); 37335ca02815Sjsg bitmap_copy(bitmap_aip, svms->bitmap_supported, MAX_GPU_INSTANCE); 37345ca02815Sjsg 37355ca02815Sjsg while (node) { 37365ca02815Sjsg struct interval_tree_node *next; 37375ca02815Sjsg 37385ca02815Sjsg prange = container_of(node, struct svm_range, it_node); 37395ca02815Sjsg next = interval_tree_iter_next(node, start, last); 37405ca02815Sjsg 37415ca02815Sjsg if (get_preferred_loc) { 37425ca02815Sjsg if (prange->preferred_loc == 37435ca02815Sjsg KFD_IOCTL_SVM_LOCATION_UNDEFINED || 37445ca02815Sjsg (location != KFD_IOCTL_SVM_LOCATION_UNDEFINED && 37455ca02815Sjsg location != prange->preferred_loc)) { 37465ca02815Sjsg location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 37475ca02815Sjsg get_preferred_loc = false; 37485ca02815Sjsg } else { 37495ca02815Sjsg location = prange->preferred_loc; 37505ca02815Sjsg } 37515ca02815Sjsg } 37525ca02815Sjsg if (get_prefetch_loc) { 37535ca02815Sjsg if (prange->prefetch_loc == 37545ca02815Sjsg KFD_IOCTL_SVM_LOCATION_UNDEFINED || 37555ca02815Sjsg (prefetch_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED && 37565ca02815Sjsg prefetch_loc != 
prange->prefetch_loc)) { 37575ca02815Sjsg prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; 37585ca02815Sjsg get_prefetch_loc = false; 37595ca02815Sjsg } else { 37605ca02815Sjsg prefetch_loc = prange->prefetch_loc; 37615ca02815Sjsg } 37625ca02815Sjsg } 37635ca02815Sjsg if (get_accessible) { 37645ca02815Sjsg bitmap_and(bitmap_access, bitmap_access, 37655ca02815Sjsg prange->bitmap_access, MAX_GPU_INSTANCE); 37665ca02815Sjsg bitmap_and(bitmap_aip, bitmap_aip, 37675ca02815Sjsg prange->bitmap_aip, MAX_GPU_INSTANCE); 37685ca02815Sjsg } 37695ca02815Sjsg if (get_flags) { 37705ca02815Sjsg flags_and &= prange->flags; 37715ca02815Sjsg flags_or |= prange->flags; 37725ca02815Sjsg } 37735ca02815Sjsg 37745ca02815Sjsg if (get_granularity && prange->granularity < granularity) 37755ca02815Sjsg granularity = prange->granularity; 37765ca02815Sjsg 37775ca02815Sjsg node = next; 37785ca02815Sjsg } 37795ca02815Sjsg fill_values: 37805ca02815Sjsg mutex_unlock(&svms->lock); 37815ca02815Sjsg 37825ca02815Sjsg for (i = 0; i < nattr; i++) { 37835ca02815Sjsg switch (attrs[i].type) { 37845ca02815Sjsg case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 37855ca02815Sjsg attrs[i].value = location; 37865ca02815Sjsg break; 37875ca02815Sjsg case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 37885ca02815Sjsg attrs[i].value = prefetch_loc; 37895ca02815Sjsg break; 37905ca02815Sjsg case KFD_IOCTL_SVM_ATTR_ACCESS: 37915ca02815Sjsg gpuidx = kfd_process_gpuidx_from_gpuid(p, 37925ca02815Sjsg attrs[i].value); 37935ca02815Sjsg if (gpuidx < 0) { 37945ca02815Sjsg pr_debug("invalid gpuid %x\n", attrs[i].value); 37955ca02815Sjsg return -EINVAL; 37965ca02815Sjsg } 37975ca02815Sjsg if (test_bit(gpuidx, bitmap_access)) 37985ca02815Sjsg attrs[i].type = KFD_IOCTL_SVM_ATTR_ACCESS; 37995ca02815Sjsg else if (test_bit(gpuidx, bitmap_aip)) 38005ca02815Sjsg attrs[i].type = 38015ca02815Sjsg KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE; 38025ca02815Sjsg else 38035ca02815Sjsg attrs[i].type = KFD_IOCTL_SVM_ATTR_NO_ACCESS; 38045ca02815Sjsg break; 38055ca02815Sjsg case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 38065ca02815Sjsg attrs[i].value = flags_and; 38075ca02815Sjsg break; 38085ca02815Sjsg case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 38095ca02815Sjsg attrs[i].value = ~flags_or; 38105ca02815Sjsg break; 38115ca02815Sjsg case KFD_IOCTL_SVM_ATTR_GRANULARITY: 38125ca02815Sjsg attrs[i].value = (uint32_t)granularity; 38135ca02815Sjsg break; 38145ca02815Sjsg } 38155ca02815Sjsg } 38165ca02815Sjsg 38175ca02815Sjsg return 0; 38185ca02815Sjsg } 38195ca02815Sjsg 38201bb76ff1Sjsg int kfd_criu_resume_svm(struct kfd_process *p) 38211bb76ff1Sjsg { 38221bb76ff1Sjsg struct kfd_ioctl_svm_attribute *set_attr_new, *set_attr = NULL; 38231bb76ff1Sjsg int nattr_common = 4, nattr_accessibility = 1; 38241bb76ff1Sjsg struct criu_svm_metadata *criu_svm_md = NULL; 38251bb76ff1Sjsg struct svm_range_list *svms = &p->svms; 38261bb76ff1Sjsg struct criu_svm_metadata *next = NULL; 38271bb76ff1Sjsg uint32_t set_flags = 0xffffffff; 38281bb76ff1Sjsg int i, j, num_attrs, ret = 0; 38291bb76ff1Sjsg uint64_t set_attr_size; 38301bb76ff1Sjsg struct mm_struct *mm; 38311bb76ff1Sjsg 38321bb76ff1Sjsg if (list_empty(&svms->criu_svm_metadata_list)) { 38331bb76ff1Sjsg pr_debug("No SVM data from CRIU restore stage 2\n"); 38341bb76ff1Sjsg return ret; 38351bb76ff1Sjsg } 38361bb76ff1Sjsg 38371bb76ff1Sjsg mm = get_task_mm(p->lead_thread); 38381bb76ff1Sjsg if (!mm) { 38391bb76ff1Sjsg pr_err("failed to get mm for the target process\n"); 38401bb76ff1Sjsg return -ESRCH; 38411bb76ff1Sjsg } 38421bb76ff1Sjsg 38431bb76ff1Sjsg num_attrs = nattr_common + (nattr_accessibility * 
p->n_pdds); 38441bb76ff1Sjsg 38451bb76ff1Sjsg i = j = 0; 38461bb76ff1Sjsg list_for_each_entry(criu_svm_md, &svms->criu_svm_metadata_list, list) { 38471bb76ff1Sjsg pr_debug("criu_svm_md[%d]\n\tstart: 0x%llx size: 0x%llx (npages)\n", 38481bb76ff1Sjsg i, criu_svm_md->data.start_addr, criu_svm_md->data.size); 38491bb76ff1Sjsg 38501bb76ff1Sjsg for (j = 0; j < num_attrs; j++) { 38511bb76ff1Sjsg pr_debug("\ncriu_svm_md[%d]->attrs[%d].type : 0x%x\ncriu_svm_md[%d]->attrs[%d].value : 0x%x\n", 38521bb76ff1Sjsg i, j, criu_svm_md->data.attrs[j].type, 38531bb76ff1Sjsg i, j, criu_svm_md->data.attrs[j].value); 38541bb76ff1Sjsg switch (criu_svm_md->data.attrs[j].type) { 38551bb76ff1Sjsg /* During Checkpoint operation, the query for 38561bb76ff1Sjsg * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC attribute might 38571bb76ff1Sjsg * return KFD_IOCTL_SVM_LOCATION_UNDEFINED if they were 38581bb76ff1Sjsg * not used by the range which was checkpointed. Care 38591bb76ff1Sjsg * must be taken to not restore with an invalid value 38601bb76ff1Sjsg * otherwise the gpuidx value will be invalid and 38611bb76ff1Sjsg * set_attr would eventually fail so just replace those 38621bb76ff1Sjsg * with another dummy attribute such as 38631bb76ff1Sjsg * KFD_IOCTL_SVM_ATTR_SET_FLAGS. 38641bb76ff1Sjsg */ 38651bb76ff1Sjsg case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 38661bb76ff1Sjsg if (criu_svm_md->data.attrs[j].value == 38671bb76ff1Sjsg KFD_IOCTL_SVM_LOCATION_UNDEFINED) { 38681bb76ff1Sjsg criu_svm_md->data.attrs[j].type = 38691bb76ff1Sjsg KFD_IOCTL_SVM_ATTR_SET_FLAGS; 38701bb76ff1Sjsg criu_svm_md->data.attrs[j].value = 0; 38711bb76ff1Sjsg } 38721bb76ff1Sjsg break; 38731bb76ff1Sjsg case KFD_IOCTL_SVM_ATTR_SET_FLAGS: 38741bb76ff1Sjsg set_flags = criu_svm_md->data.attrs[j].value; 38751bb76ff1Sjsg break; 38761bb76ff1Sjsg default: 38771bb76ff1Sjsg break; 38781bb76ff1Sjsg } 38791bb76ff1Sjsg } 38801bb76ff1Sjsg 38811bb76ff1Sjsg /* CLR_FLAGS is not available via get_attr during checkpoint but 38821bb76ff1Sjsg * it needs to be inserted before restoring the ranges so 38831bb76ff1Sjsg * allocate extra space for it before calling set_attr 38841bb76ff1Sjsg */ 38851bb76ff1Sjsg set_attr_size = sizeof(struct kfd_ioctl_svm_attribute) * 38861bb76ff1Sjsg (num_attrs + 1); 38871bb76ff1Sjsg set_attr_new = krealloc(set_attr, set_attr_size, 38881bb76ff1Sjsg GFP_KERNEL); 38891bb76ff1Sjsg if (!set_attr_new) { 38901bb76ff1Sjsg ret = -ENOMEM; 38911bb76ff1Sjsg goto exit; 38921bb76ff1Sjsg } 38931bb76ff1Sjsg set_attr = set_attr_new; 38941bb76ff1Sjsg 38951bb76ff1Sjsg memcpy(set_attr, criu_svm_md->data.attrs, num_attrs * 38961bb76ff1Sjsg sizeof(struct kfd_ioctl_svm_attribute)); 38971bb76ff1Sjsg set_attr[num_attrs].type = KFD_IOCTL_SVM_ATTR_CLR_FLAGS; 38981bb76ff1Sjsg set_attr[num_attrs].value = ~set_flags; 38991bb76ff1Sjsg 39001bb76ff1Sjsg ret = svm_range_set_attr(p, mm, criu_svm_md->data.start_addr, 39011bb76ff1Sjsg criu_svm_md->data.size, num_attrs + 1, 39021bb76ff1Sjsg set_attr); 39031bb76ff1Sjsg if (ret) { 39041bb76ff1Sjsg pr_err("CRIU: failed to set range attributes\n"); 39051bb76ff1Sjsg goto exit; 39061bb76ff1Sjsg } 39071bb76ff1Sjsg 39081bb76ff1Sjsg i++; 39091bb76ff1Sjsg } 39101bb76ff1Sjsg exit: 39111bb76ff1Sjsg kfree(set_attr); 39121bb76ff1Sjsg list_for_each_entry_safe(criu_svm_md, next, &svms->criu_svm_metadata_list, list) { 39131bb76ff1Sjsg pr_debug("freeing criu_svm_md[]\n\tstart: 0x%llx\n", 39141bb76ff1Sjsg criu_svm_md->data.start_addr); 39151bb76ff1Sjsg kfree(criu_svm_md); 39161bb76ff1Sjsg } 39171bb76ff1Sjsg 39181bb76ff1Sjsg mmput(mm); 39191bb76ff1Sjsg return ret; 
39201bb76ff1Sjsg 39211bb76ff1Sjsg } 39221bb76ff1Sjsg 39231bb76ff1Sjsg int kfd_criu_restore_svm(struct kfd_process *p, 39241bb76ff1Sjsg uint8_t __user *user_priv_ptr, 39251bb76ff1Sjsg uint64_t *priv_data_offset, 39261bb76ff1Sjsg uint64_t max_priv_data_size) 39271bb76ff1Sjsg { 39281bb76ff1Sjsg uint64_t svm_priv_data_size, svm_object_md_size, svm_attrs_size; 39291bb76ff1Sjsg int nattr_common = 4, nattr_accessibility = 1; 39301bb76ff1Sjsg struct criu_svm_metadata *criu_svm_md = NULL; 39311bb76ff1Sjsg struct svm_range_list *svms = &p->svms; 39321bb76ff1Sjsg uint32_t num_devices; 39331bb76ff1Sjsg int ret = 0; 39341bb76ff1Sjsg 39351bb76ff1Sjsg num_devices = p->n_pdds; 39361bb76ff1Sjsg /* Handle one SVM range object at a time, also the number of gpus are 39371bb76ff1Sjsg * assumed to be same on the restore node, checking must be done while 39381bb76ff1Sjsg * evaluating the topology earlier 39391bb76ff1Sjsg */ 39401bb76ff1Sjsg 39411bb76ff1Sjsg svm_attrs_size = sizeof(struct kfd_ioctl_svm_attribute) * 39421bb76ff1Sjsg (nattr_common + nattr_accessibility * num_devices); 39431bb76ff1Sjsg svm_object_md_size = sizeof(struct criu_svm_metadata) + svm_attrs_size; 39441bb76ff1Sjsg 39451bb76ff1Sjsg svm_priv_data_size = sizeof(struct kfd_criu_svm_range_priv_data) + 39461bb76ff1Sjsg svm_attrs_size; 39471bb76ff1Sjsg 39481bb76ff1Sjsg criu_svm_md = kzalloc(svm_object_md_size, GFP_KERNEL); 39491bb76ff1Sjsg if (!criu_svm_md) { 39501bb76ff1Sjsg pr_err("failed to allocate memory to store svm metadata\n"); 39511bb76ff1Sjsg return -ENOMEM; 39521bb76ff1Sjsg } 39531bb76ff1Sjsg if (*priv_data_offset + svm_priv_data_size > max_priv_data_size) { 39541bb76ff1Sjsg ret = -EINVAL; 39551bb76ff1Sjsg goto exit; 39561bb76ff1Sjsg } 39571bb76ff1Sjsg 39581bb76ff1Sjsg ret = copy_from_user(&criu_svm_md->data, user_priv_ptr + *priv_data_offset, 39591bb76ff1Sjsg svm_priv_data_size); 39601bb76ff1Sjsg if (ret) { 39611bb76ff1Sjsg ret = -EFAULT; 39621bb76ff1Sjsg goto exit; 39631bb76ff1Sjsg } 39641bb76ff1Sjsg *priv_data_offset += svm_priv_data_size; 39651bb76ff1Sjsg 39661bb76ff1Sjsg list_add_tail(&criu_svm_md->list, &svms->criu_svm_metadata_list); 39671bb76ff1Sjsg 39681bb76ff1Sjsg return 0; 39691bb76ff1Sjsg 39701bb76ff1Sjsg 39711bb76ff1Sjsg exit: 39721bb76ff1Sjsg kfree(criu_svm_md); 39731bb76ff1Sjsg return ret; 39741bb76ff1Sjsg } 39751bb76ff1Sjsg 39761bb76ff1Sjsg int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges, 39771bb76ff1Sjsg uint64_t *svm_priv_data_size) 39781bb76ff1Sjsg { 39791bb76ff1Sjsg uint64_t total_size, accessibility_size, common_attr_size; 39801bb76ff1Sjsg int nattr_common = 4, nattr_accessibility = 1; 39811bb76ff1Sjsg int num_devices = p->n_pdds; 39821bb76ff1Sjsg struct svm_range_list *svms; 39831bb76ff1Sjsg struct svm_range *prange; 39841bb76ff1Sjsg uint32_t count = 0; 39851bb76ff1Sjsg 39861bb76ff1Sjsg *svm_priv_data_size = 0; 39871bb76ff1Sjsg 39881bb76ff1Sjsg svms = &p->svms; 39891bb76ff1Sjsg if (!svms) 39901bb76ff1Sjsg return -EINVAL; 39911bb76ff1Sjsg 39921bb76ff1Sjsg mutex_lock(&svms->lock); 39931bb76ff1Sjsg list_for_each_entry(prange, &svms->list, list) { 39941bb76ff1Sjsg pr_debug("prange: 0x%p start: 0x%lx\t npages: 0x%llx\t end: 0x%llx\n", 39951bb76ff1Sjsg prange, prange->start, prange->npages, 39961bb76ff1Sjsg prange->start + prange->npages - 1); 39971bb76ff1Sjsg count++; 39981bb76ff1Sjsg } 39991bb76ff1Sjsg mutex_unlock(&svms->lock); 40001bb76ff1Sjsg 40011bb76ff1Sjsg *num_svm_ranges = count; 40021bb76ff1Sjsg /* Only the accessbility attributes need to be queried for all the gpus 40031bb76ff1Sjsg * 
individually, remaining ones are spanned across the entire process 40041bb76ff1Sjsg * regardless of the various gpu nodes. Of the remaining attributes, 40051bb76ff1Sjsg * KFD_IOCTL_SVM_ATTR_CLR_FLAGS need not be saved. 40061bb76ff1Sjsg * 40071bb76ff1Sjsg * KFD_IOCTL_SVM_ATTR_PREFERRED_LOC 40081bb76ff1Sjsg * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC 40091bb76ff1Sjsg * KFD_IOCTL_SVM_ATTR_SET_FLAGS 40101bb76ff1Sjsg * KFD_IOCTL_SVM_ATTR_GRANULARITY 40111bb76ff1Sjsg * 40121bb76ff1Sjsg * ** ACCESSBILITY ATTRIBUTES ** 40131bb76ff1Sjsg * (Considered as one, type is altered during query, value is gpuid) 40141bb76ff1Sjsg * KFD_IOCTL_SVM_ATTR_ACCESS 40151bb76ff1Sjsg * KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE 40161bb76ff1Sjsg * KFD_IOCTL_SVM_ATTR_NO_ACCESS 40171bb76ff1Sjsg */ 40181bb76ff1Sjsg if (*num_svm_ranges > 0) { 40191bb76ff1Sjsg common_attr_size = sizeof(struct kfd_ioctl_svm_attribute) * 40201bb76ff1Sjsg nattr_common; 40211bb76ff1Sjsg accessibility_size = sizeof(struct kfd_ioctl_svm_attribute) * 40221bb76ff1Sjsg nattr_accessibility * num_devices; 40231bb76ff1Sjsg 40241bb76ff1Sjsg total_size = sizeof(struct kfd_criu_svm_range_priv_data) + 40251bb76ff1Sjsg common_attr_size + accessibility_size; 40261bb76ff1Sjsg 40271bb76ff1Sjsg *svm_priv_data_size = *num_svm_ranges * total_size; 40281bb76ff1Sjsg } 40291bb76ff1Sjsg 40301bb76ff1Sjsg pr_debug("num_svm_ranges %u total_priv_size %llu\n", *num_svm_ranges, 40311bb76ff1Sjsg *svm_priv_data_size); 40321bb76ff1Sjsg return 0; 40331bb76ff1Sjsg } 40341bb76ff1Sjsg 40351bb76ff1Sjsg int kfd_criu_checkpoint_svm(struct kfd_process *p, 40361bb76ff1Sjsg uint8_t __user *user_priv_data, 40371bb76ff1Sjsg uint64_t *priv_data_offset) 40381bb76ff1Sjsg { 40391bb76ff1Sjsg struct kfd_criu_svm_range_priv_data *svm_priv = NULL; 40401bb76ff1Sjsg struct kfd_ioctl_svm_attribute *query_attr = NULL; 40411bb76ff1Sjsg uint64_t svm_priv_data_size, query_attr_size = 0; 40421bb76ff1Sjsg int index, nattr_common = 4, ret = 0; 40431bb76ff1Sjsg struct svm_range_list *svms; 40441bb76ff1Sjsg int num_devices = p->n_pdds; 40451bb76ff1Sjsg struct svm_range *prange; 40461bb76ff1Sjsg struct mm_struct *mm; 40471bb76ff1Sjsg 40481bb76ff1Sjsg svms = &p->svms; 40491bb76ff1Sjsg if (!svms) 40501bb76ff1Sjsg return -EINVAL; 40511bb76ff1Sjsg 40521bb76ff1Sjsg mm = get_task_mm(p->lead_thread); 40531bb76ff1Sjsg if (!mm) { 40541bb76ff1Sjsg pr_err("failed to get mm for the target process\n"); 40551bb76ff1Sjsg return -ESRCH; 40561bb76ff1Sjsg } 40571bb76ff1Sjsg 40581bb76ff1Sjsg query_attr_size = sizeof(struct kfd_ioctl_svm_attribute) * 40591bb76ff1Sjsg (nattr_common + num_devices); 40601bb76ff1Sjsg 40611bb76ff1Sjsg query_attr = kzalloc(query_attr_size, GFP_KERNEL); 40621bb76ff1Sjsg if (!query_attr) { 40631bb76ff1Sjsg ret = -ENOMEM; 40641bb76ff1Sjsg goto exit; 40651bb76ff1Sjsg } 40661bb76ff1Sjsg 40671bb76ff1Sjsg query_attr[0].type = KFD_IOCTL_SVM_ATTR_PREFERRED_LOC; 40681bb76ff1Sjsg query_attr[1].type = KFD_IOCTL_SVM_ATTR_PREFETCH_LOC; 40691bb76ff1Sjsg query_attr[2].type = KFD_IOCTL_SVM_ATTR_SET_FLAGS; 40701bb76ff1Sjsg query_attr[3].type = KFD_IOCTL_SVM_ATTR_GRANULARITY; 40711bb76ff1Sjsg 40721bb76ff1Sjsg for (index = 0; index < num_devices; index++) { 40731bb76ff1Sjsg struct kfd_process_device *pdd = p->pdds[index]; 40741bb76ff1Sjsg 40751bb76ff1Sjsg query_attr[index + nattr_common].type = 40761bb76ff1Sjsg KFD_IOCTL_SVM_ATTR_ACCESS; 40771bb76ff1Sjsg query_attr[index + nattr_common].value = pdd->user_gpu_id; 40781bb76ff1Sjsg } 40791bb76ff1Sjsg 40801bb76ff1Sjsg svm_priv_data_size = sizeof(*svm_priv) + query_attr_size; 40811bb76ff1Sjsg 
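	/* Each range is serialized as a kfd_criu_svm_range_priv_data header
	 * followed by the queried attributes; the per-range size matches what
	 * svm_range_get_info reported, so *priv_data_offset advances by the
	 * same amount for every range.
	 */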
40821bb76ff1Sjsg svm_priv = kzalloc(svm_priv_data_size, GFP_KERNEL); 40831bb76ff1Sjsg if (!svm_priv) { 40841bb76ff1Sjsg ret = -ENOMEM; 40851bb76ff1Sjsg goto exit_query; 40861bb76ff1Sjsg } 40871bb76ff1Sjsg 40881bb76ff1Sjsg index = 0; 40891bb76ff1Sjsg list_for_each_entry(prange, &svms->list, list) { 40901bb76ff1Sjsg 40911bb76ff1Sjsg svm_priv->object_type = KFD_CRIU_OBJECT_TYPE_SVM_RANGE; 40921bb76ff1Sjsg svm_priv->start_addr = prange->start; 40931bb76ff1Sjsg svm_priv->size = prange->npages; 40941bb76ff1Sjsg memcpy(&svm_priv->attrs, query_attr, query_attr_size); 40951bb76ff1Sjsg pr_debug("CRIU: prange: 0x%p start: 0x%lx\t npages: 0x%llx end: 0x%llx\t size: 0x%llx\n", 40961bb76ff1Sjsg prange, prange->start, prange->npages, 40971bb76ff1Sjsg prange->start + prange->npages - 1, 40981bb76ff1Sjsg prange->npages * PAGE_SIZE); 40991bb76ff1Sjsg 41001bb76ff1Sjsg ret = svm_range_get_attr(p, mm, svm_priv->start_addr, 41011bb76ff1Sjsg svm_priv->size, 41021bb76ff1Sjsg (nattr_common + num_devices), 41031bb76ff1Sjsg svm_priv->attrs); 41041bb76ff1Sjsg if (ret) { 41051bb76ff1Sjsg pr_err("CRIU: failed to obtain range attributes\n"); 41061bb76ff1Sjsg goto exit_priv; 41071bb76ff1Sjsg } 41081bb76ff1Sjsg 41091bb76ff1Sjsg if (copy_to_user(user_priv_data + *priv_data_offset, svm_priv, 41101bb76ff1Sjsg svm_priv_data_size)) { 41111bb76ff1Sjsg pr_err("Failed to copy svm priv to user\n"); 41121bb76ff1Sjsg ret = -EFAULT; 41131bb76ff1Sjsg goto exit_priv; 41141bb76ff1Sjsg } 41151bb76ff1Sjsg 41161bb76ff1Sjsg *priv_data_offset += svm_priv_data_size; 41171bb76ff1Sjsg 41181bb76ff1Sjsg } 41191bb76ff1Sjsg 41201bb76ff1Sjsg 41211bb76ff1Sjsg exit_priv: 41221bb76ff1Sjsg kfree(svm_priv); 41231bb76ff1Sjsg exit_query: 41241bb76ff1Sjsg kfree(query_attr); 41251bb76ff1Sjsg exit: 41261bb76ff1Sjsg mmput(mm); 41271bb76ff1Sjsg return ret; 41281bb76ff1Sjsg } 41291bb76ff1Sjsg 41305ca02815Sjsg int 41315ca02815Sjsg svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start, 41325ca02815Sjsg uint64_t size, uint32_t nattrs, struct kfd_ioctl_svm_attribute *attrs) 41335ca02815Sjsg { 41341bb76ff1Sjsg struct mm_struct *mm = current->mm; 41355ca02815Sjsg int r; 41365ca02815Sjsg 41375ca02815Sjsg start >>= PAGE_SHIFT; 41385ca02815Sjsg size >>= PAGE_SHIFT; 41395ca02815Sjsg 41405ca02815Sjsg switch (op) { 41415ca02815Sjsg case KFD_IOCTL_SVM_OP_SET_ATTR: 41421bb76ff1Sjsg r = svm_range_set_attr(p, mm, start, size, nattrs, attrs); 41435ca02815Sjsg break; 41445ca02815Sjsg case KFD_IOCTL_SVM_OP_GET_ATTR: 41451bb76ff1Sjsg r = svm_range_get_attr(p, mm, start, size, nattrs, attrs); 41465ca02815Sjsg break; 41475ca02815Sjsg default: 41485ca02815Sjsg r = EINVAL; 41495ca02815Sjsg break; 41505ca02815Sjsg } 41515ca02815Sjsg 41525ca02815Sjsg return r; 41535ca02815Sjsg } 4154