From 2eb9dd497a698dc384c0dd3e0311d541eb2e13dd Mon Sep 17 00:00:00 2001 From: Joshua Ashton Date: Thu, 2 Nov 2023 04:21:55 +0000 Subject: drm/amd/display: Set color_mgmt_changed to true on unsuspend Otherwise we can end up with a frame on unsuspend where color management is not applied when userspace has not committed themselves. Fixes re-applying color management on Steam Deck/Gamescope on S3 resume. Signed-off-by: Joshua Ashton Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 6d2f60c61decc3..f3f94d109726d3 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -3029,6 +3029,7 @@ static int dm_resume(void *handle) dc_stream_release(dm_new_crtc_state->stream); dm_new_crtc_state->stream = NULL; } + dm_new_crtc_state->base.color_mgmt_changed = true; } for_each_new_plane_in_state(dm->cached_state, plane, new_plane_state, i) { -- cgit 1.2.3-korg From 37865e02e6ccecdda240f33b4332105a5c734984 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 17 Apr 2024 21:13:59 -0400 Subject: drm/amdkfd: Fix eviction fence handling Handle case that dma_fence_get_rcu_safe returns NULL. If restore work is already scheduled, only update its timer. The same work item cannot be queued twice, so undo the extra queue eviction. Fixes: 9a1c1339abf9 ("drm/amdkfd: Run restore_workers on freezable WQs") Signed-off-by: Felix Kuehling Reviewed-by: Philip Yang Tested-by: Gang BA Reviewed-by: Gang BA Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index b79986412cd839..aafdf064651fae 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1922,6 +1922,8 @@ static int signal_eviction_fence(struct kfd_process *p) rcu_read_lock(); ef = dma_fence_get_rcu_safe(&p->ef); rcu_read_unlock(); + if (!ef) + return -EINVAL; ret = dma_fence_signal(ef); dma_fence_put(ef); @@ -1949,10 +1951,9 @@ static void evict_process_worker(struct work_struct *work) * they are responsible stopping the queues and scheduling * the restore work. */ - if (!signal_eviction_fence(p)) - queue_delayed_work(kfd_restore_wq, &p->restore_work, - msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)); - else + if (signal_eviction_fence(p) || + mod_delayed_work(kfd_restore_wq, &p->restore_work, + msecs_to_jiffies(PROCESS_RESTORE_TIME_MS))) kfd_process_restore_queues(p); pr_debug("Finished evicting pasid 0x%x\n", p->pasid); -- cgit 1.2.3-korg From 25e9227c6afd200bed6774c866980b8e36d033af Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Thu, 18 Apr 2024 11:32:34 -0400 Subject: drm/amdgpu: Fix leak when GPU memory allocation fails Free the sync object if the memory allocation fails for any reason. Signed-off-by: Mukul Joshi Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index df58a6a1a67ec5..7c23ba19af33a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1854,6 +1854,7 @@ err_node_allow: err_bo_create: amdgpu_amdkfd_unreserve_mem_limit(adev, aligned_size, flags, xcp_id); err_reserve_limit: + amdgpu_sync_free(&(*mem)->sync); mutex_destroy(&(*mem)->lock); if (gobj) drm_gem_object_put(gobj); -- cgit 1.2.3-korg From 9c783a11214553a54f0915a7260a3ce624d36bf2 Mon Sep 17 00:00:00 2001 From: Lang Yu Date: Sun, 7 Apr 2024 12:36:00 +0800 Subject: drm/amdkfd: make sure VM is ready for updating operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When page table BOs were evicted but not validated before updating page tables, VM is still in evicting state, amdgpu_vm_update_range returns -EBUSY and restore_process_worker runs into a dead loop. v2: Split the BO validation and page table update into two separate loops in amdgpu_amdkfd_restore_process_bos. (Felix) 1.Validate BOs 2.Validate VM (and DMABuf attachments) 3.Update page tables for the BOs validated above Fixes: 50661eb1a2c8 ("drm/amdgpu: Auto-validate DMABuf imports in compute VMs") Signed-off-by: Lang Yu Acked-by: Christian König Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 34 ++++++++++++++---------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 7c23ba19af33a1..2131de36e3dac0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2901,13 +2901,12 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * amdgpu_sync_create(&sync_obj); - /* Validate BOs and map them to GPUVM (update VM page tables). */ + /* Validate BOs managed by KFD */ list_for_each_entry(mem, &process_info->kfd_bo_list, validate_list) { struct amdgpu_bo *bo = mem->bo; uint32_t domain = mem->domain; - struct kfd_mem_attachment *attachment; struct dma_resv_iter cursor; struct dma_fence *fence; @@ -2932,6 +2931,25 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * goto validate_map_fail; } } + } + + if (failed_size) + pr_debug("0x%lx/0x%lx in system\n", failed_size, total_size); + + /* Validate PDs, PTs and evicted DMABuf imports last. Otherwise BO + * validations above would invalidate DMABuf imports again. + */ + ret = process_validate_vms(process_info, &exec.ticket); + if (ret) { + pr_debug("Validating VMs failed, ret: %d\n", ret); + goto validate_map_fail; + } + + /* Update mappings managed by KFD. */ + list_for_each_entry(mem, &process_info->kfd_bo_list, + validate_list) { + struct kfd_mem_attachment *attachment; + list_for_each_entry(attachment, &mem->attachments, list) { if (!attachment->is_mapped) continue; @@ -2948,18 +2966,6 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * } } - if (failed_size) - pr_debug("0x%lx/0x%lx in system\n", failed_size, total_size); - - /* Validate PDs, PTs and evicted DMABuf imports last. Otherwise BO - * validations above would invalidate DMABuf imports again. - */ - ret = process_validate_vms(process_info, &exec.ticket); - if (ret) { - pr_debug("Validating VMs failed, ret: %d\n", ret); - goto validate_map_fail; - } - /* Update mappings not managed by KFD */ list_for_each_entry(peer_vm, &process_info->vm_list_head, vm_list_node) { -- cgit 1.2.3-korg From 0e95ed6452cb079cf9587c774a475a7d83c7e040 Mon Sep 17 00:00:00 2001 From: Ma Jun Date: Tue, 16 Apr 2024 17:30:12 +0800 Subject: drm/amdgpu/pm: Remove gpu_od if it's an empty directory gpu_od should be removed if it's an empty directory Signed-off-by: Ma Jun Reported-by: Yang Wang Reviewed-by: Yang Wang Suggested-by: Lijo Lazar Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/amdgpu_pm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index f09b9d49297e81..bbd0169010c2d5 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -4261,6 +4261,13 @@ static int amdgpu_od_set_init(struct amdgpu_device *adev) } } + /* + * If gpu_od is the only member in the list, that means gpu_od is an + * empty directory, so remove it. + */ + if (list_is_singular(&adev->pm.od_kobj_list)) + goto err_out; + return 0; err_out: -- cgit 1.2.3-korg From aebd3eb9d3ae017e6260043f6bcace2f5ef60694 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Wed, 10 Apr 2024 19:30:46 +0530 Subject: drm/amdgpu: Assign correct bits for SDMA HDP flush HDP Flush request bit can be kept unique per AID, and doesn't need to be unique SOC-wide. Assign only bits 10-13 for SDMA v4.4.2. Signed-off-by: Lijo Lazar Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 82eab49be82bb9..e708468ac54dd5 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -368,7 +368,8 @@ static void sdma_v4_4_2_ring_emit_hdp_flush(struct amdgpu_ring *ring) u32 ref_and_mask = 0; const struct nbio_hdp_flush_reg *nbio_hf_reg = adev->nbio.hdp_flush_reg; - ref_and_mask = nbio_hf_reg->ref_and_mask_sdma0 << ring->me; + ref_and_mask = nbio_hf_reg->ref_and_mask_sdma0 + << (ring->me % adev->sdma.num_inst_per_aid); sdma_v4_4_2_wait_reg_mem(ring, 0, 1, adev->nbio.funcs->get_hdp_flush_done_offset(adev), -- cgit 1.2.3-korg From d59198d2d0c5cb2a360819b000b0f173c472c9ef Mon Sep 17 00:00:00 2001 From: Peyton Lee Date: Fri, 19 Apr 2024 14:07:39 +0800 Subject: drm/amdgpu/vpe: fix vpe dpm setup failed The vpe dpm settings should be done before firmware is loaded. Otherwise, the frequency cannot be successfully raised. Signed-off-by: Peyton Lee Reviewed-by: Lang Yu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c | 2 +- drivers/gpu/drm/amd/amdgpu/vpe_v6_1.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c index 6695481f870f8a..c23d97d34b7ec5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c @@ -205,7 +205,7 @@ disable_dpm: dpm_ctl &= 0xfffffffe; /* Disable DPM */ WREG32(vpe_get_reg_offset(vpe, 0, vpe->regs.dpm_enable), dpm_ctl); dev_dbg(adev->dev, "%s: disable vpe dpm\n", __func__); - return 0; + return -EINVAL; } int amdgpu_vpe_psp_update_sram(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/vpe_v6_1.c b/drivers/gpu/drm/amd/amdgpu/vpe_v6_1.c index 769eb8f7bb3c57..09315dd5a1ec95 100644 --- a/drivers/gpu/drm/amd/amdgpu/vpe_v6_1.c +++ b/drivers/gpu/drm/amd/amdgpu/vpe_v6_1.c @@ -144,6 +144,12 @@ static int vpe_v6_1_load_microcode(struct amdgpu_vpe *vpe) WREG32(vpe_get_reg_offset(vpe, j, regVPEC_CNTL), ret); } + /* setup collaborate mode */ + vpe_v6_1_set_collaborate_mode(vpe, true); + /* setup DPM */ + if (amdgpu_vpe_configure_dpm(vpe)) + dev_warn(adev->dev, "VPE failed to enable DPM\n"); + /* * For VPE 6.1.1, still only need to add master's offset, and psp will apply it to slave as well. * Here use instance 0 as master. @@ -159,11 +165,7 @@ static int vpe_v6_1_load_microcode(struct amdgpu_vpe *vpe) adev->vpe.cmdbuf_cpu_addr[0] = f32_offset; adev->vpe.cmdbuf_cpu_addr[1] = f32_cntl; - amdgpu_vpe_psp_update_sram(adev); - vpe_v6_1_set_collaborate_mode(vpe, true); - amdgpu_vpe_configure_dpm(vpe); - - return 0; + return amdgpu_vpe_psp_update_sram(adev); } vpe_hdr = (const struct vpe_firmware_header_v1_0 *)adev->vpe.fw->data; @@ -196,8 +198,6 @@ static int vpe_v6_1_load_microcode(struct amdgpu_vpe *vpe) } vpe_v6_1_halt(vpe, false); - vpe_v6_1_set_collaborate_mode(vpe, true); - amdgpu_vpe_configure_dpm(vpe); return 0; } -- cgit 1.2.3-korg From b0b13d532105e0e682d95214933bb8483a063184 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Thu, 18 Apr 2024 13:56:42 -0400 Subject: drm/amdgpu: Update BO eviction priorities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make SVM BOs more likely to get evicted than other BOs. These BOs opportunistically use available VRAM, but can fall back relatively seamlessly to system memory. It also avoids SVM migrations evicting other, more important BOs as they will evict other SVM allocations first. Signed-off-by: Felix Kuehling Acked-by: Mukul Joshi Tested-by: Mukul Joshi Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 2099159a693fa0..ce733e3cb35d05 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -605,6 +605,8 @@ int amdgpu_bo_create(struct amdgpu_device *adev, else amdgpu_bo_placement_from_domain(bo, bp->domain); if (bp->type == ttm_bo_type_kernel) + bo->tbo.priority = 2; + else if (!(bp->flags & AMDGPU_GEM_CREATE_DISCARDABLE)) bo->tbo.priority = 1; if (!bp->destroy) -- cgit 1.2.3-korg From e26305f369ed0e087a043c2cdc76f3d9a6efb3bd Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Fri, 19 Apr 2024 13:25:58 -0400 Subject: drm/amdkfd: Fix rescheduling of restore worker Handle the case that the restore worker was already scheduled by another eviction while the restore was in progress. Fixes: 9a1c1339abf9 ("drm/amdkfd: Run restore_workers on freezable WQs") Signed-off-by: Felix Kuehling Reviewed-by: Philip Yang Tested-by: Yunxiang Li Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index aafdf064651fae..58c1fe5421934d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -2012,9 +2012,9 @@ static void restore_process_worker(struct work_struct *work) if (ret) { pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n", p->pasid, PROCESS_BACK_OFF_TIME_MS); - ret = queue_delayed_work(kfd_restore_wq, &p->restore_work, - msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS)); - WARN(!ret, "reschedule restore work failed\n"); + if (mod_delayed_work(kfd_restore_wq, &p->restore_work, + msecs_to_jiffies(PROCESS_RESTORE_TIME_MS))) + kfd_process_restore_queues(p); } } -- cgit 1.2.3-korg From 661d71ee5a010bdc0663e0db701931aff920e8e1 Mon Sep 17 00:00:00 2001 From: Lang Yu Date: Fri, 19 Apr 2024 15:40:08 +0800 Subject: drm/amdgpu/umsch: don't execute umsch test when GPU is in reset/suspend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit umsch test needs full GPU functionality(e.g., VM update, TLB flush, possibly buffer moving under memory pressure) which may be not ready under these states. Just skip it to avoid potential issues. Signed-off-by: Lang Yu Reviewed-by: Christian König Reviewed-by: Veerabadhran Gopalakrishnan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c index 0df97c3e3a700d..f7c73533e336fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c @@ -774,6 +774,9 @@ static int umsch_mm_late_init(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + if (amdgpu_in_reset(adev) || adev->in_s0ix || adev->in_suspend) + return 0; + return umsch_mm_test(adev); } -- cgit 1.2.3-korg From 30d1cda8ce31ab49051ff7159280c542a738b23d Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Fri, 12 Apr 2024 13:11:14 +0530 Subject: drm/amd/pm: Restore config space after reset During mode-2 reset, pci config space registers are affected at device side. However, certain platforms have switches which assign virtual BAR addresses and returns the same even after device is reset. This affects pci_restore_state() as it doesn't issue another config write, if the value read is same as the saved value. Add a workaround to write saved config space values from driver side. Presently, these switches are in platforms with SMU v13.0.6 SOCs, hence restrict the workaround only to those. Signed-off-by: Lijo Lazar Reviewed-by: Asad Kamal Signed-off-by: Alex Deucher --- .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 25 ++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 3957af057d54ff..c977ebe88001df 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -2294,6 +2294,17 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table return sizeof(*gpu_metrics); } +static void smu_v13_0_6_restore_pci_config(struct smu_context *smu) +{ + struct amdgpu_device *adev = smu->adev; + int i; + + for (i = 0; i < 16; i++) + pci_write_config_dword(adev->pdev, i * 4, + adev->pdev->saved_config_space[i]); + pci_restore_msi_state(adev->pdev); +} + static int smu_v13_0_6_mode2_reset(struct smu_context *smu) { int ret = 0, index; @@ -2315,6 +2326,20 @@ static int smu_v13_0_6_mode2_reset(struct smu_context *smu) /* Restore the config space saved during init */ amdgpu_device_load_pci_state(adev->pdev); + /* Certain platforms have switches which assign virtual BAR values to + * devices. OS uses the virtual BAR values and device behind the switch + * is assgined another BAR value. When device's config space registers + * are queried, switch returns the virtual BAR values. When mode-2 reset + * is performed, switch is unaware of it, and will continue to return + * the same virtual values to the OS.This affects + * pci_restore_config_space() API as it doesn't write the value saved if + * the current value read from config space is the same as what is + * saved. As a workaround, make sure the config space is restored + * always. + */ + if (!(adev->flags & AMD_IS_APU)) + smu_v13_0_6_restore_pci_config(smu); + dev_dbg(smu->adev->dev, "wait for reset ack\n"); do { ret = smu_cmn_wait_for_response(smu); -- cgit 1.2.3-korg From 1e214f7faaf5d842754cd5cfcd76308bfedab3b5 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Thu, 18 Apr 2024 15:13:58 -0400 Subject: drm/amdkfd: Add VRAM accounting for SVM migration Do VRAM accounting when doing migrations to vram to make sure there is enough available VRAM and migrating to VRAM doesn't evict other possible non-unified memory BOs. If migrating to VRAM fails, driver can fall back to using system memory seamlessly. Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 16 +++++++++++++++- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 2 +- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index bdc01ca9609a7e..5c8d81bfce7ab1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -509,10 +509,19 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc, start = start_mgr << PAGE_SHIFT; end = (last_mgr + 1) << PAGE_SHIFT; + r = amdgpu_amdkfd_reserve_mem_limit(node->adev, + prange->npages * PAGE_SIZE, + KFD_IOC_ALLOC_MEM_FLAGS_VRAM, + node->xcp ? node->xcp->id : 0); + if (r) { + dev_dbg(node->adev->dev, "failed to reserve VRAM, r: %ld\n", r); + return -ENOSPC; + } + r = svm_range_vram_node_new(node, prange, true); if (r) { dev_dbg(node->adev->dev, "fail %ld to alloc vram\n", r); - return r; + goto out; } ttm_res_offset = (start_mgr - prange->start + prange->offset) << PAGE_SHIFT; @@ -545,6 +554,11 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc, svm_range_vram_node_free(prange); } +out: + amdgpu_amdkfd_unreserve_mem_limit(node->adev, + prange->npages * PAGE_SIZE, + KFD_IOC_ALLOC_MEM_FLAGS_VRAM, + node->xcp ? node->xcp->id : 0); return r < 0 ? r : 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index f0f7f48af4137a..386875e6eb96ba 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -3426,7 +3426,7 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, mm, KFD_MIGRATE_TRIGGER_PREFETCH); *migrated = !r; - return r; + return 0; } int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence) -- cgit 1.2.3-korg From fe93b0927bc58cb1d64230f45744e527d9d8482c Mon Sep 17 00:00:00 2001 From: Prike Liang Date: Mon, 25 Mar 2024 15:33:34 +0800 Subject: drm/amdgpu: Fix the ring buffer size for queue VM flush MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Here are the corrections needed for the queue ring buffer size calculation for the following cases: - Remove the KIQ VM flush ring usage. - Add the invalidate TLBs packet for gfx10 and gfx11 queue. - There's no VM flush and PFP sync, so remove the gfx9 real ring and compute ring buffer usage. Signed-off-by: Prike Liang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 3 +-- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 3 +-- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 2 -- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index f90905ef32c76d..701146d649c353 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -9186,7 +9186,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = { 7 + /* PIPELINE_SYNC */ SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + - 2 + /* VM_FLUSH */ + 4 + /* VM_FLUSH */ 8 + /* FENCE for VM_FLUSH */ 20 + /* GDS switch */ 4 + /* double SWITCH_BUFFER, @@ -9276,7 +9276,6 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = { 7 + /* gfx_v10_0_ring_emit_pipeline_sync */ SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + - 2 + /* gfx_v10_0_ring_emit_vm_flush */ 8 + 8 + 8, /* gfx_v10_0_ring_emit_fence_kiq x3 for user fence, vm fence */ .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */ .emit_ib = gfx_v10_0_ring_emit_ib_compute, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index f7325b02a191f7..f00e05aba46a4e 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -6192,7 +6192,7 @@ static const struct amdgpu_ring_funcs gfx_v11_0_ring_funcs_gfx = { 7 + /* PIPELINE_SYNC */ SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + - 2 + /* VM_FLUSH */ + 4 + /* VM_FLUSH */ 8 + /* FENCE for VM_FLUSH */ 20 + /* GDS switch */ 5 + /* COND_EXEC */ @@ -6278,7 +6278,6 @@ static const struct amdgpu_ring_funcs gfx_v11_0_ring_funcs_kiq = { 7 + /* gfx_v11_0_ring_emit_pipeline_sync */ SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + - 2 + /* gfx_v11_0_ring_emit_vm_flush */ 8 + 8 + 8, /* gfx_v11_0_ring_emit_fence_kiq x3 for user fence, vm fence */ .emit_ib_size = 7, /* gfx_v11_0_ring_emit_ib_compute */ .emit_ib = gfx_v11_0_ring_emit_ib_compute, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 6f97a6d0e6d052..99dbd2341120db 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -6981,7 +6981,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = { 7 + /* gfx_v9_0_ring_emit_pipeline_sync */ SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + - 2 + /* gfx_v9_0_ring_emit_vm_flush */ 8 + 8 + 8 + /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */ 7 + /* gfx_v9_0_emit_mem_sync */ 5 + /* gfx_v9_0_emit_wave_limit for updating mmSPI_WCL_PIPE_PERCENT_GFX register */ @@ -7019,7 +7018,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = { 7 + /* gfx_v9_0_ring_emit_pipeline_sync */ SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + - 2 + /* gfx_v9_0_ring_emit_vm_flush */ 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */ .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */ .emit_fence = gfx_v9_0_ring_emit_fence_kiq, -- cgit 1.2.3-korg From 9792b7cc18aaa0c2acae6af5d0acf249bcb1ab0d Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Sun, 14 Apr 2024 21:20:56 -0400 Subject: drm/amdgpu/sdma5.2: use legacy HDP flush for SDMA2/3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This avoids a potential conflict with firmwares with the newer HDP flush mechanism. Reviewed-by: Christian König Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c index 42f4bd250def62..da01b524b9f2a9 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c @@ -280,17 +280,21 @@ static void sdma_v5_2_ring_emit_hdp_flush(struct amdgpu_ring *ring) u32 ref_and_mask = 0; const struct nbio_hdp_flush_reg *nbio_hf_reg = adev->nbio.hdp_flush_reg; - ref_and_mask = nbio_hf_reg->ref_and_mask_sdma0 << ring->me; - - amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_POLL_REGMEM) | - SDMA_PKT_POLL_REGMEM_HEADER_HDP_FLUSH(1) | - SDMA_PKT_POLL_REGMEM_HEADER_FUNC(3)); /* == */ - amdgpu_ring_write(ring, (adev->nbio.funcs->get_hdp_flush_done_offset(adev)) << 2); - amdgpu_ring_write(ring, (adev->nbio.funcs->get_hdp_flush_req_offset(adev)) << 2); - amdgpu_ring_write(ring, ref_and_mask); /* reference */ - amdgpu_ring_write(ring, ref_and_mask); /* mask */ - amdgpu_ring_write(ring, SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff) | - SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(10)); /* retry count, poll interval */ + if (ring->me > 1) { + amdgpu_asic_flush_hdp(adev, ring); + } else { + ref_and_mask = nbio_hf_reg->ref_and_mask_sdma0 << ring->me; + + amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_POLL_REGMEM) | + SDMA_PKT_POLL_REGMEM_HEADER_HDP_FLUSH(1) | + SDMA_PKT_POLL_REGMEM_HEADER_FUNC(3)); /* == */ + amdgpu_ring_write(ring, (adev->nbio.funcs->get_hdp_flush_done_offset(adev)) << 2); + amdgpu_ring_write(ring, (adev->nbio.funcs->get_hdp_flush_req_offset(adev)) << 2); + amdgpu_ring_write(ring, ref_and_mask); /* reference */ + amdgpu_ring_write(ring, ref_and_mask); /* mask */ + amdgpu_ring_write(ring, SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff) | + SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(10)); /* retry count, poll interval */ + } } /** -- cgit 1.2.3-korg From 948255282074d9367e01908b3f5dcf8c10fc9c3d Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Mon, 22 Apr 2024 16:22:54 +0800 Subject: drm/amdgpu/mes: fix use-after-free issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Delete fence fallback timer to fix the ramdom use-after-free issue. v2: move to amdgpu_mes.c Signed-off-by: Jack Xiao Acked-by: Lijo Lazar Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index a00cf4756ad0e2..1569bef030eac1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -1132,6 +1132,7 @@ void amdgpu_mes_remove_ring(struct amdgpu_device *adev, return; amdgpu_mes_remove_hw_queue(adev, ring->hw_queue_id); + del_timer_sync(&ring->fence_drv.fallback_timer); amdgpu_ring_fini(ring); kfree(ring); } -- cgit 1.2.3-korg