diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index d60efb8f4ba3..dc622af8695c 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -2311,7 +2311,7 @@ intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, void intel_unpin_fb_vma(struct i915_vma *vma, unsigned long flags) { - i915_gem_object_lock(vma->obj); + i915_gem_object_lock(vma->obj, NULL); if (flags & PLANE_HAS_FENCE) i915_vma_unpin_fence(vma); i915_gem_object_unpin_from_display_plane(vma); @@ -3451,7 +3451,7 @@ initial_plane_vma(struct drm_i915_private *i915, if (IS_ERR(vma)) goto err_obj; - if (i915_ggtt_pin(vma, 0, PIN_MAPPABLE | PIN_OFFSET_FIXED | base)) + if (i915_ggtt_pin(vma, NULL, 0, PIN_MAPPABLE | PIN_OFFSET_FIXED | base)) goto err_obj; if (i915_gem_object_is_tiled(obj) && @@ -17194,7 +17194,7 @@ static int intel_framebuffer_init(struct intel_framebuffer *intel_fb, if (!intel_fb->frontbuffer) return -ENOMEM; - i915_gem_object_lock(obj); + i915_gem_object_lock(obj, NULL); tiling = i915_gem_object_get_tiling(obj); stride = i915_gem_object_get_stride(obj); i915_gem_object_unlock(obj); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c b/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c index 278664f831e7..272cf3ea68d5 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c @@ -32,12 +32,13 @@ static void vma_clear_pages(struct i915_vma *vma) vma->pages = NULL; } -static int vma_bind(struct i915_address_space *vm, - struct i915_vma *vma, - enum i915_cache_level cache_level, - u32 flags) +static void vma_bind(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags) { - return vm->vma_ops.bind_vma(vm, vma, cache_level, flags); + vm->vma_ops.bind_vma(vm, stash, vma, cache_level, flags); } static void vma_unbind(struct i915_address_space *vm, struct i915_vma *vma) @@ -157,6 +158,7 @@ static void clear_pages_worker(struct work_struct *work) struct clear_pages_work *w = container_of(work, typeof(*w), work); struct drm_i915_gem_object *obj = w->sleeve->vma->obj; struct i915_vma *vma = w->sleeve->vma; + struct i915_gem_ww_ctx ww; struct i915_request *rq; struct i915_vma *batch; int err = w->dma.error; @@ -172,17 +174,20 @@ static void clear_pages_worker(struct work_struct *work) obj->read_domains = I915_GEM_GPU_DOMAINS; obj->write_domain = 0; - err = i915_vma_pin(vma, 0, 0, PIN_USER); - if (unlikely(err)) + i915_gem_ww_ctx_init(&ww, false); + intel_engine_pm_get(w->ce->engine); +retry: + err = intel_context_pin_ww(w->ce, &ww); + if (err) goto out_signal; - batch = intel_emit_vma_fill_blt(w->ce, vma, w->value); + batch = intel_emit_vma_fill_blt(w->ce, vma, &ww, w->value); if (IS_ERR(batch)) { err = PTR_ERR(batch); - goto out_unpin; + goto out_ctx; } - rq = intel_context_create_request(w->ce); + rq = i915_request_create(w->ce); if (IS_ERR(rq)) { err = PTR_ERR(rq); goto out_batch; @@ -224,9 +229,19 @@ static void clear_pages_worker(struct work_struct *work) i915_request_add(rq); out_batch: intel_emit_vma_release(w->ce, batch); -out_unpin: - i915_vma_unpin(vma); +out_ctx: + intel_context_unpin(w->ce); out_signal: + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); + + i915_vma_unpin(w->sleeve->vma); + intel_engine_pm_put(w->ce->engine); + if (unlikely(err)) { dma_fence_set_error(&w->dma, err); 
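/*
 * [Illustrative sketch -- not part of the patch; example_ww_locked_op() is a
 * hypothetical name.] The hunks above convert clear_pages_worker() to the
 * wait/wound (ww) acquire pattern used throughout this series: set up a ww
 * context, take the object locks and pins under it, and on -EDEADLK back off
 * and retry instead of failing. A minimal skeleton of that pattern, using only
 * the helpers visible in this diff (the second i915_gem_ww_ctx_init() argument
 * presumably selects interruptible waits), might look like:
 */
static int example_ww_locked_op(struct drm_i915_gem_object *obj)
{
	struct i915_gem_ww_ctx ww;
	int err;

	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = i915_gem_object_lock(obj, &ww);
	if (err)
		goto out;

	/* ... pin buffers, emit requests, etc., all under the ww context ... */

out:
	if (err == -EDEADLK) {
		/* drop our locks, wait for the contended one, then retry */
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);
	return err;
}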
dma_fence_signal(&w->dma); @@ -234,6 +249,44 @@ static void clear_pages_worker(struct work_struct *work) } } +static int pin_wait_clear_pages_work(struct clear_pages_work *w, + struct intel_context *ce) +{ + struct i915_vma *vma = w->sleeve->vma; + struct i915_gem_ww_ctx ww; + int err; + + i915_gem_ww_ctx_init(&ww, false); +retry: + err = i915_gem_object_lock(vma->obj, &ww); + if (err) + goto out; + + err = i915_vma_pin_ww(vma, &ww, 0, 0, PIN_USER); + if (unlikely(err)) + goto out; + + err = i915_sw_fence_await_reservation(&w->wait, + vma->obj->base.resv, NULL, + true, 0, I915_FENCE_GFP); + if (err) + goto err_unpin_vma; + + dma_resv_add_excl_fence(vma->obj->base.resv, &w->dma); + +err_unpin_vma: + if (err) + i915_vma_unpin(vma); +out: + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); + return err; +} + static int __i915_sw_fence_call clear_pages_work_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state) @@ -287,17 +340,9 @@ int i915_gem_schedule_fill_pages_blt(struct drm_i915_gem_object *obj, dma_fence_init(&work->dma, &clear_pages_work_ops, &fence_lock, 0, 0); i915_sw_fence_init(&work->wait, clear_pages_work_notify); - i915_gem_object_lock(obj); - err = i915_sw_fence_await_reservation(&work->wait, - obj->base.resv, NULL, true, 0, - I915_FENCE_GFP); - if (err < 0) { + err = pin_wait_clear_pages_work(work, ce); + if (err < 0) dma_fence_set_error(&work->dma, err); - } else { - dma_resv_add_excl_fence(obj->base.resv, &work->dma); - err = 0; - } - i915_gem_object_unlock(obj); dma_fence_get(&work->dma); i915_sw_fence_commit(&work->wait); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index d0bdb6d447ed..cf5ecbde9e06 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -439,29 +439,36 @@ static bool __cancel_engine(struct intel_engine_cs *engine) return __reset_engine(engine); } -static struct intel_engine_cs *__active_engine(struct i915_request *rq) +static bool +__active_engine(struct i915_request *rq, struct intel_engine_cs **active) { struct intel_engine_cs *engine, *locked; + bool ret = false; /* * Serialise with __i915_request_submit() so that it sees * is-banned?, or we know the request is already inflight. + * + * Note that rq->engine is unstable, and so we double + * check that we have acquired the lock on the final engine. 
*/ locked = READ_ONCE(rq->engine); spin_lock_irq(&locked->active.lock); while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) { spin_unlock(&locked->active.lock); - spin_lock(&engine->active.lock); locked = engine; + spin_lock(&locked->active.lock); } - engine = NULL; - if (i915_request_is_active(rq) && rq->fence.error != -EIO) - engine = rq->engine; + if (!i915_request_completed(rq)) { + if (i915_request_is_active(rq) && rq->fence.error != -EIO) + *active = locked; + ret = true; + } spin_unlock_irq(&locked->active.lock); - return engine; + return ret; } static struct intel_engine_cs *active_engine(struct intel_context *ce) @@ -472,17 +479,16 @@ static struct intel_engine_cs *active_engine(struct intel_context *ce) if (!ce->timeline) return NULL; - mutex_lock(&ce->timeline->mutex); - list_for_each_entry_reverse(rq, &ce->timeline->requests, link) { - if (i915_request_completed(rq)) - break; + rcu_read_lock(); + list_for_each_entry_rcu(rq, &ce->timeline->requests, link) { + if (i915_request_is_active(rq) && i915_request_completed(rq)) + continue; /* Check with the backend if the request is inflight */ - engine = __active_engine(rq); - if (engine) + if (__active_engine(rq, &engine)) break; } - mutex_unlock(&ce->timeline->mutex); + rcu_read_unlock(); return engine; } @@ -713,6 +719,7 @@ __create_context(struct drm_i915_private *i915) ctx->i915 = i915; ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL); mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->link); spin_lock_init(&ctx->stale.lock); INIT_LIST_HEAD(&ctx->stale.engines); @@ -740,10 +747,6 @@ __create_context(struct drm_i915_private *i915) for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++) ctx->hang_timestamp[i] = jiffies - CONTEXT_FAST_HANG_JIFFIES; - spin_lock(&i915->gem.contexts.lock); - list_add_tail(&ctx->link, &i915->gem.contexts.list); - spin_unlock(&i915->gem.contexts.lock); - return ctx; err_free: @@ -889,7 +892,7 @@ i915_gem_create_context(struct drm_i915_private *i915, unsigned int flags) if (flags & I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE) { struct intel_timeline *timeline; - timeline = intel_timeline_create(&i915->gt, NULL); + timeline = intel_timeline_create(&i915->gt); if (IS_ERR(timeline)) { context_close(ctx); return ERR_CAST(timeline); @@ -931,6 +934,7 @@ static int gem_context_register(struct i915_gem_context *ctx, struct drm_i915_file_private *fpriv, u32 *id) { + struct drm_i915_private *i915 = ctx->i915; struct i915_address_space *vm; int ret; @@ -949,8 +953,16 @@ static int gem_context_register(struct i915_gem_context *ctx, /* And finally expose ourselves to userspace via the idr */ ret = xa_alloc(&fpriv->context_xa, id, ctx, xa_limit_32b, GFP_KERNEL); if (ret) - put_pid(fetch_and_zero(&ctx->pid)); + goto err_pid; + spin_lock(&i915->gem.contexts.lock); + list_add_tail(&ctx->link, &i915->gem.contexts.list); + spin_unlock(&i915->gem.contexts.lock); + + return 0; + +err_pid: + put_pid(fetch_and_zero(&ctx->pid)); return ret; } @@ -1094,6 +1106,7 @@ I915_SELFTEST_DECLARE(static intel_engine_mask_t context_barrier_inject_fault); static int context_barrier_task(struct i915_gem_context *ctx, intel_engine_mask_t engines, bool (*skip)(struct intel_context *ce, void *data), + int (*pin)(struct intel_context *ce, struct i915_gem_ww_ctx *ww, void *data), int (*emit)(struct i915_request *rq, void *data), void (*task)(void *data), void *data) @@ -1101,6 +1114,7 @@ static int context_barrier_task(struct i915_gem_context *ctx, struct context_barrier_task *cb; struct i915_gem_engines_iter it; struct 
i915_gem_engines *e; + struct i915_gem_ww_ctx ww; struct intel_context *ce; int err = 0; @@ -1138,10 +1152,21 @@ static int context_barrier_task(struct i915_gem_context *ctx, if (skip && skip(ce, data)) continue; - rq = intel_context_create_request(ce); + i915_gem_ww_ctx_init(&ww, true); +retry: + err = intel_context_pin_ww(ce, &ww); + if (err) + goto err; + + if (pin) + err = pin(ce, &ww, data); + if (err) + goto err_unpin; + + rq = i915_request_create(ce); if (IS_ERR(rq)) { err = PTR_ERR(rq); - break; + goto err_unpin; } err = 0; @@ -1151,6 +1176,16 @@ static int context_barrier_task(struct i915_gem_context *ctx, err = i915_active_add_request(&cb->base, rq); i915_request_add(rq); +err_unpin: + intel_context_unpin(ce); +err: + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); + if (err) break; } @@ -1206,6 +1241,17 @@ static void set_ppgtt_barrier(void *data) i915_vm_close(old); } +static int pin_ppgtt_update(struct intel_context *ce, struct i915_gem_ww_ctx *ww, void *data) +{ + struct i915_address_space *vm = ce->vm; + + if (!HAS_LOGICAL_RING_CONTEXTS(vm->i915)) + /* ppGTT is not part of the legacy context image */ + return gen6_ppgtt_pin(i915_vm_to_ppgtt(vm), ww); + + return 0; +} + static int emit_ppgtt_update(struct i915_request *rq, void *data) { struct i915_address_space *vm = rq->context->vm; @@ -1262,20 +1308,10 @@ static int emit_ppgtt_update(struct i915_request *rq, void *data) static bool skip_ppgtt_update(struct intel_context *ce, void *data) { - if (!test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) - return true; - if (HAS_LOGICAL_RING_CONTEXTS(ce->engine->i915)) - return false; - - if (!atomic_read(&ce->pin_count)) - return true; - - /* ppGTT is not part of the legacy context image */ - if (gen6_ppgtt_pin(i915_vm_to_ppgtt(ce->vm))) - return true; - - return false; + return !ce->state; + else + return !atomic_read(&ce->pin_count); } static int set_ppgtt(struct drm_i915_file_private *file_priv, @@ -1326,6 +1362,7 @@ static int set_ppgtt(struct drm_i915_file_private *file_priv, */ err = context_barrier_task(ctx, ALL_ENGINES, skip_ppgtt_update, + pin_ppgtt_update, emit_ppgtt_update, set_ppgtt_barrier, old); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c index 2679380159fc..27fddc22a7c6 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c @@ -128,7 +128,7 @@ static int i915_gem_begin_cpu_access(struct dma_buf *dma_buf, enum dma_data_dire if (err) return err; - err = i915_gem_object_lock_interruptible(obj); + err = i915_gem_object_lock_interruptible(obj, NULL); if (err) goto out; @@ -149,7 +149,7 @@ static int i915_gem_end_cpu_access(struct dma_buf *dma_buf, enum dma_data_direct if (err) return err; - err = i915_gem_object_lock_interruptible(obj); + err = i915_gem_object_lock_interruptible(obj, NULL); if (err) goto out; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c index 7f76fc68f498..7c90a63c273d 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c @@ -32,11 +32,17 @@ void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj) if (!i915_gem_object_is_framebuffer(obj)) return; - i915_gem_object_lock(obj); + i915_gem_object_lock(obj, NULL); __i915_gem_object_flush_for_display(obj); i915_gem_object_unlock(obj); } +void i915_gem_object_flush_if_display_locked(struct drm_i915_gem_object *obj) +{ + if 
(i915_gem_object_is_framebuffer(obj)) + __i915_gem_object_flush_for_display(obj); +} + /** * Moves a single object to the WC read, and possibly write domain. * @obj: object to act on @@ -197,18 +203,12 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj, if (ret) return ret; - ret = i915_gem_object_lock_interruptible(obj); - if (ret) - return ret; - /* Always invalidate stale cachelines */ if (obj->cache_level != cache_level) { i915_gem_object_set_cache_coherency(obj, cache_level); obj->cache_dirty = true; } - i915_gem_object_unlock(obj); - /* The cache-level will be applied when each vma is rebound. */ return i915_gem_object_unbind(obj, I915_GEM_OBJECT_UNBIND_ACTIVE | @@ -293,7 +293,12 @@ int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data, goto out; } + ret = i915_gem_object_lock_interruptible(obj, NULL); + if (ret) + goto out; + ret = i915_gem_object_set_cache_level(obj, level); + i915_gem_object_unlock(obj); out: i915_gem_object_put(obj); @@ -313,6 +318,7 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, unsigned int flags) { struct drm_i915_private *i915 = to_i915(obj->base.dev); + struct i915_gem_ww_ctx ww; struct i915_vma *vma; int ret; @@ -320,6 +326,11 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, if (HAS_LMEM(i915) && !i915_gem_object_is_lmem(obj)) return ERR_PTR(-EINVAL); + i915_gem_ww_ctx_init(&ww, true); +retry: + ret = i915_gem_object_lock(obj, &ww); + if (ret) + goto err; /* * The display engine is not coherent with the LLC cache on gen6. As * a result, we make sure that the pinning that is about to occur is @@ -334,7 +345,7 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE); if (ret) - return ERR_PTR(ret); + goto err; /* * As the user may map the buffer once pinned in the display plane @@ -347,18 +358,31 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, vma = ERR_PTR(-ENOSPC); if ((flags & PIN_MAPPABLE) == 0 && (!view || view->type == I915_GGTT_VIEW_NORMAL)) - vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, - flags | - PIN_MAPPABLE | - PIN_NONBLOCK); - if (IS_ERR(vma)) - vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags); - if (IS_ERR(vma)) - return vma; + vma = i915_gem_object_ggtt_pin_ww(obj, &ww, view, 0, alignment, + flags | PIN_MAPPABLE | + PIN_NONBLOCK); + if (IS_ERR(vma) && vma != ERR_PTR(-EDEADLK)) + vma = i915_gem_object_ggtt_pin_ww(obj, &ww, view, 0, + alignment, flags); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto err; + } vma->display_alignment = max_t(u64, vma->display_alignment, alignment); - i915_gem_object_flush_if_display(obj); + i915_gem_object_flush_if_display_locked(obj); + +err: + if (ret == -EDEADLK) { + ret = i915_gem_ww_ctx_backoff(&ww); + if (!ret) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); + + if (ret) + return ERR_PTR(ret); return vma; } @@ -536,7 +560,7 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data, if (err) goto out; - err = i915_gem_object_lock_interruptible(obj); + err = i915_gem_object_lock_interruptible(obj, NULL); if (err) goto out_unpin; @@ -576,19 +600,17 @@ int i915_gem_object_prepare_read(struct drm_i915_gem_object *obj, if (!i915_gem_object_has_struct_page(obj)) return -ENODEV; - ret = i915_gem_object_lock_interruptible(obj); - if (ret) - return ret; + assert_object_held(obj); ret = i915_gem_object_wait(obj, I915_WAIT_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); if (ret) - goto err_unlock; + return ret; ret = 
i915_gem_object_pin_pages(obj); if (ret) - goto err_unlock; + return ret; if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ || !static_cpu_has(X86_FEATURE_CLFLUSH)) { @@ -616,8 +638,6 @@ int i915_gem_object_prepare_read(struct drm_i915_gem_object *obj, err_unpin: i915_gem_object_unpin_pages(obj); -err_unlock: - i915_gem_object_unlock(obj); return ret; } @@ -630,20 +650,18 @@ int i915_gem_object_prepare_write(struct drm_i915_gem_object *obj, if (!i915_gem_object_has_struct_page(obj)) return -ENODEV; - ret = i915_gem_object_lock_interruptible(obj); - if (ret) - return ret; + assert_object_held(obj); ret = i915_gem_object_wait(obj, I915_WAIT_INTERRUPTIBLE | I915_WAIT_ALL, MAX_SCHEDULE_TIMEOUT); if (ret) - goto err_unlock; + return ret; ret = i915_gem_object_pin_pages(obj); if (ret) - goto err_unlock; + return ret; if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE || !static_cpu_has(X86_FEATURE_CLFLUSH)) { @@ -680,7 +698,5 @@ int i915_gem_object_prepare_write(struct drm_i915_gem_object *obj, err_unpin: i915_gem_object_unpin_pages(obj); -err_unlock: - i915_gem_object_unlock(obj); return ret; } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 322642fb765f..804339255df1 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -41,11 +41,6 @@ struct eb_vma { u32 handle; }; -struct eb_vma_array { - struct kref kref; - struct eb_vma vma[]; -}; - enum { FORCE_CPU_RELOC = 1, FORCE_GTT_RELOC, @@ -58,9 +53,11 @@ enum { #define __EXEC_OBJECT_NEEDS_MAP BIT(29) #define __EXEC_OBJECT_NEEDS_BIAS BIT(28) #define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 28) /* all of the above */ +#define __EXEC_OBJECT_RESERVED (__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE) #define __EXEC_HAS_RELOC BIT(31) -#define __EXEC_INTERNAL_FLAGS (~0u << 31) +#define __EXEC_ENGINE_PINNED BIT(30) +#define __EXEC_INTERNAL_FLAGS (~0u << 30) #define UPDATE PIN_OFFSET_FIXED #define BATCH_OFFSET_BIAS (256*1024) @@ -261,6 +258,8 @@ struct i915_execbuffer { /** list of vma that have execobj.relocation_count */ struct list_head relocs; + struct i915_gem_ww_ctx ww; + /** * Track the most recently used object for relocations, as we * frequently have to perform multiple relocations within the same @@ -276,19 +275,22 @@ struct i915_execbuffer { bool has_fence : 1; bool needs_unfenced : 1; - struct i915_vma *target; struct i915_request *rq; - struct i915_vma *rq_vma; u32 *rq_cmd; unsigned int rq_size; + struct intel_gt_buffer_pool_node *pool; } reloc_cache; + struct intel_gt_buffer_pool_node *reloc_pool; /** relocation pool for -EDEADLK handling */ + struct intel_context *reloc_context; + u64 invalid_flags; /** Set of execobj.flags that are invalid */ u32 context_flags; /** Set of execobj.flags to insert from the ctx */ u32 batch_start_offset; /** Location within object of batch */ u32 batch_len; /** Length of batch within object */ u32 batch_flags; /** Flags composed for emit_bb_start() */ + struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch buffer */ /** * Indicate either the size of the hastable used to resolve @@ -297,12 +299,16 @@ struct i915_execbuffer { */ int lut_size; struct hlist_head *buckets; /** ht for relocation handles */ - struct eb_vma_array *array; struct eb_fence *fences; unsigned long num_fences; }; +static int eb_parse(struct i915_execbuffer *eb); +static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, + bool throttle); +static void eb_unpin_engine(struct 
i915_execbuffer *eb); + static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) { return intel_engine_requires_cmd_parser(eb->engine) || @@ -310,62 +316,8 @@ static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) eb->args->batch_len); } -static struct eb_vma_array *eb_vma_array_create(unsigned int count) -{ - struct eb_vma_array *arr; - - arr = kvmalloc(struct_size(arr, vma, count), GFP_KERNEL | __GFP_NOWARN); - if (!arr) - return NULL; - - kref_init(&arr->kref); - arr->vma[0].vma = NULL; - - return arr; -} - -static inline void eb_unreserve_vma(struct eb_vma *ev) -{ - struct i915_vma *vma = ev->vma; - - if (unlikely(ev->flags & __EXEC_OBJECT_HAS_FENCE)) - __i915_vma_unpin_fence(vma); - - if (ev->flags & __EXEC_OBJECT_HAS_PIN) - __i915_vma_unpin(vma); - - ev->flags &= ~(__EXEC_OBJECT_HAS_PIN | - __EXEC_OBJECT_HAS_FENCE); -} - -static void eb_vma_array_destroy(struct kref *kref) -{ - struct eb_vma_array *arr = container_of(kref, typeof(*arr), kref); - struct eb_vma *ev = arr->vma; - - while (ev->vma) { - eb_unreserve_vma(ev); - i915_vma_put(ev->vma); - ev++; - } - - kvfree(arr); -} - -static void eb_vma_array_put(struct eb_vma_array *arr) -{ - kref_put(&arr->kref, eb_vma_array_destroy); -} - static int eb_create(struct i915_execbuffer *eb) { - /* Allocate an extra slot for use by the command parser + sentinel */ - eb->array = eb_vma_array_create(eb->buffer_count + 2); - if (!eb->array) - return -ENOMEM; - - eb->vma = eb->array->vma; - if (!(eb->args->flags & I915_EXEC_HANDLE_LUT)) { unsigned int size = 1 + ilog2(eb->buffer_count); @@ -399,10 +351,8 @@ static int eb_create(struct i915_execbuffer *eb) break; } while (--size); - if (unlikely(!size)) { - eb_vma_array_put(eb->array); + if (unlikely(!size)) return -ENOMEM; - } eb->lut_size = size; } else { @@ -486,16 +436,17 @@ eb_pin_vma(struct i915_execbuffer *eb, pin_flags |= PIN_GLOBAL; /* Attempt to reuse the current location if available */ - if (unlikely(i915_vma_pin(vma, 0, 0, pin_flags))) { + /* TODO: Add -EDEADLK handling here */ + if (unlikely(i915_vma_pin_ww(vma, &eb->ww, 0, 0, pin_flags))) { if (entry->flags & EXEC_OBJECT_PINNED) return false; /* Failing that pick any _free_ space if suitable */ - if (unlikely(i915_vma_pin(vma, - entry->pad_to_size, - entry->alignment, - eb_pin_flags(entry, ev->flags) | - PIN_USER | PIN_NOEVICT))) + if (unlikely(i915_vma_pin_ww(vma, &eb->ww, + entry->pad_to_size, + entry->alignment, + eb_pin_flags(entry, ev->flags) | + PIN_USER | PIN_NOEVICT))) return false; } @@ -513,6 +464,19 @@ eb_pin_vma(struct i915_execbuffer *eb, return !eb_vma_misplaced(entry, vma, ev->flags); } +static inline void +eb_unreserve_vma(struct eb_vma *ev) +{ + if (!(ev->flags & __EXEC_OBJECT_HAS_PIN)) + return; + + if (unlikely(ev->flags & __EXEC_OBJECT_HAS_FENCE)) + __i915_vma_unpin_fence(ev->vma); + + __i915_vma_unpin(ev->vma); + ev->flags &= ~__EXEC_OBJECT_RESERVED; +} + static int eb_validate_vma(struct i915_execbuffer *eb, struct drm_i915_gem_exec_object2 *entry, @@ -604,16 +568,6 @@ eb_add_vma(struct i915_execbuffer *eb, eb->batch = ev; } - - if (eb_pin_vma(eb, entry, ev)) { - if (entry->offset != vma->node.start) { - entry->offset = vma->node.start | UPDATE; - eb->args->flags |= __EXEC_HAS_RELOC; - } - } else { - eb_unreserve_vma(ev); - list_add_tail(&ev->bind_link, &eb->unbound); - } } static inline int use_cpu_reloc(const struct reloc_cache *cache, @@ -633,7 +587,7 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache, obj->cache_level != I915_CACHE_NONE); } -static int 
eb_reserve_vma(const struct i915_execbuffer *eb, +static int eb_reserve_vma(struct i915_execbuffer *eb, struct eb_vma *ev, u64 pin_flags) { @@ -648,7 +602,7 @@ static int eb_reserve_vma(const struct i915_execbuffer *eb, return err; } - err = i915_vma_pin(vma, + err = i915_vma_pin_ww(vma, &eb->ww, entry->pad_to_size, entry->alignment, eb_pin_flags(entry, ev->flags) | pin_flags); if (err) @@ -698,10 +652,6 @@ static int eb_reserve(struct i915_execbuffer *eb) * This avoid unnecessary unbinding of later objects in order to make * room for the earlier objects *unless* we need to defragment. */ - - if (mutex_lock_interruptible(&eb->i915->drm.struct_mutex)) - return -EINTR; - pass = 0; do { list_for_each_entry(ev, &eb->unbound, bind_link) { @@ -709,8 +659,8 @@ static int eb_reserve(struct i915_execbuffer *eb) if (err) break; } - if (!(err == -ENOSPC || err == -EAGAIN)) - break; + if (err != -ENOSPC) + return err; /* Resort *all* the objects into priority order */ INIT_LIST_HEAD(&eb->unbound); @@ -740,13 +690,6 @@ static int eb_reserve(struct i915_execbuffer *eb) } list_splice_tail(&last, &eb->unbound); - if (err == -EAGAIN) { - mutex_unlock(&eb->i915->drm.struct_mutex); - flush_workqueue(eb->i915->mm.userptr_wq); - mutex_lock(&eb->i915->drm.struct_mutex); - continue; - } - switch (pass++) { case 0: break; @@ -757,20 +700,15 @@ static int eb_reserve(struct i915_execbuffer *eb) err = i915_gem_evict_vm(eb->context->vm); mutex_unlock(&eb->context->vm->mutex); if (err) - goto unlock; + return err; break; default: - err = -ENOSPC; - goto unlock; + return -ENOSPC; } pin_flags = PIN_USER; } while (1); - -unlock: - mutex_unlock(&eb->i915->drm.struct_mutex); - return err; } static unsigned int eb_batch_index(const struct i915_execbuffer *eb) @@ -893,12 +831,12 @@ static struct i915_vma *eb_lookup_vma(struct i915_execbuffer *eb, u32 handle) static int eb_lookup_vmas(struct i915_execbuffer *eb) { + struct drm_i915_private *i915 = eb->i915; unsigned int batch = eb_batch_index(eb); unsigned int i; int err = 0; INIT_LIST_HEAD(&eb->relocs); - INIT_LIST_HEAD(&eb->unbound); for (i = 0; i < eb->buffer_count; i++) { struct i915_vma *vma; @@ -906,22 +844,83 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb) vma = eb_lookup_vma(eb, eb->exec[i].handle); if (IS_ERR(vma)) { err = PTR_ERR(vma); - break; + goto err; } err = eb_validate_vma(eb, &eb->exec[i], vma); if (unlikely(err)) { i915_vma_put(vma); - break; + goto err; } eb_add_vma(eb, i, batch, vma); } + if (unlikely(eb->batch->flags & EXEC_OBJECT_WRITE)) { + drm_dbg(&i915->drm, + "Attempting to use self-modifying batch buffer\n"); + return -EINVAL; + } + + if (range_overflows_t(u64, + eb->batch_start_offset, eb->batch_len, + eb->batch->vma->size)) { + drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n"); + return -EINVAL; + } + + if (eb->batch_len == 0) + eb->batch_len = eb->batch->vma->size - eb->batch_start_offset; + + return 0; + +err: eb->vma[i].vma = NULL; return err; } +static int eb_validate_vmas(struct i915_execbuffer *eb) +{ + unsigned int i; + int err; + + INIT_LIST_HEAD(&eb->unbound); + + for (i = 0; i < eb->buffer_count; i++) { + struct drm_i915_gem_exec_object2 *entry = &eb->exec[i]; + struct eb_vma *ev = &eb->vma[i]; + struct i915_vma *vma = ev->vma; + + err = i915_gem_object_lock(vma->obj, &eb->ww); + if (err) + return err; + + if (eb_pin_vma(eb, entry, ev)) { + if (entry->offset != vma->node.start) { + entry->offset = vma->node.start | UPDATE; + eb->args->flags |= __EXEC_HAS_RELOC; + } + } else { + eb_unreserve_vma(ev); + + 
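/*
 * [Illustrative sketch -- not part of the patch; example_pin_under_ww() is a
 * hypothetical name.] The reservation path above now threads the execbuf ww
 * context into every pin: i915_vma_pin(vma, size, align, flags) becomes
 * i915_vma_pin_ww(vma, &eb->ww, size, align, flags), presumably so that any
 * object locks taken while binding or evicting join the same acquire context
 * and report -EDEADLK back to the caller's retry loop. For a caller that
 * already owns the ww context the conversion is mechanical:
 */
static int example_pin_under_ww(struct i915_execbuffer *eb,
				struct i915_vma *vma, u64 size, u64 align)
{
	/* old form: return i915_vma_pin(vma, size, align, PIN_USER); */
	return i915_vma_pin_ww(vma, &eb->ww, size, align, PIN_USER);
}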
list_add_tail(&ev->bind_link, &eb->unbound); + if (drm_mm_node_allocated(&vma->node)) { + err = i915_vma_unbind(vma); + if (err) + return err; + } + } + + GEM_BUG_ON(drm_mm_node_allocated(&vma->node) && + eb_vma_misplaced(&eb->exec[i], vma, ev->flags)); + } + + if (!list_empty(&eb->unbound)) + return eb_reserve(eb); + + return 0; +} + static struct eb_vma * eb_get_vma(const struct i915_execbuffer *eb, unsigned long handle) { @@ -942,13 +941,31 @@ eb_get_vma(const struct i915_execbuffer *eb, unsigned long handle) } } +static void eb_release_vmas(struct i915_execbuffer *eb, bool final) +{ + const unsigned int count = eb->buffer_count; + unsigned int i; + + for (i = 0; i < count; i++) { + struct eb_vma *ev = &eb->vma[i]; + struct i915_vma *vma = ev->vma; + + if (!vma) + break; + + eb_unreserve_vma(ev); + + if (final) + i915_vma_put(vma); + } + + eb_unpin_engine(eb); +} + static void eb_destroy(const struct i915_execbuffer *eb) { GEM_BUG_ON(eb->reloc_cache.rq); - if (eb->array) - eb_vma_array_put(eb->array); - if (eb->lut_size > 0) kfree(eb->buckets); } @@ -960,6 +977,14 @@ relocation_target(const struct drm_i915_gem_relocation_entry *reloc, return gen8_canonical_addr((int)reloc->delta + target->node.start); } +static void reloc_cache_clear(struct reloc_cache *cache) +{ + cache->rq = NULL; + cache->rq_cmd = NULL; + cache->pool = NULL; + cache->rq_size = 0; +} + static void reloc_cache_init(struct reloc_cache *cache, struct drm_i915_private *i915) { @@ -972,8 +997,7 @@ static void reloc_cache_init(struct reloc_cache *cache, cache->has_fence = cache->gen < 4; cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment; cache->node.flags = 0; - cache->rq = NULL; - cache->target = NULL; + reloc_cache_clear(cache); } static inline void *unmask_page(unsigned long p) @@ -995,132 +1019,60 @@ static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache) return &i915->ggtt; } -#define RELOC_TAIL 4 - -static int reloc_gpu_chain(struct reloc_cache *cache) +static void reloc_cache_put_pool(struct i915_execbuffer *eb, struct reloc_cache *cache) { - struct intel_gt_buffer_pool_node *pool; - struct i915_request *rq = cache->rq; - struct i915_vma *batch; - u32 *cmd; - int err; + if (!cache->pool) + return; - pool = intel_gt_get_buffer_pool(rq->engine->gt, PAGE_SIZE); - if (IS_ERR(pool)) - return PTR_ERR(pool); - - batch = i915_vma_instance(pool->obj, rq->context->vm, NULL); - if (IS_ERR(batch)) { - err = PTR_ERR(batch); - goto out_pool; - } - - err = i915_vma_pin(batch, 0, 0, PIN_USER | PIN_NONBLOCK); - if (err) - goto out_pool; - - GEM_BUG_ON(cache->rq_size + RELOC_TAIL > PAGE_SIZE / sizeof(u32)); - cmd = cache->rq_cmd + cache->rq_size; - *cmd++ = MI_ARB_CHECK; - if (cache->gen >= 8) - *cmd++ = MI_BATCH_BUFFER_START_GEN8; - else if (cache->gen >= 6) - *cmd++ = MI_BATCH_BUFFER_START; - else - *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; - *cmd++ = lower_32_bits(batch->node.start); - *cmd++ = upper_32_bits(batch->node.start); /* Always 0 for gen<8 */ - i915_gem_object_flush_map(cache->rq_vma->obj); - i915_gem_object_unpin_map(cache->rq_vma->obj); - cache->rq_vma = NULL; - - err = intel_gt_buffer_pool_mark_active(pool, rq); - if (err == 0) { - i915_vma_lock(batch); - err = i915_request_await_object(rq, batch->obj, false); - if (err == 0) - err = i915_vma_move_to_active(batch, rq, 0); - i915_vma_unlock(batch); - } - i915_vma_unpin(batch); - if (err) - goto out_pool; - - cmd = i915_gem_object_pin_map(batch->obj, - cache->has_llc ? 
- I915_MAP_FORCE_WB : - I915_MAP_FORCE_WC); - if (IS_ERR(cmd)) { - err = PTR_ERR(cmd); - goto out_pool; - } - - /* Return with batch mapping (cmd) still pinned */ - cache->rq_cmd = cmd; - cache->rq_size = 0; - cache->rq_vma = batch; - -out_pool: - intel_gt_buffer_pool_put(pool); - return err; + /* + * This is a bit nasty, normally we keep objects locked until the end + * of execbuffer, but we already submit this, and have to unlock before + * dropping the reference. Fortunately we can only hold 1 pool node at + * a time, so this should be harmless. + */ + i915_gem_ww_unlock_single(cache->pool->obj); + intel_gt_buffer_pool_put(cache->pool); + cache->pool = NULL; } -static unsigned int reloc_bb_flags(const struct reloc_cache *cache) +static void reloc_gpu_flush(struct i915_execbuffer *eb, struct reloc_cache *cache) { - return cache->gen > 5 ? 0 : I915_DISPATCH_SECURE; + struct drm_i915_gem_object *obj = cache->rq->batch->obj; + + GEM_BUG_ON(cache->rq_size >= obj->base.size / sizeof(u32)); + cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END; + + __i915_gem_object_flush_map(obj, 0, sizeof(u32) * (cache->rq_size + 1)); + i915_gem_object_unpin_map(obj); + + intel_gt_chipset_flush(cache->rq->engine->gt); + + i915_request_add(cache->rq); + reloc_cache_put_pool(eb, cache); + reloc_cache_clear(cache); + + eb->reloc_pool = NULL; } -static int reloc_gpu_flush(struct reloc_cache *cache) -{ - struct i915_request *rq; - int err; - - rq = fetch_and_zero(&cache->rq); - if (!rq) - return 0; - - if (cache->rq_vma) { - struct drm_i915_gem_object *obj = cache->rq_vma->obj; - - GEM_BUG_ON(cache->rq_size >= obj->base.size / sizeof(u32)); - cache->rq_cmd[cache->rq_size++] = MI_BATCH_BUFFER_END; - - __i915_gem_object_flush_map(obj, - 0, sizeof(u32) * cache->rq_size); - i915_gem_object_unpin_map(obj); - } - - err = 0; - if (rq->engine->emit_init_breadcrumb) - err = rq->engine->emit_init_breadcrumb(rq); - if (!err) - err = rq->engine->emit_bb_start(rq, - rq->batch->node.start, - PAGE_SIZE, - reloc_bb_flags(cache)); - if (err) - i915_request_set_error_once(rq, err); - - intel_gt_chipset_flush(rq->engine->gt); - i915_request_add(rq); - - return err; -} - -static void reloc_cache_reset(struct reloc_cache *cache) +static void reloc_cache_reset(struct reloc_cache *cache, struct i915_execbuffer *eb) { void *vaddr; + if (cache->rq) + reloc_gpu_flush(eb, cache); + if (!cache->vaddr) return; vaddr = unmask_page(cache->vaddr); if (cache->vaddr & KMAP) { + struct drm_i915_gem_object *obj = + (struct drm_i915_gem_object *)cache->node.mm; if (cache->vaddr & CLFLUSH_AFTER) mb(); kunmap_atomic(vaddr); - i915_gem_object_finish_access((struct drm_i915_gem_object *)cache->node.mm); + i915_gem_object_finish_access(obj); } else { struct i915_ggtt *ggtt = cache_to_ggtt(cache); @@ -1145,9 +1097,10 @@ static void reloc_cache_reset(struct reloc_cache *cache) static void *reloc_kmap(struct drm_i915_gem_object *obj, struct reloc_cache *cache, - unsigned long page) + unsigned long pageno) { void *vaddr; + struct page *page; if (cache->vaddr) { kunmap_atomic(unmask_page(cache->vaddr)); @@ -1168,17 +1121,22 @@ static void *reloc_kmap(struct drm_i915_gem_object *obj, mb(); } - vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page)); + page = i915_gem_object_get_page(obj, pageno); + if (!obj->mm.dirty) + set_page_dirty(page); + + vaddr = kmap_atomic(page); cache->vaddr = unmask_flags(cache->vaddr) | (unsigned long)vaddr; - cache->page = page; + cache->page = pageno; return vaddr; } static void *reloc_iomap(struct drm_i915_gem_object 
*obj, - struct reloc_cache *cache, + struct i915_execbuffer *eb, unsigned long page) { + struct reloc_cache *cache = &eb->reloc_cache; struct i915_ggtt *ggtt = cache_to_ggtt(cache); unsigned long offset; void *vaddr; @@ -1196,16 +1154,17 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj, if (use_cpu_reloc(cache, obj)) return NULL; - i915_gem_object_lock(obj); err = i915_gem_object_set_to_gtt_domain(obj, true); - i915_gem_object_unlock(obj); if (err) return ERR_PTR(err); - vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, - PIN_MAPPABLE | - PIN_NONBLOCK /* NOWARN */ | - PIN_NOEVICT); + vma = i915_gem_object_ggtt_pin_ww(obj, &eb->ww, NULL, 0, 0, + PIN_MAPPABLE | + PIN_NONBLOCK /* NOWARN */ | + PIN_NOEVICT); + if (vma == ERR_PTR(-EDEADLK)) + return vma; + if (IS_ERR(vma)) { memset(&cache->node, 0, sizeof(cache->node)); mutex_lock(&ggtt->vm.mutex); @@ -1241,9 +1200,10 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj, } static void *reloc_vaddr(struct drm_i915_gem_object *obj, - struct reloc_cache *cache, + struct i915_execbuffer *eb, unsigned long page) { + struct reloc_cache *cache = &eb->reloc_cache; void *vaddr; if (cache->page == page) { @@ -1251,7 +1211,7 @@ static void *reloc_vaddr(struct drm_i915_gem_object *obj, } else { vaddr = NULL; if ((cache->vaddr & KMAP) == 0) - vaddr = reloc_iomap(obj, cache, page); + vaddr = reloc_iomap(obj, eb, page); if (!vaddr) vaddr = reloc_kmap(obj, cache, page); } @@ -1287,7 +1247,7 @@ static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma) struct drm_i915_gem_object *obj = vma->obj; int err; - i915_vma_lock(vma); + assert_vma_held(vma); if (obj->cache_dirty & ~obj->cache_coherent) i915_gem_clflush_object(obj, 0); @@ -1297,25 +1257,31 @@ static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma) if (err == 0) err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); - i915_vma_unlock(vma); - return err; } static int __reloc_gpu_alloc(struct i915_execbuffer *eb, struct intel_engine_cs *engine, + struct i915_vma *vma, unsigned int len) { struct reloc_cache *cache = &eb->reloc_cache; - struct intel_gt_buffer_pool_node *pool; + struct intel_gt_buffer_pool_node *pool = eb->reloc_pool; struct i915_request *rq; struct i915_vma *batch; u32 *cmd; int err; - pool = intel_gt_get_buffer_pool(engine->gt, PAGE_SIZE); - if (IS_ERR(pool)) - return PTR_ERR(pool); + if (!pool) { + pool = intel_gt_get_buffer_pool(engine->gt, PAGE_SIZE); + if (IS_ERR(pool)) + return PTR_ERR(pool); + } + eb->reloc_pool = NULL; + + err = i915_gem_object_lock(pool->obj, &eb->ww); + if (err) + goto err_pool; cmd = i915_gem_object_pin_map(pool->obj, cache->has_llc ? 
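/*
 * [Illustrative sketch -- not part of the patch; example_alloc_once() is a
 * hypothetical name.] Because the whole execbuf path can now unwind with
 * -EDEADLK and run again, __reloc_gpu_alloc() above keeps its buffer-pool node
 * cached on the i915_execbuffer (eb->reloc_pool) rather than releasing it on
 * error, so a retry can reuse it instead of reallocating. The shape of that
 * allocate-once/park-on-failure pattern, assuming the helpers shown in this
 * hunk:
 */
static int example_alloc_once(struct i915_execbuffer *eb)
{
	struct intel_gt_buffer_pool_node *pool = eb->reloc_pool;
	int err;

	if (!pool) {
		pool = intel_gt_get_buffer_pool(eb->engine->gt, PAGE_SIZE);
		if (IS_ERR(pool))
			return PTR_ERR(pool);
	}
	eb->reloc_pool = NULL;	/* this attempt owns the node */

	err = i915_gem_object_lock(pool->obj, &eb->ww);
	if (err)
		goto err_pool;

	/* ... map the object and build the relocation batch ... */
	return 0;

err_pool:
	eb->reloc_pool = pool;	/* park the node for the -EDEADLK retry */
	return err;
}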
@@ -1323,35 +1289,42 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb, I915_MAP_FORCE_WC); if (IS_ERR(cmd)) { err = PTR_ERR(cmd); - goto out_pool; + goto err_pool; } - batch = i915_vma_instance(pool->obj, eb->context->vm, NULL); + batch = i915_vma_instance(pool->obj, vma->vm, NULL); if (IS_ERR(batch)) { err = PTR_ERR(batch); goto err_unmap; } - err = i915_vma_pin(batch, 0, 0, PIN_USER | PIN_NONBLOCK); + err = i915_vma_pin_ww(batch, &eb->ww, 0, 0, PIN_USER | PIN_NONBLOCK); if (err) goto err_unmap; if (engine == eb->context->engine) { rq = i915_request_create(eb->context); } else { - struct intel_context *ce; + struct intel_context *ce = eb->reloc_context; - ce = intel_context_create(engine); - if (IS_ERR(ce)) { - err = PTR_ERR(ce); - goto err_unpin; + if (!ce) { + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto err_unpin; + } + + i915_vm_put(ce->vm); + ce->vm = i915_vm_get(eb->context->vm); + eb->reloc_context = ce; } - i915_vm_put(ce->vm); - ce->vm = i915_vm_get(eb->context->vm); + err = intel_context_pin_ww(ce, &eb->ww); + if (err) + goto err_unpin; - rq = intel_context_create_request(ce); - intel_context_put(ce); + rq = i915_request_create(ce); + intel_context_unpin(ce); } if (IS_ERR(rq)) { err = PTR_ERR(rq); @@ -1362,11 +1335,20 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb, if (err) goto err_request; - i915_vma_lock(batch); + err = reloc_move_to_gpu(rq, vma); + if (err) + goto err_request; + + err = eb->engine->emit_bb_start(rq, + batch->node.start, PAGE_SIZE, + cache->gen > 5 ? 0 : I915_DISPATCH_SECURE); + if (err) + goto skip_request; + + assert_vma_held(batch); err = i915_request_await_object(rq, batch->obj, false); if (err == 0) err = i915_vma_move_to_active(batch, rq, 0); - i915_vma_unlock(batch); if (err) goto skip_request; @@ -1376,10 +1358,10 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb, cache->rq = rq; cache->rq_cmd = cmd; cache->rq_size = 0; - cache->rq_vma = batch; + cache->pool = pool; /* Return with batch mapping (cmd) still pinned */ - goto out_pool; + return 0; skip_request: i915_request_set_error_once(rq, err); @@ -1389,8 +1371,8 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb, i915_vma_unpin(batch); err_unmap: i915_gem_object_unpin_map(pool->obj); -out_pool: - intel_gt_buffer_pool_put(pool); +err_pool: + eb->reloc_pool = pool; return err; } @@ -1405,9 +1387,12 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb, { struct reloc_cache *cache = &eb->reloc_cache; u32 *cmd; - int err; + + if (cache->rq_size > PAGE_SIZE/sizeof(u32) - (len + 1)) + reloc_gpu_flush(eb, cache); if (unlikely(!cache->rq)) { + int err; struct intel_engine_cs *engine = eb->engine; if (!reloc_can_use_engine(engine)) { @@ -1416,31 +1401,11 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb, return ERR_PTR(-ENODEV); } - err = __reloc_gpu_alloc(eb, engine, len); + err = __reloc_gpu_alloc(eb, engine, vma, len); if (unlikely(err)) return ERR_PTR(err); } - if (vma != cache->target) { - err = reloc_move_to_gpu(cache->rq, vma); - if (unlikely(err)) { - i915_request_set_error_once(cache->rq, err); - return ERR_PTR(err); - } - - cache->target = vma; - } - - if (unlikely(cache->rq_size + len > - PAGE_SIZE / sizeof(u32) - RELOC_TAIL)) { - err = reloc_gpu_chain(cache); - if (unlikely(err)) { - i915_request_set_error_once(cache->rq, err); - return ERR_PTR(err); - } - } - - GEM_BUG_ON(cache->rq_size + len >= PAGE_SIZE / sizeof(u32)); cmd = cache->rq_cmd + cache->rq_size; cache->rq_size += len; @@ -1490,7 +1455,9 @@ static bool 
__reloc_entry_gpu(struct i915_execbuffer *eb, len = 3; batch = reloc_gpu(eb, vma, len); - if (IS_ERR(batch)) + if (batch == ERR_PTR(-EDEADLK)) + return (s64)-EDEADLK; + else if (IS_ERR(batch)) return false; addr = gen8_canonical_addr(vma->node.start + offset); @@ -1543,7 +1510,7 @@ static bool __reloc_entry_gpu(struct i915_execbuffer *eb, return true; } -static bool reloc_entry_gpu(struct i915_execbuffer *eb, +static int reloc_entry_gpu(struct i915_execbuffer *eb, struct i915_vma *vma, u64 offset, u64 target_addr) @@ -1565,14 +1532,17 @@ relocate_entry(struct i915_vma *vma, { u64 target_addr = relocation_target(reloc, target); u64 offset = reloc->offset; + int reloc_gpu = reloc_entry_gpu(eb, vma, offset, target_addr); - if (!reloc_entry_gpu(eb, vma, offset, target_addr)) { + if (reloc_gpu < 0) + return reloc_gpu; + + if (!reloc_gpu) { bool wide = eb->reloc_cache.use_64bit_reloc; void *vaddr; repeat: - vaddr = reloc_vaddr(vma->obj, - &eb->reloc_cache, + vaddr = reloc_vaddr(vma->obj, eb, offset >> PAGE_SHIFT); if (IS_ERR(vaddr)) return PTR_ERR(vaddr); @@ -1723,7 +1693,9 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev) * we would try to acquire the struct mutex again. Obviously * this is bad and so lockdep complains vehemently. */ - copied = __copy_from_user(r, urelocs, count * sizeof(r[0])); + pagefault_disable(); + copied = __copy_from_user_inatomic(r, urelocs, count * sizeof(r[0])); + pagefault_enable(); if (unlikely(copied)) { remain = -EFAULT; goto out; @@ -1767,28 +1739,354 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev) urelocs += ARRAY_SIZE(stack); } while (remain); out: - reloc_cache_reset(&eb->reloc_cache); + reloc_cache_reset(&eb->reloc_cache, eb); return remain; } -static int eb_relocate(struct i915_execbuffer *eb) +static int +eb_relocate_vma_slow(struct i915_execbuffer *eb, struct eb_vma *ev) { + const struct drm_i915_gem_exec_object2 *entry = ev->exec; + struct drm_i915_gem_relocation_entry *relocs = + u64_to_ptr(typeof(*relocs), entry->relocs_ptr); + unsigned int i; int err; - err = eb_lookup_vmas(eb); - if (err) - return err; + for (i = 0; i < entry->relocation_count; i++) { + u64 offset = eb_relocate_entry(eb, ev, &relocs[i]); - if (!list_empty(&eb->unbound)) { - err = eb_reserve(eb); + if ((s64)offset < 0) { + err = (int)offset; + goto err; + } + } + err = 0; +err: + reloc_cache_reset(&eb->reloc_cache, eb); + return err; +} + +static int check_relocations(const struct drm_i915_gem_exec_object2 *entry) +{ + const char __user *addr, *end; + unsigned long size; + char __maybe_unused c; + + size = entry->relocation_count; + if (size == 0) + return 0; + + if (size > N_RELOC(ULONG_MAX)) + return -EINVAL; + + addr = u64_to_user_ptr(entry->relocs_ptr); + size *= sizeof(struct drm_i915_gem_relocation_entry); + if (!access_ok(addr, size)) + return -EFAULT; + + end = addr + size; + for (; addr < end; addr += PAGE_SIZE) { + int err = __get_user(c, addr); + if (err) + return err; + } + return __get_user(c, end - 1); +} + +static int eb_copy_relocations(const struct i915_execbuffer *eb) +{ + struct drm_i915_gem_relocation_entry *relocs; + const unsigned int count = eb->buffer_count; + unsigned int i; + int err; + + for (i = 0; i < count; i++) { + const unsigned int nreloc = eb->exec[i].relocation_count; + struct drm_i915_gem_relocation_entry __user *urelocs; + unsigned long size; + unsigned long copied; + + if (nreloc == 0) + continue; + + err = check_relocations(&eb->exec[i]); + if (err) + goto err; + + urelocs = 
u64_to_user_ptr(eb->exec[i].relocs_ptr); + size = nreloc * sizeof(*relocs); + + relocs = kvmalloc_array(size, 1, GFP_KERNEL); + if (!relocs) { + err = -ENOMEM; + goto err; + } + + /* copy_from_user is limited to < 4GiB */ + copied = 0; + do { + unsigned int len = + min_t(u64, BIT_ULL(31), size - copied); + + if (__copy_from_user((char *)relocs + copied, + (char __user *)urelocs + copied, + len)) + goto end; + + copied += len; + } while (copied < size); + + /* + * As we do not update the known relocation offsets after + * relocating (due to the complexities in lock handling), + * we need to mark them as invalid now so that we force the + * relocation processing next time. Just in case the target + * object is evicted and then rebound into its old + * presumed_offset before the next execbuffer - if that + * happened we would make the mistake of assuming that the + * relocations were valid. + */ + if (!user_access_begin(urelocs, size)) + goto end; + + for (copied = 0; copied < nreloc; copied++) + unsafe_put_user(-1, + &urelocs[copied].presumed_offset, + end_user); + user_access_end(); + + eb->exec[i].relocs_ptr = (uintptr_t)relocs; + } + + return 0; + +end_user: + user_access_end(); +end: + kvfree(relocs); + err = -EFAULT; +err: + while (i--) { + relocs = u64_to_ptr(typeof(*relocs), eb->exec[i].relocs_ptr); + if (eb->exec[i].relocation_count) + kvfree(relocs); + } + return err; +} + +static int eb_prefault_relocations(const struct i915_execbuffer *eb) +{ + const unsigned int count = eb->buffer_count; + unsigned int i; + + for (i = 0; i < count; i++) { + int err; + + err = check_relocations(&eb->exec[i]); if (err) return err; } + return 0; +} + +static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb, + struct i915_request *rq) +{ + bool have_copy = false; + struct eb_vma *ev; + int err = 0; + +repeat: + if (signal_pending(current)) { + err = -ERESTARTSYS; + goto out; + } + + /* We may process another execbuffer during the unlock... */ + eb_release_vmas(eb, false); + i915_gem_ww_ctx_fini(&eb->ww); + + if (rq) { + /* nonblocking is always false */ + if (i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, + MAX_SCHEDULE_TIMEOUT) < 0) { + i915_request_put(rq); + rq = NULL; + + err = -EINTR; + goto err_relock; + } + + i915_request_put(rq); + rq = NULL; + } + + /* + * We take 3 passes through the slowpatch. + * + * 1 - we try to just prefault all the user relocation entries and + * then attempt to reuse the atomic pagefault disabled fast path again. + * + * 2 - we copy the user entries to a local buffer here outside of the + * local and allow ourselves to wait upon any rendering before + * relocations + * + * 3 - we already have a local copy of the relocation entries, but + * were interrupted (EAGAIN) whilst waiting for the objects, try again. 
+ */ + if (!err) { + err = eb_prefault_relocations(eb); + } else if (!have_copy) { + err = eb_copy_relocations(eb); + have_copy = err == 0; + } else { + cond_resched(); + err = 0; + } + + if (!err) + flush_workqueue(eb->i915->mm.userptr_wq); + +err_relock: + i915_gem_ww_ctx_init(&eb->ww, true); + if (err) + goto out; + + /* reacquire the objects */ +repeat_validate: + rq = eb_pin_engine(eb, false); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + rq = NULL; + goto err; + } + + /* We didn't throttle, should be NULL */ + GEM_WARN_ON(rq); + + err = eb_validate_vmas(eb); + if (err) + goto err; + + GEM_BUG_ON(!eb->batch); + + list_for_each_entry(ev, &eb->relocs, reloc_link) { + if (!have_copy) { + pagefault_disable(); + err = eb_relocate_vma(eb, ev); + pagefault_enable(); + if (err) + break; + } else { + err = eb_relocate_vma_slow(eb, ev); + if (err) + break; + } + } + + if (err == -EDEADLK) + goto err; + + if (err && !have_copy) + goto repeat; + + if (err) + goto err; + + /* as last step, parse the command buffer */ + err = eb_parse(eb); + if (err) + goto err; + + /* + * Leave the user relocations as are, this is the painfully slow path, + * and we want to avoid the complication of dropping the lock whilst + * having buffers reserved in the aperture and so causing spurious + * ENOSPC for random operations. + */ + +err: + if (err == -EDEADLK) { + eb_release_vmas(eb, false); + err = i915_gem_ww_ctx_backoff(&eb->ww); + if (!err) + goto repeat_validate; + } + + if (err == -EAGAIN) + goto repeat; + +out: + if (have_copy) { + const unsigned int count = eb->buffer_count; + unsigned int i; + + for (i = 0; i < count; i++) { + const struct drm_i915_gem_exec_object2 *entry = + &eb->exec[i]; + struct drm_i915_gem_relocation_entry *relocs; + + if (!entry->relocation_count) + continue; + + relocs = u64_to_ptr(typeof(*relocs), entry->relocs_ptr); + kvfree(relocs); + } + } + + if (rq) + i915_request_put(rq); + + return err; +} + +static int eb_relocate_parse(struct i915_execbuffer *eb) +{ + int err; + struct i915_request *rq = NULL; + bool throttle = true; + +retry: + rq = eb_pin_engine(eb, throttle); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + rq = NULL; + if (err != -EDEADLK) + return err; + + goto err; + } + + if (rq) { + bool nonblock = eb->file->filp->f_flags & O_NONBLOCK; + + /* Need to drop all locks now for throttling, take slowpath */ + err = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, 0); + if (err == -ETIME) { + if (nonblock) { + err = -EWOULDBLOCK; + i915_request_put(rq); + goto err; + } + goto slow; + } + i915_request_put(rq); + rq = NULL; + } + + /* only throttle once, even if we didn't need to throttle */ + throttle = false; + + err = eb_validate_vmas(eb); + if (err == -EAGAIN) + goto slow; + else if (err) + goto err; + /* The objects are in their final locations, apply the relocations. 
*/ if (eb->args->flags & __EXEC_HAS_RELOC) { struct eb_vma *ev; - int flush; list_for_each_entry(ev, &eb->relocs, reloc_link) { err = eb_relocate_vma(eb, ev); @@ -1796,46 +2094,46 @@ static int eb_relocate(struct i915_execbuffer *eb) break; } - flush = reloc_gpu_flush(&eb->reloc_cache); - if (!err) - err = flush; + if (err == -EDEADLK) + goto err; + else if (err) + goto slow; } + if (!err) + err = eb_parse(eb); + +err: + if (err == -EDEADLK) { + eb_release_vmas(eb, false); + err = i915_gem_ww_ctx_backoff(&eb->ww); + if (!err) + goto retry; + } + + return err; + +slow: + err = eb_relocate_parse_slow(eb, rq); + if (err) + /* + * If the user expects the execobject.offset and + * reloc.presumed_offset to be an exact match, + * as for using NO_RELOC, then we cannot update + * the execobject.offset until we have completed + * relocation. + */ + eb->args->flags &= ~__EXEC_HAS_RELOC; + return err; } static int eb_move_to_gpu(struct i915_execbuffer *eb) { const unsigned int count = eb->buffer_count; - struct ww_acquire_ctx acquire; - unsigned int i; + unsigned int i = count; int err = 0; - ww_acquire_init(&acquire, &reservation_ww_class); - - for (i = 0; i < count; i++) { - struct eb_vma *ev = &eb->vma[i]; - struct i915_vma *vma = ev->vma; - - err = ww_mutex_lock_interruptible(&vma->resv->lock, &acquire); - if (err == -EDEADLK) { - GEM_BUG_ON(i == 0); - do { - int j = i - 1; - - ww_mutex_unlock(&eb->vma[j].vma->resv->lock); - - swap(eb->vma[i], eb->vma[j]); - } while (--i); - - err = ww_mutex_lock_slow_interruptible(&vma->resv->lock, - &acquire); - } - if (err) - break; - } - ww_acquire_done(&acquire); - while (i--) { struct eb_vma *ev = &eb->vma[i]; struct i915_vma *vma = ev->vma; @@ -1879,13 +2177,7 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb) if (err == 0) err = i915_vma_move_to_active(vma, eb->request, flags); - - i915_vma_unlock(vma); - eb_unreserve_vma(ev); } - ww_acquire_fini(&acquire); - - eb_vma_array_put(fetch_and_zero(&eb->array)); if (unlikely(err)) goto err_skip; @@ -1950,7 +2242,8 @@ static int i915_reset_gen7_sol_offsets(struct i915_request *rq) } static struct i915_vma * -shadow_batch_pin(struct drm_i915_gem_object *obj, +shadow_batch_pin(struct i915_execbuffer *eb, + struct drm_i915_gem_object *obj, struct i915_address_space *vm, unsigned int flags) { @@ -1961,7 +2254,7 @@ shadow_batch_pin(struct drm_i915_gem_object *obj, if (IS_ERR(vma)) return vma; - err = i915_vma_pin(vma, 0, 0, flags); + err = i915_vma_pin_ww(vma, &eb->ww, 0, 0, flags); if (err) return ERR_PTR(err); @@ -2013,7 +2306,7 @@ __parser_mark_active(struct i915_vma *vma, { struct intel_gt_buffer_pool_node *node = vma->private; - return i915_active_ref(&node->active, tl, fence); + return i915_active_ref(&node->active, tl->fence_context, fence); } static int @@ -2077,36 +2370,26 @@ static int eb_parse_pipeline(struct i915_execbuffer *eb, if (err) goto err_commit; - err = dma_resv_lock_interruptible(pw->batch->resv, NULL); - if (err) - goto err_commit; - err = dma_resv_reserve_shared(pw->batch->resv, 1); if (err) - goto err_commit_unlock; + goto err_commit; /* Wait for all writes (and relocs) into the batch to complete */ err = i915_sw_fence_await_reservation(&pw->base.chain, pw->batch->resv, NULL, false, 0, I915_FENCE_GFP); if (err < 0) - goto err_commit_unlock; + goto err_commit; /* Keep the batch alive and unwritten as we parse */ dma_resv_add_shared_fence(pw->batch->resv, &pw->base.dma); - dma_resv_unlock(pw->batch->resv); - /* Force execution to wait for completion of the parser */ - 
dma_resv_lock(shadow->resv, NULL); dma_resv_add_excl_fence(shadow->resv, &pw->base.dma); - dma_resv_unlock(shadow->resv); dma_fence_work_commit_imm(&pw->base); return 0; -err_commit_unlock: - dma_resv_unlock(pw->batch->resv); err_commit: i915_sw_fence_set_error_once(&pw->base.chain, err); dma_fence_work_commit_imm(&pw->base); @@ -2121,16 +2404,33 @@ static int eb_parse_pipeline(struct i915_execbuffer *eb, return err; } +static struct i915_vma *eb_dispatch_secure(struct i915_execbuffer *eb, struct i915_vma *vma) +{ + /* + * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure + * batch" bit. Hence we need to pin secure batches into the global gtt. + * hsw should have this fixed, but bdw mucks it up again. */ + if (eb->batch_flags & I915_DISPATCH_SECURE) + return i915_gem_object_ggtt_pin_ww(vma->obj, &eb->ww, NULL, 0, 0, 0); + + return NULL; +} + static int eb_parse(struct i915_execbuffer *eb) { struct drm_i915_private *i915 = eb->i915; - struct intel_gt_buffer_pool_node *pool; - struct i915_vma *shadow, *trampoline; + struct intel_gt_buffer_pool_node *pool = eb->batch_pool; + struct i915_vma *shadow, *trampoline, *batch; unsigned int len; int err; - if (!eb_use_cmdparser(eb)) - return 0; + if (!eb_use_cmdparser(eb)) { + batch = eb_dispatch_secure(eb, eb->batch->vma); + if (IS_ERR(batch)) + return PTR_ERR(batch); + + goto secure_batch; + } len = eb->batch_len; if (!CMDPARSER_USES_GGTT(eb->i915)) { @@ -2147,11 +2447,18 @@ static int eb_parse(struct i915_execbuffer *eb) len += I915_CMD_PARSER_TRAMPOLINE_SIZE; } - pool = intel_gt_get_buffer_pool(eb->engine->gt, len); - if (IS_ERR(pool)) - return PTR_ERR(pool); + if (!pool) { + pool = intel_gt_get_buffer_pool(eb->engine->gt, len); + if (IS_ERR(pool)) + return PTR_ERR(pool); + eb->batch_pool = pool; + } - shadow = shadow_batch_pin(pool->obj, eb->context->vm, PIN_USER); + err = i915_gem_object_lock(pool->obj, &eb->ww); + if (err) + goto err; + + shadow = shadow_batch_pin(eb, pool->obj, eb->context->vm, PIN_USER); if (IS_ERR(shadow)) { err = PTR_ERR(shadow); goto err; @@ -2163,7 +2470,7 @@ static int eb_parse(struct i915_execbuffer *eb) if (CMDPARSER_USES_GGTT(eb->i915)) { trampoline = shadow; - shadow = shadow_batch_pin(pool->obj, + shadow = shadow_batch_pin(eb, pool->obj, &eb->engine->gt->ggtt->vm, PIN_GLOBAL); if (IS_ERR(shadow)) { @@ -2176,42 +2483,43 @@ static int eb_parse(struct i915_execbuffer *eb) eb->batch_flags |= I915_DISPATCH_SECURE; } + batch = eb_dispatch_secure(eb, shadow); + if (IS_ERR(batch)) { + err = PTR_ERR(batch); + goto err_trampoline; + } + err = eb_parse_pipeline(eb, shadow, trampoline); if (err) - goto err_trampoline; + goto err_unpin_batch; - eb->vma[eb->buffer_count].vma = i915_vma_get(shadow); - eb->vma[eb->buffer_count].flags = __EXEC_OBJECT_HAS_PIN; eb->batch = &eb->vma[eb->buffer_count++]; - eb->vma[eb->buffer_count].vma = NULL; + eb->batch->vma = i915_vma_get(shadow); + eb->batch->flags = __EXEC_OBJECT_HAS_PIN; eb->trampoline = trampoline; eb->batch_start_offset = 0; +secure_batch: + if (batch) { + eb->batch = &eb->vma[eb->buffer_count++]; + eb->batch->flags = __EXEC_OBJECT_HAS_PIN; + eb->batch->vma = i915_vma_get(batch); + } return 0; +err_unpin_batch: + if (batch) + i915_vma_unpin(batch); err_trampoline: if (trampoline) i915_vma_unpin(trampoline); err_shadow: i915_vma_unpin(shadow); err: - intel_gt_buffer_pool_put(pool); return err; } -static void -add_to_client(struct i915_request *rq, struct drm_file *file) -{ - struct drm_i915_file_private *file_priv = file->driver_priv; - - rq->file_priv = 
file_priv; - - spin_lock(&file_priv->mm.lock); - list_add_tail(&rq->client_link, &file_priv->mm.request_list); - spin_unlock(&file_priv->mm.lock); -} - static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch) { int err; @@ -2293,7 +2601,7 @@ static const enum intel_engine_id user_ring_map[] = { [I915_EXEC_VEBOX] = VECS0 }; -static struct i915_request *eb_throttle(struct intel_context *ce) +static struct i915_request *eb_throttle(struct i915_execbuffer *eb, struct intel_context *ce) { struct intel_ring *ring = ce->ring; struct intel_timeline *tl = ce->timeline; @@ -2327,31 +2635,26 @@ static struct i915_request *eb_throttle(struct intel_context *ce) return i915_request_get(rq); } -static int __eb_pin_engine(struct i915_execbuffer *eb, struct intel_context *ce) +static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, bool throttle) { + struct intel_context *ce = eb->context; struct intel_timeline *tl; - struct i915_request *rq; + struct i915_request *rq = NULL; int err; - /* - * ABI: Before userspace accesses the GPU (e.g. execbuffer), report - * EIO if the GPU is already wedged. - */ - err = intel_gt_terminally_wedged(ce->engine->gt); - if (err) - return err; + GEM_BUG_ON(eb->args->flags & __EXEC_ENGINE_PINNED); if (unlikely(intel_context_is_banned(ce))) - return -EIO; + return ERR_PTR(-EIO); /* * Pinning the contexts may generate requests in order to acquire * GGTT space, so do this first before we reserve a seqno for * ourselves. */ - err = intel_context_pin(ce); + err = intel_context_pin_ww(ce, &eb->ww); if (err) - return err; + return ERR_PTR(err); /* * Take a local wakeref for preparing to dispatch the execbuf as @@ -2363,45 +2666,17 @@ static int __eb_pin_engine(struct i915_execbuffer *eb, struct intel_context *ce) */ tl = intel_context_timeline_lock(ce); if (IS_ERR(tl)) { - err = PTR_ERR(tl); - goto err_unpin; + intel_context_unpin(ce); + return ERR_CAST(tl); } intel_context_enter(ce); - rq = eb_throttle(ce); - + if (throttle) + rq = eb_throttle(eb, ce); intel_context_timeline_unlock(tl); - if (rq) { - bool nonblock = eb->file->filp->f_flags & O_NONBLOCK; - long timeout; - - timeout = MAX_SCHEDULE_TIMEOUT; - if (nonblock) - timeout = 0; - - timeout = i915_request_wait(rq, - I915_WAIT_INTERRUPTIBLE, - timeout); - i915_request_put(rq); - - if (timeout < 0) { - err = nonblock ? 
-EWOULDBLOCK : timeout; - goto err_exit; - } - } - - eb->engine = ce->engine; - eb->context = ce; - return 0; - -err_exit: - mutex_lock(&tl->mutex); - intel_context_exit(ce); - intel_context_timeline_unlock(tl); -err_unpin: - intel_context_unpin(ce); - return err; + eb->args->flags |= __EXEC_ENGINE_PINNED; + return rq; } static void eb_unpin_engine(struct i915_execbuffer *eb) @@ -2409,6 +2684,11 @@ static void eb_unpin_engine(struct i915_execbuffer *eb) struct intel_context *ce = eb->context; struct intel_timeline *tl = ce->timeline; + if (!(eb->args->flags & __EXEC_ENGINE_PINNED)) + return; + + eb->args->flags &= ~__EXEC_ENGINE_PINNED; + mutex_lock(&tl->mutex); intel_context_exit(ce); mutex_unlock(&tl->mutex); @@ -2417,11 +2697,10 @@ static void eb_unpin_engine(struct i915_execbuffer *eb) } static unsigned int -eb_select_legacy_ring(struct i915_execbuffer *eb, - struct drm_file *file, - struct drm_i915_gem_execbuffer2 *args) +eb_select_legacy_ring(struct i915_execbuffer *eb) { struct drm_i915_private *i915 = eb->i915; + struct drm_i915_gem_execbuffer2 *args = eb->args; unsigned int user_ring_id = args->flags & I915_EXEC_RING_MASK; if (user_ring_id != I915_EXEC_BSD && @@ -2436,7 +2715,7 @@ eb_select_legacy_ring(struct i915_execbuffer *eb, unsigned int bsd_idx = args->flags & I915_EXEC_BSD_MASK; if (bsd_idx == I915_EXEC_BSD_DEFAULT) { - bsd_idx = gen8_dispatch_bsd_engine(i915, file); + bsd_idx = gen8_dispatch_bsd_engine(i915, eb->file); } else if (bsd_idx >= I915_EXEC_BSD_RING1 && bsd_idx <= I915_EXEC_BSD_RING2) { bsd_idx >>= I915_EXEC_BSD_SHIFT; @@ -2461,27 +2740,58 @@ eb_select_legacy_ring(struct i915_execbuffer *eb, } static int -eb_pin_engine(struct i915_execbuffer *eb, - struct drm_file *file, - struct drm_i915_gem_execbuffer2 *args) +eb_select_engine(struct i915_execbuffer *eb) { struct intel_context *ce; unsigned int idx; int err; if (i915_gem_context_user_engines(eb->gem_context)) - idx = args->flags & I915_EXEC_RING_MASK; + idx = eb->args->flags & I915_EXEC_RING_MASK; else - idx = eb_select_legacy_ring(eb, file, args); + idx = eb_select_legacy_ring(eb); ce = i915_gem_context_get_engine(eb->gem_context, idx); if (IS_ERR(ce)) return PTR_ERR(ce); - err = __eb_pin_engine(eb, ce); - intel_context_put(ce); + intel_gt_pm_get(ce->engine->gt); + if (!test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) { + err = intel_context_alloc_state(ce); + if (err) + goto err; + } + + /* + * ABI: Before userspace accesses the GPU (e.g. execbuffer), report + * EIO if the GPU is already wedged. + */ + err = intel_gt_terminally_wedged(ce->engine->gt); + if (err) + goto err; + + eb->context = ce; + eb->engine = ce->engine; + + /* + * Make sure engine pool stays alive even if we call intel_context_put + * during ww handling. The pool is destroyed when last pm reference + * is dropped, which breaks our -EDEADLK handling. 
+ */ return err; + +err: + intel_gt_pm_put(ce->engine->gt); + intel_context_put(ce); + return err; +} + +static void +eb_put_engine(struct i915_execbuffer *eb) +{ + intel_gt_pm_put(eb->engine->gt); + intel_context_put(eb->context); } static void @@ -2573,6 +2883,7 @@ add_timeline_fence_array(struct i915_execbuffer *eb, if (err && !(user_fence.flags & I915_EXEC_FENCE_SIGNAL)) { DRM_DEBUG("Syncobj handle missing requested point %llu\n", point); + dma_fence_put(fence); drm_syncobj_put(syncobj); return err; } @@ -2860,6 +3171,10 @@ i915_gem_do_execbuffer(struct drm_device *dev, args->flags |= __EXEC_HAS_RELOC; eb.exec = exec; + eb.vma = (struct eb_vma *)(exec + args->buffer_count + 1); + eb.vma[0].vma = NULL; + eb.reloc_pool = eb.batch_pool = NULL; + eb.reloc_context = NULL; eb.invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS; reloc_cache_init(&eb.reloc_cache, eb.i915); @@ -2928,11 +3243,19 @@ i915_gem_do_execbuffer(struct drm_device *dev, if (unlikely(err)) goto err_destroy; - err = eb_pin_engine(&eb, file, args); + err = eb_select_engine(&eb); if (unlikely(err)) goto err_context; - err = eb_relocate(&eb); + err = eb_lookup_vmas(&eb); + if (err) { + eb_release_vmas(&eb, true); + goto err_engine; + } + + i915_gem_ww_ctx_init(&eb.ww, true); + + err = eb_relocate_parse(&eb); if (err) { /* * If the user expects the execobject.offset and @@ -2945,54 +3268,9 @@ i915_gem_do_execbuffer(struct drm_device *dev, goto err_vma; } - if (unlikely(eb.batch->flags & EXEC_OBJECT_WRITE)) { - drm_dbg(&i915->drm, - "Attempting to use self-modifying batch buffer\n"); - err = -EINVAL; - goto err_vma; - } + ww_acquire_done(&eb.ww.ctx); - if (range_overflows_t(u64, - eb.batch_start_offset, eb.batch_len, - eb.batch->vma->size)) { - drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n"); - err = -EINVAL; - goto err_vma; - } - - if (eb.batch_len == 0) - eb.batch_len = eb.batch->vma->size - eb.batch_start_offset; - - err = eb_parse(&eb); - if (err) - goto err_vma; - - /* - * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure - * batch" bit. Hence we need to pin secure batches into the global gtt. - * hsw should have this fixed, but bdw mucks it up again. */ batch = eb.batch->vma; - if (eb.batch_flags & I915_DISPATCH_SECURE) { - struct i915_vma *vma; - - /* - * So on first glance it looks freaky that we pin the batch here - * outside of the reservation loop. But: - * - The batch is already pinned into the relevant ppgtt, so we - * already have the backing storage fully allocated. - * - No other BO uses the global gtt (well contexts, but meh), - * so we don't really have issues with multiple objects not - * fitting due to fragmentation. - * So this is actually safe. - */ - vma = i915_gem_object_ggtt_pin(batch->obj, NULL, 0, 0, 0); - if (IS_ERR(vma)) { - err = PTR_ERR(vma); - goto err_parse; - } - - batch = vma; - } /* All GPU relocation batches must be submitted prior to the user rq */ GEM_BUG_ON(eb.reloc_cache.rq); @@ -3001,7 +3279,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, eb.request = i915_request_create(eb.context); if (IS_ERR(eb.request)) { err = PTR_ERR(eb.request); - goto err_batch_unpin; + goto err_vma; } if (in_fence) { @@ -3038,13 +3316,12 @@ i915_gem_do_execbuffer(struct drm_device *dev, * to explicitly hold another reference here. 
*/ eb.request->batch = batch; - if (batch->private) - intel_gt_buffer_pool_mark_active(batch->private, eb.request); + if (eb.batch_pool) + intel_gt_buffer_pool_mark_active(eb.batch_pool, eb.request); trace_i915_request_queue(eb.request, eb.batch_flags); err = eb_submit(&eb, batch); err_request: - add_to_client(eb.request, file); i915_request_get(eb.request); eb_request_add(&eb); @@ -3063,16 +3340,21 @@ i915_gem_do_execbuffer(struct drm_device *dev, } i915_request_put(eb.request); -err_batch_unpin: - if (eb.batch_flags & I915_DISPATCH_SECURE) - i915_vma_unpin(batch); -err_parse: - if (batch->private) - intel_gt_buffer_pool_put(batch->private); err_vma: + eb_release_vmas(&eb, true); if (eb.trampoline) i915_vma_unpin(eb.trampoline); - eb_unpin_engine(&eb); + WARN_ON(err == -EDEADLK); + i915_gem_ww_ctx_fini(&eb.ww); + + if (eb.batch_pool) + intel_gt_buffer_pool_put(eb.batch_pool); + if (eb.reloc_pool) + intel_gt_buffer_pool_put(eb.reloc_pool); + if (eb.reloc_context) + intel_context_put(eb.reloc_context); +err_engine: + eb_put_engine(&eb); err_context: i915_gem_context_put(eb.gem_context); err_destroy: @@ -3089,7 +3371,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, static size_t eb_element_size(void) { - return sizeof(struct drm_i915_gem_exec_object2); + return sizeof(struct drm_i915_gem_exec_object2) + sizeof(struct eb_vma); } static bool check_buffer_count(size_t count) @@ -3145,7 +3427,9 @@ i915_gem_execbuffer_ioctl(struct drm_device *dev, void *data, /* Copy in the exec list from userland */ exec_list = kvmalloc_array(count, sizeof(*exec_list), __GFP_NOWARN | GFP_KERNEL); - exec2_list = kvmalloc_array(count, eb_element_size(), + + /* Allocate extra slots for use by the command parser */ + exec2_list = kvmalloc_array(count + 2, eb_element_size(), __GFP_NOWARN | GFP_KERNEL); if (exec_list == NULL || exec2_list == NULL) { drm_dbg(&i915->drm, @@ -3222,7 +3506,8 @@ i915_gem_execbuffer2_ioctl(struct drm_device *dev, void *data, if (err) return err; - exec2_list = kvmalloc_array(count, eb_element_size(), + /* Allocate extra slots for use by the command parser */ + exec2_list = kvmalloc_array(count + 2, eb_element_size(), __GFP_NOWARN | GFP_KERNEL); if (exec2_list == NULL) { drm_dbg(&i915->drm, "Failed to allocate exec list for %zd buffers\n", diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index 753f82d87a31..3d69e51f3e4d 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -283,37 +283,46 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf) struct intel_runtime_pm *rpm = &i915->runtime_pm; struct i915_ggtt *ggtt = &i915->ggtt; bool write = area->vm_flags & VM_WRITE; + struct i915_gem_ww_ctx ww; intel_wakeref_t wakeref; struct i915_vma *vma; pgoff_t page_offset; int srcu; int ret; - /* Sanity check that we allow writing into this object */ - if (i915_gem_object_is_readonly(obj) && write) - return VM_FAULT_SIGBUS; - /* We don't use vmf->pgoff since that has the fake offset */ page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT; trace_i915_gem_object_fault(obj, page_offset, true, write); - ret = i915_gem_object_pin_pages(obj); - if (ret) - goto err; - wakeref = intel_runtime_pm_get(rpm); - ret = intel_gt_reset_trylock(ggtt->vm.gt, &srcu); + i915_gem_ww_ctx_init(&ww, true); +retry: + ret = i915_gem_object_lock(obj, &ww); if (ret) goto err_rpm; + /* Sanity check that we allow writing into this object */ + if (i915_gem_object_is_readonly(obj) && write) { + ret = -EFAULT; + goto err_rpm; 
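/*
 * For reference: the conversion here (and in most callers touched by this
 * patch) follows one common ww locking pattern, sketched below.
 * i915_gem_ww_ctx_init()'s second argument selects interruptible waits, and
 * -EDEADLK from any lock or pin inside the section triggers a backoff and
 * retry. do_locked_work() is an illustrative placeholder only, not a helper
 * added by this series.
 *
 *      struct i915_gem_ww_ctx ww;
 *      int ret;
 *
 *      i915_gem_ww_ctx_init(&ww, true);
 * retry:
 *      ret = i915_gem_object_lock(obj, &ww);
 *      if (!ret)
 *              ret = do_locked_work(obj);
 *      if (ret == -EDEADLK) {
 *              ret = i915_gem_ww_ctx_backoff(&ww);
 *              if (!ret)
 *                      goto retry;
 *      }
 *      i915_gem_ww_ctx_fini(&ww);
 */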
+ } + + ret = i915_gem_object_pin_pages(obj); + if (ret) + goto err_rpm; + + ret = intel_gt_reset_trylock(ggtt->vm.gt, &srcu); + if (ret) + goto err_pages; + /* Now pin it into the GTT as needed */ - vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, - PIN_MAPPABLE | - PIN_NONBLOCK /* NOWARN */ | - PIN_NOEVICT); - if (IS_ERR(vma)) { + vma = i915_gem_object_ggtt_pin_ww(obj, &ww, NULL, 0, 0, + PIN_MAPPABLE | + PIN_NONBLOCK /* NOWARN */ | + PIN_NOEVICT); + if (IS_ERR(vma) && vma != ERR_PTR(-EDEADLK)) { /* Use a partial view if it is bigger than available space */ struct i915_ggtt_view view = compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES); @@ -328,11 +337,11 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf) * all hope that the hardware is able to track future writes. */ - vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags); - if (IS_ERR(vma)) { + vma = i915_gem_object_ggtt_pin_ww(obj, &ww, &view, 0, 0, flags); + if (IS_ERR(vma) && vma != ERR_PTR(-EDEADLK)) { flags = PIN_MAPPABLE; view.type = I915_GGTT_VIEW_PARTIAL; - vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags); + vma = i915_gem_object_ggtt_pin_ww(obj, &ww, &view, 0, 0, flags); } /* The entire mappable GGTT is pinned? Unexpected! */ @@ -389,10 +398,16 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf) __i915_vma_unpin(vma); err_reset: intel_gt_reset_unlock(ggtt->vm.gt, srcu); -err_rpm: - intel_runtime_pm_put(rpm, wakeref); +err_pages: i915_gem_object_unpin_pages(obj); -err: +err_rpm: + if (ret == -EDEADLK) { + ret = i915_gem_ww_ctx_backoff(&ww); + if (!ret) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); + intel_runtime_pm_put(rpm, wakeref); return i915_error_to_vmf_fault(ret); } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h b/drivers/gpu/drm/i915/gem/i915_gem_object.h index 9cf4ad78ece6..d46db8d8f38e 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h @@ -110,9 +110,39 @@ i915_gem_object_put(struct drm_i915_gem_object *obj) #define assert_object_held(obj) dma_resv_assert_held((obj)->base.resv) -static inline void i915_gem_object_lock(struct drm_i915_gem_object *obj) +static inline int __i915_gem_object_lock(struct drm_i915_gem_object *obj, + struct i915_gem_ww_ctx *ww, + bool intr) { - dma_resv_lock(obj->base.resv, NULL); + int ret; + + if (intr) + ret = dma_resv_lock_interruptible(obj->base.resv, ww ? &ww->ctx : NULL); + else + ret = dma_resv_lock(obj->base.resv, ww ? 
&ww->ctx : NULL); + + if (!ret && ww) + list_add_tail(&obj->obj_link, &ww->obj_list); + if (ret == -EALREADY) + ret = 0; + + if (ret == -EDEADLK) + ww->contended = obj; + + return ret; +} + +static inline int i915_gem_object_lock(struct drm_i915_gem_object *obj, + struct i915_gem_ww_ctx *ww) +{ + return __i915_gem_object_lock(obj, ww, ww && ww->intr); +} + +static inline int i915_gem_object_lock_interruptible(struct drm_i915_gem_object *obj, + struct i915_gem_ww_ctx *ww) +{ + WARN_ON(ww && !ww->intr); + return __i915_gem_object_lock(obj, ww, true); } static inline bool i915_gem_object_trylock(struct drm_i915_gem_object *obj) @@ -120,12 +150,6 @@ static inline bool i915_gem_object_trylock(struct drm_i915_gem_object *obj) return dma_resv_trylock(obj->base.resv); } -static inline int -i915_gem_object_lock_interruptible(struct drm_i915_gem_object *obj) -{ - return dma_resv_lock_interruptible(obj->base.resv, NULL); -} - static inline void i915_gem_object_unlock(struct drm_i915_gem_object *obj) { dma_resv_unlock(obj->base.resv); @@ -412,7 +436,6 @@ static inline void i915_gem_object_finish_access(struct drm_i915_gem_object *obj) { i915_gem_object_unpin_pages(obj); - i915_gem_object_unlock(obj); } static inline struct intel_engine_cs * @@ -435,6 +458,7 @@ i915_gem_object_last_write_engine(struct drm_i915_gem_object *obj) void i915_gem_object_set_cache_coherency(struct drm_i915_gem_object *obj, unsigned int cache_level); void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj); +void i915_gem_object_flush_if_display_locked(struct drm_i915_gem_object *obj); int __must_check i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c index bfdb32d46877..d93eb36160c9 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c @@ -14,6 +14,7 @@ struct i915_vma *intel_emit_vma_fill_blt(struct intel_context *ce, struct i915_vma *vma, + struct i915_gem_ww_ctx *ww, u32 value) { struct drm_i915_private *i915 = ce->vm->i915; @@ -39,10 +40,24 @@ struct i915_vma *intel_emit_vma_fill_blt(struct intel_context *ce, goto out_pm; } + err = i915_gem_object_lock(pool->obj, ww); + if (err) + goto out_put; + + batch = i915_vma_instance(pool->obj, ce->vm, NULL); + if (IS_ERR(batch)) { + err = PTR_ERR(batch); + goto out_put; + } + + err = i915_vma_pin_ww(batch, ww, 0, 0, PIN_USER); + if (unlikely(err)) + goto out_put; + cmd = i915_gem_object_pin_map(pool->obj, I915_MAP_WC); if (IS_ERR(cmd)) { err = PTR_ERR(cmd); - goto out_put; + goto out_unpin; } rem = vma->size; @@ -84,19 +99,11 @@ struct i915_vma *intel_emit_vma_fill_blt(struct intel_context *ce, intel_gt_chipset_flush(ce->vm->gt); - batch = i915_vma_instance(pool->obj, ce->vm, NULL); - if (IS_ERR(batch)) { - err = PTR_ERR(batch); - goto out_put; - } - - err = i915_vma_pin(batch, 0, 0, PIN_USER); - if (unlikely(err)) - goto out_put; - batch->private = pool; return batch; +out_unpin: + i915_vma_unpin(batch); out_put: intel_gt_buffer_pool_put(pool); out_pm: @@ -108,11 +115,9 @@ int intel_emit_vma_mark_active(struct i915_vma *vma, struct i915_request *rq) { int err; - i915_vma_lock(vma); err = i915_request_await_object(rq, vma->obj, false); if (err == 0) err = i915_vma_move_to_active(vma, rq, 0); - i915_vma_unlock(vma); if (unlikely(err)) return err; @@ -141,6 +146,7 @@ int i915_gem_object_fill_blt(struct drm_i915_gem_object *obj, struct intel_context *ce, u32 value) { + struct 
i915_gem_ww_ctx ww; struct i915_request *rq; struct i915_vma *batch; struct i915_vma *vma; @@ -150,17 +156,28 @@ int i915_gem_object_fill_blt(struct drm_i915_gem_object *obj, if (IS_ERR(vma)) return PTR_ERR(vma); - err = i915_vma_pin(vma, 0, 0, PIN_USER); - if (unlikely(err)) - return err; + i915_gem_ww_ctx_init(&ww, true); + intel_engine_pm_get(ce->engine); +retry: + err = i915_gem_object_lock(obj, &ww); + if (err) + goto out; - batch = intel_emit_vma_fill_blt(ce, vma, value); + err = intel_context_pin_ww(ce, &ww); + if (err) + goto out; + + err = i915_vma_pin_ww(vma, &ww, 0, 0, PIN_USER); + if (err) + goto out_ctx; + + batch = intel_emit_vma_fill_blt(ce, vma, &ww, value); if (IS_ERR(batch)) { err = PTR_ERR(batch); - goto out_unpin; + goto out_vma; } - rq = intel_context_create_request(ce); + rq = i915_request_create(ce); if (IS_ERR(rq)) { err = PTR_ERR(rq); goto out_batch; @@ -170,11 +187,9 @@ int i915_gem_object_fill_blt(struct drm_i915_gem_object *obj, if (unlikely(err)) goto out_request; - i915_vma_lock(vma); err = move_obj_to_gpu(vma->obj, rq, true); if (err == 0) err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); - i915_vma_unlock(vma); if (unlikely(err)) goto out_request; @@ -193,8 +208,18 @@ int i915_gem_object_fill_blt(struct drm_i915_gem_object *obj, i915_request_add(rq); out_batch: intel_emit_vma_release(ce, batch); -out_unpin: +out_vma: i915_vma_unpin(vma); +out_ctx: + intel_context_unpin(ce); +out: + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); + intel_engine_pm_put(ce->engine); return err; } @@ -210,6 +235,7 @@ static bool wa_1209644611_applies(struct drm_i915_private *i915, u32 size) } struct i915_vma *intel_emit_vma_copy_blt(struct intel_context *ce, + struct i915_gem_ww_ctx *ww, struct i915_vma *src, struct i915_vma *dst) { @@ -236,10 +262,24 @@ struct i915_vma *intel_emit_vma_copy_blt(struct intel_context *ce, goto out_pm; } + err = i915_gem_object_lock(pool->obj, ww); + if (err) + goto out_put; + + batch = i915_vma_instance(pool->obj, ce->vm, NULL); + if (IS_ERR(batch)) { + err = PTR_ERR(batch); + goto out_put; + } + + err = i915_vma_pin_ww(batch, ww, 0, 0, PIN_USER); + if (unlikely(err)) + goto out_put; + cmd = i915_gem_object_pin_map(pool->obj, I915_MAP_WC); if (IS_ERR(cmd)) { err = PTR_ERR(cmd); - goto out_put; + goto out_unpin; } rem = src->size; @@ -296,20 +336,11 @@ struct i915_vma *intel_emit_vma_copy_blt(struct intel_context *ce, i915_gem_object_unpin_map(pool->obj); intel_gt_chipset_flush(ce->vm->gt); - - batch = i915_vma_instance(pool->obj, ce->vm, NULL); - if (IS_ERR(batch)) { - err = PTR_ERR(batch); - goto out_put; - } - - err = i915_vma_pin(batch, 0, 0, PIN_USER); - if (unlikely(err)) - goto out_put; - batch->private = pool; return batch; +out_unpin: + i915_vma_unpin(batch); out_put: intel_gt_buffer_pool_put(pool); out_pm: @@ -321,10 +352,9 @@ int i915_gem_object_copy_blt(struct drm_i915_gem_object *src, struct drm_i915_gem_object *dst, struct intel_context *ce) { - struct drm_gem_object *objs[] = { &src->base, &dst->base }; struct i915_address_space *vm = ce->vm; struct i915_vma *vma[2], *batch; - struct ww_acquire_ctx acquire; + struct i915_gem_ww_ctx ww; struct i915_request *rq; int err, i; @@ -332,25 +362,36 @@ int i915_gem_object_copy_blt(struct drm_i915_gem_object *src, if (IS_ERR(vma[0])) return PTR_ERR(vma[0]); - err = i915_vma_pin(vma[0], 0, 0, PIN_USER); - if (unlikely(err)) - return err; - vma[1] = i915_vma_instance(dst, vm, NULL); if (IS_ERR(vma[1])) - goto 
out_unpin_src; + return PTR_ERR(vma); - err = i915_vma_pin(vma[1], 0, 0, PIN_USER); + i915_gem_ww_ctx_init(&ww, true); + intel_engine_pm_get(ce->engine); +retry: + err = i915_gem_object_lock(src, &ww); + if (!err) + err = i915_gem_object_lock(dst, &ww); + if (!err) + err = intel_context_pin_ww(ce, &ww); + if (err) + goto out; + + err = i915_vma_pin_ww(vma[0], &ww, 0, 0, PIN_USER); + if (err) + goto out_ctx; + + err = i915_vma_pin_ww(vma[1], &ww, 0, 0, PIN_USER); if (unlikely(err)) goto out_unpin_src; - batch = intel_emit_vma_copy_blt(ce, vma[0], vma[1]); + batch = intel_emit_vma_copy_blt(ce, &ww, vma[0], vma[1]); if (IS_ERR(batch)) { err = PTR_ERR(batch); goto out_unpin_dst; } - rq = intel_context_create_request(ce); + rq = i915_request_create(ce); if (IS_ERR(rq)) { err = PTR_ERR(rq); goto out_batch; @@ -360,14 +401,10 @@ int i915_gem_object_copy_blt(struct drm_i915_gem_object *src, if (unlikely(err)) goto out_request; - err = drm_gem_lock_reservations(objs, ARRAY_SIZE(objs), &acquire); - if (unlikely(err)) - goto out_request; - for (i = 0; i < ARRAY_SIZE(vma); i++) { err = move_obj_to_gpu(vma[i]->obj, rq, i); if (unlikely(err)) - goto out_unlock; + goto out_request; } for (i = 0; i < ARRAY_SIZE(vma); i++) { @@ -375,20 +412,19 @@ int i915_gem_object_copy_blt(struct drm_i915_gem_object *src, err = i915_vma_move_to_active(vma[i], rq, flags); if (unlikely(err)) - goto out_unlock; + goto out_request; } if (rq->engine->emit_init_breadcrumb) { err = rq->engine->emit_init_breadcrumb(rq); if (unlikely(err)) - goto out_unlock; + goto out_request; } err = rq->engine->emit_bb_start(rq, batch->node.start, batch->node.size, 0); -out_unlock: - drm_gem_unlock_reservations(objs, ARRAY_SIZE(objs), &acquire); + out_request: if (unlikely(err)) i915_request_set_error_once(rq, err); @@ -400,6 +436,16 @@ int i915_gem_object_copy_blt(struct drm_i915_gem_object *src, i915_vma_unpin(vma[1]); out_unpin_src: i915_vma_unpin(vma[0]); +out_ctx: + intel_context_unpin(ce); +out: + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); + intel_engine_pm_put(ce->engine); return err; } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.h b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.h index 8bcd336a90dc..2409fdcccf0e 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.h @@ -13,12 +13,15 @@ #include "i915_vma.h" struct drm_i915_gem_object; +struct i915_gem_ww_ctx; struct i915_vma *intel_emit_vma_fill_blt(struct intel_context *ce, struct i915_vma *vma, + struct i915_gem_ww_ctx *ww, u32 value); struct i915_vma *intel_emit_vma_copy_blt(struct intel_context *ce, + struct i915_gem_ww_ctx *ww, struct i915_vma *src, struct i915_vma *dst); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index 5335f799b548..b5c15557cc87 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -123,6 +123,15 @@ struct drm_i915_gem_object { struct list_head lut_list; spinlock_t lut_lock; /* guards lut_list */ + /** + * @obj_link: Link into @i915_gem_ww_ctx.obj_list + * + * When we lock this object through i915_gem_object_lock() with a + * context, we add it to the list to ensure we can unlock everything + * when i915_gem_ww_ctx_backoff() or i915_gem_ww_ctx_fini() are called. + */ + struct list_head obj_link; + /** Stolen memory for this object, instead of being backed by shmem. 
*/ struct drm_mm_node *stolen; union { @@ -282,6 +291,7 @@ struct drm_i915_gem_object { } userptr; unsigned long scratch; + u64 encode; void *gvt_info; }; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pm.c b/drivers/gpu/drm/i915/gem/i915_gem_pm.c index 3d215164dd5a..40d3e40500fa 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_pm.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_pm.c @@ -84,7 +84,7 @@ void i915_gem_suspend_late(struct drm_i915_private *i915) spin_unlock_irqrestore(&i915->mm.obj_lock, flags); - i915_gem_object_lock(obj); + i915_gem_object_lock(obj, NULL); drm_WARN_ON(&i915->drm, i915_gem_object_set_to_gtt_domain(obj, false)); i915_gem_object_unlock(obj); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_throttle.c b/drivers/gpu/drm/i915/gem/i915_gem_throttle.c index 540ef0551789..1929d6cf4150 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_throttle.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_throttle.c @@ -9,6 +9,7 @@ #include #include "i915_drv.h" +#include "i915_gem_context.h" #include "i915_gem_ioctls.h" #include "i915_gem_object.h" @@ -35,9 +36,10 @@ int i915_gem_throttle_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { + const unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES; struct drm_i915_file_private *file_priv = file->driver_priv; - unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES; - struct i915_request *request, *target = NULL; + struct i915_gem_context *ctx; + unsigned long idx; long ret; /* ABI: return -EIO if already wedged */ @@ -45,27 +47,54 @@ i915_gem_throttle_ioctl(struct drm_device *dev, void *data, if (ret) return ret; - spin_lock(&file_priv->mm.lock); - list_for_each_entry(request, &file_priv->mm.request_list, client_link) { - if (time_after_eq(request->emitted_jiffies, recent_enough)) - break; + rcu_read_lock(); + xa_for_each(&file_priv->context_xa, idx, ctx) { + struct i915_gem_engines_iter it; + struct intel_context *ce; - if (target && xchg(&target->file_priv, NULL)) - list_del(&target->client_link); + if (!kref_get_unless_zero(&ctx->ref)) + continue; + rcu_read_unlock(); - target = request; + for_each_gem_engine(ce, + i915_gem_context_lock_engines(ctx), + it) { + struct i915_request *rq, *target = NULL; + + if (!ce->timeline) + continue; + + mutex_lock(&ce->timeline->mutex); + list_for_each_entry_reverse(rq, + &ce->timeline->requests, + link) { + if (i915_request_completed(rq)) + break; + + if (time_after(rq->emitted_jiffies, + recent_enough)) + continue; + + target = i915_request_get(rq); + break; + } + mutex_unlock(&ce->timeline->mutex); + if (!target) + continue; + + ret = i915_request_wait(target, + I915_WAIT_INTERRUPTIBLE, + MAX_SCHEDULE_TIMEOUT); + i915_request_put(target); + if (ret < 0) + break; + } + i915_gem_context_unlock_engines(ctx); + i915_gem_context_put(ctx); + + rcu_read_lock(); } - if (target) - i915_request_get(target); - spin_unlock(&file_priv->mm.lock); - - if (!target) - return 0; - - ret = i915_request_wait(target, - I915_WAIT_INTERRUPTIBLE, - MAX_SCHEDULE_TIMEOUT); - i915_request_put(target); + rcu_read_unlock(); return ret < 0 ? ret : 0; } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_tiling.c b/drivers/gpu/drm/i915/gem/i915_gem_tiling.c index ff72ee2fd9cd..ffcaee74a249 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_tiling.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_tiling.c @@ -249,7 +249,7 @@ i915_gem_object_set_tiling(struct drm_i915_gem_object *obj, * whilst executing a fenced command for an untiled object. 
*/ - i915_gem_object_lock(obj); + i915_gem_object_lock(obj, NULL); if (i915_gem_object_is_framebuffer(obj)) { i915_gem_object_unlock(obj); return -EBUSY; diff --git a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c index 8291ede6902c..5daf4a2be422 100644 --- a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c +++ b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c @@ -393,7 +393,7 @@ static int igt_mock_exhaust_device_supported_pages(void *arg) */ for (i = 1; i < BIT(ARRAY_SIZE(page_sizes)); i++) { - unsigned int combination = 0; + unsigned int combination = SZ_4K; /* Required for ppGTT */ for (j = 0; j < ARRAY_SIZE(page_sizes); j++) { if (i & BIT(j)) @@ -947,7 +947,7 @@ static int gpu_write(struct intel_context *ce, { int err; - i915_gem_object_lock(vma->obj); + i915_gem_object_lock(vma->obj, NULL); err = i915_gem_object_set_to_gtt_domain(vma->obj, true); i915_gem_object_unlock(vma->obj); if (err) @@ -964,9 +964,10 @@ __cpu_check_shmem(struct drm_i915_gem_object *obj, u32 dword, u32 val) unsigned long n; int err; + i915_gem_object_lock(obj, NULL); err = i915_gem_object_prepare_read(obj, &needs_flush); if (err) - return err; + goto err_unlock; for (n = 0; n < obj->base.size >> PAGE_SHIFT; ++n) { u32 *ptr = kmap_atomic(i915_gem_object_get_page(obj, n)); @@ -986,6 +987,8 @@ __cpu_check_shmem(struct drm_i915_gem_object *obj, u32 dword, u32 val) } i915_gem_object_finish_access(obj); +err_unlock: + i915_gem_object_unlock(obj); return err; } diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c index 299c29e9ad86..4e36d4897ea6 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c @@ -75,7 +75,7 @@ static int __igt_client_fill(struct intel_engine_cs *engine) if (err) goto err_unpin; - i915_gem_object_lock(obj); + i915_gem_object_lock(obj, NULL); err = i915_gem_object_set_to_cpu_domain(obj, false); i915_gem_object_unlock(obj); if (err) diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_coherency.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_coherency.c index 87d7d8aa080f..7049a6bbc03d 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_coherency.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_coherency.c @@ -27,9 +27,10 @@ static int cpu_set(struct context *ctx, unsigned long offset, u32 v) u32 *cpu; int err; + i915_gem_object_lock(ctx->obj, NULL); err = i915_gem_object_prepare_write(ctx->obj, &needs_clflush); if (err) - return err; + goto out; page = i915_gem_object_get_page(ctx->obj, offset >> PAGE_SHIFT); map = kmap_atomic(page); @@ -46,7 +47,9 @@ static int cpu_set(struct context *ctx, unsigned long offset, u32 v) kunmap_atomic(map); i915_gem_object_finish_access(ctx->obj); - return 0; +out: + i915_gem_object_unlock(ctx->obj); + return err; } static int cpu_get(struct context *ctx, unsigned long offset, u32 *v) @@ -57,9 +60,10 @@ static int cpu_get(struct context *ctx, unsigned long offset, u32 *v) u32 *cpu; int err; + i915_gem_object_lock(ctx->obj, NULL); err = i915_gem_object_prepare_read(ctx->obj, &needs_clflush); if (err) - return err; + goto out; page = i915_gem_object_get_page(ctx->obj, offset >> PAGE_SHIFT); map = kmap_atomic(page); @@ -73,7 +77,9 @@ static int cpu_get(struct context *ctx, unsigned long offset, u32 *v) kunmap_atomic(map); i915_gem_object_finish_access(ctx->obj); - return 0; +out: + i915_gem_object_unlock(ctx->obj); + return err; } static int gtt_set(struct 
context *ctx, unsigned long offset, u32 v) @@ -82,7 +88,7 @@ static int gtt_set(struct context *ctx, unsigned long offset, u32 v) u32 __iomem *map; int err = 0; - i915_gem_object_lock(ctx->obj); + i915_gem_object_lock(ctx->obj, NULL); err = i915_gem_object_set_to_gtt_domain(ctx->obj, true); i915_gem_object_unlock(ctx->obj); if (err) @@ -115,7 +121,7 @@ static int gtt_get(struct context *ctx, unsigned long offset, u32 *v) u32 __iomem *map; int err = 0; - i915_gem_object_lock(ctx->obj); + i915_gem_object_lock(ctx->obj, NULL); err = i915_gem_object_set_to_gtt_domain(ctx->obj, false); i915_gem_object_unlock(ctx->obj); if (err) @@ -147,7 +153,7 @@ static int wc_set(struct context *ctx, unsigned long offset, u32 v) u32 *map; int err; - i915_gem_object_lock(ctx->obj); + i915_gem_object_lock(ctx->obj, NULL); err = i915_gem_object_set_to_wc_domain(ctx->obj, true); i915_gem_object_unlock(ctx->obj); if (err) @@ -170,7 +176,7 @@ static int wc_get(struct context *ctx, unsigned long offset, u32 *v) u32 *map; int err; - i915_gem_object_lock(ctx->obj); + i915_gem_object_lock(ctx->obj, NULL); err = i915_gem_object_set_to_wc_domain(ctx->obj, false); i915_gem_object_unlock(ctx->obj); if (err) @@ -193,27 +199,27 @@ static int gpu_set(struct context *ctx, unsigned long offset, u32 v) u32 *cs; int err; - i915_gem_object_lock(ctx->obj); + i915_gem_object_lock(ctx->obj, NULL); err = i915_gem_object_set_to_gtt_domain(ctx->obj, true); - i915_gem_object_unlock(ctx->obj); if (err) - return err; + goto out_unlock; vma = i915_gem_object_ggtt_pin(ctx->obj, NULL, 0, 0, 0); - if (IS_ERR(vma)) - return PTR_ERR(vma); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto out_unlock; + } rq = intel_engine_create_kernel_request(ctx->engine); if (IS_ERR(rq)) { - i915_vma_unpin(vma); - return PTR_ERR(rq); + err = PTR_ERR(rq); + goto out_unpin; } cs = intel_ring_begin(rq, 4); if (IS_ERR(cs)) { - i915_request_add(rq); - i915_vma_unpin(vma); - return PTR_ERR(cs); + err = PTR_ERR(cs); + goto out_rq; } if (INTEL_GEN(ctx->engine->i915) >= 8) { @@ -234,14 +240,16 @@ static int gpu_set(struct context *ctx, unsigned long offset, u32 v) } intel_ring_advance(rq, cs); - i915_vma_lock(vma); err = i915_request_await_object(rq, vma->obj, true); if (err == 0) err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); - i915_vma_unlock(vma); - i915_vma_unpin(vma); +out_rq: i915_request_add(rq); +out_unpin: + i915_vma_unpin(vma); +out_unlock: + i915_gem_object_unlock(ctx->obj); return err; } diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c index 7ffc3c751432..99becb86abd3 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c @@ -461,9 +461,10 @@ static int cpu_fill(struct drm_i915_gem_object *obj, u32 value) unsigned int n, m, need_flush; int err; + i915_gem_object_lock(obj, NULL); err = i915_gem_object_prepare_write(obj, &need_flush); if (err) - return err; + goto out; for (n = 0; n < real_page_count(obj); n++) { u32 *map; @@ -479,7 +480,9 @@ static int cpu_fill(struct drm_i915_gem_object *obj, u32 value) i915_gem_object_finish_access(obj); obj->read_domains = I915_GEM_DOMAIN_GTT | I915_GEM_DOMAIN_CPU; obj->write_domain = 0; - return 0; +out: + i915_gem_object_unlock(obj); + return err; } static noinline int cpu_check(struct drm_i915_gem_object *obj, @@ -488,9 +491,10 @@ static noinline int cpu_check(struct drm_i915_gem_object *obj, unsigned int n, m, needs_flush; int err; + i915_gem_object_lock(obj, 
NULL); err = i915_gem_object_prepare_read(obj, &needs_flush); if (err) - return err; + goto out_unlock; for (n = 0; n < real_page_count(obj); n++) { u32 *map; @@ -527,6 +531,8 @@ static noinline int cpu_check(struct drm_i915_gem_object *obj, } i915_gem_object_finish_access(obj); +out_unlock: + i915_gem_object_unlock(obj); return err; } @@ -887,24 +893,15 @@ static int igt_shared_ctx_exec(void *arg) return err; } -static struct i915_vma *rpcs_query_batch(struct i915_vma *vma) +static int rpcs_query_batch(struct drm_i915_gem_object *rpcs, struct i915_vma *vma) { - struct drm_i915_gem_object *obj; u32 *cmd; - int err; - if (INTEL_GEN(vma->vm->i915) < 8) - return ERR_PTR(-EINVAL); + GEM_BUG_ON(INTEL_GEN(vma->vm->i915) < 8); - obj = i915_gem_object_create_internal(vma->vm->i915, PAGE_SIZE); - if (IS_ERR(obj)) - return ERR_CAST(obj); - - cmd = i915_gem_object_pin_map(obj, I915_MAP_WB); - if (IS_ERR(cmd)) { - err = PTR_ERR(cmd); - goto err; - } + cmd = i915_gem_object_pin_map(rpcs, I915_MAP_WB); + if (IS_ERR(cmd)) + return PTR_ERR(cmd); *cmd++ = MI_STORE_REGISTER_MEM_GEN8; *cmd++ = i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE); @@ -912,26 +909,12 @@ static struct i915_vma *rpcs_query_batch(struct i915_vma *vma) *cmd++ = upper_32_bits(vma->node.start); *cmd = MI_BATCH_BUFFER_END; - __i915_gem_object_flush_map(obj, 0, 64); - i915_gem_object_unpin_map(obj); + __i915_gem_object_flush_map(rpcs, 0, 64); + i915_gem_object_unpin_map(rpcs); intel_gt_chipset_flush(vma->vm->gt); - vma = i915_vma_instance(obj, vma->vm, NULL); - if (IS_ERR(vma)) { - err = PTR_ERR(vma); - goto err; - } - - err = i915_vma_pin(vma, 0, 0, PIN_USER); - if (err) - goto err; - - return vma; - -err: - i915_gem_object_put(obj); - return ERR_PTR(err); + return 0; } static int @@ -939,52 +922,68 @@ emit_rpcs_query(struct drm_i915_gem_object *obj, struct intel_context *ce, struct i915_request **rq_out) { + struct drm_i915_private *i915 = to_i915(obj->base.dev); struct i915_request *rq; + struct i915_gem_ww_ctx ww; struct i915_vma *batch; struct i915_vma *vma; + struct drm_i915_gem_object *rpcs; int err; GEM_BUG_ON(!intel_engine_can_store_dword(ce->engine)); + if (INTEL_GEN(i915) < 8) + return -EINVAL; + vma = i915_vma_instance(obj, ce->vm, NULL); if (IS_ERR(vma)) return PTR_ERR(vma); - i915_gem_object_lock(obj); - err = i915_gem_object_set_to_gtt_domain(obj, false); - i915_gem_object_unlock(obj); - if (err) - return err; + rpcs = i915_gem_object_create_internal(i915, PAGE_SIZE); + if (IS_ERR(rpcs)) + return PTR_ERR(rpcs); - err = i915_vma_pin(vma, 0, 0, PIN_USER); - if (err) - return err; - - batch = rpcs_query_batch(vma); + batch = i915_vma_instance(rpcs, ce->vm, NULL); if (IS_ERR(batch)) { err = PTR_ERR(batch); - goto err_vma; + goto err_put; } + i915_gem_ww_ctx_init(&ww, false); +retry: + err = i915_gem_object_lock(obj, &ww); + if (!err) + err = i915_gem_object_lock(rpcs, &ww); + if (!err) + err = i915_gem_object_set_to_gtt_domain(obj, false); + if (!err) + err = i915_vma_pin_ww(vma, &ww, 0, 0, PIN_USER); + if (err) + goto err_put; + + err = i915_vma_pin_ww(batch, &ww, 0, 0, PIN_USER); + if (err) + goto err_vma; + + err = rpcs_query_batch(rpcs, vma); + if (err) + goto err_batch; + rq = i915_request_create(ce); if (IS_ERR(rq)) { err = PTR_ERR(rq); goto err_batch; } - i915_vma_lock(batch); err = i915_request_await_object(rq, batch->obj, false); if (err == 0) err = i915_vma_move_to_active(batch, rq, 0); - i915_vma_unlock(batch); if (err) goto skip_request; - i915_vma_lock(vma); err = i915_request_await_object(rq, vma->obj, true); if (err == 
0) err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); - i915_vma_unlock(vma); if (err) goto skip_request; @@ -1000,23 +999,24 @@ emit_rpcs_query(struct drm_i915_gem_object *obj, if (err) goto skip_request; - i915_vma_unpin_and_release(&batch, 0); - i915_vma_unpin(vma); - *rq_out = i915_request_get(rq); - i915_request_add(rq); - - return 0; - skip_request: - i915_request_set_error_once(rq, err); + if (err) + i915_request_set_error_once(rq, err); i915_request_add(rq); err_batch: - i915_vma_unpin_and_release(&batch, 0); + i915_vma_unpin(batch); err_vma: i915_vma_unpin(vma); - +err_put: + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); + i915_gem_object_put(rpcs); return err; } @@ -1709,7 +1709,7 @@ static int read_from_scratch(struct i915_gem_context *ctx, i915_request_add(rq); - i915_gem_object_lock(obj); + i915_gem_object_lock(obj, NULL); err = i915_gem_object_set_to_cpu_domain(obj, false); i915_gem_object_unlock(obj); if (err) @@ -1748,7 +1748,7 @@ static int check_scratch_page(struct i915_gem_context *ctx, u32 *out) if (!vm) return -ENODEV; - page = vm->scratch[0].base.page; + page = __px_page(vm->scratch[0]); if (!page) { pr_err("No scratch page!\n"); return -EINVAL; @@ -1914,8 +1914,8 @@ static int mock_context_barrier(void *arg) return -ENOMEM; counter = 0; - err = context_barrier_task(ctx, 0, - NULL, NULL, mock_barrier_task, &counter); + err = context_barrier_task(ctx, 0, NULL, NULL, NULL, + mock_barrier_task, &counter); if (err) { pr_err("Failed at line %d, err=%d\n", __LINE__, err); goto out; @@ -1927,11 +1927,8 @@ static int mock_context_barrier(void *arg) } counter = 0; - err = context_barrier_task(ctx, ALL_ENGINES, - skip_unused_engines, - NULL, - mock_barrier_task, - &counter); + err = context_barrier_task(ctx, ALL_ENGINES, skip_unused_engines, + NULL, NULL, mock_barrier_task, &counter); if (err) { pr_err("Failed at line %d, err=%d\n", __LINE__, err); goto out; @@ -1951,8 +1948,8 @@ static int mock_context_barrier(void *arg) counter = 0; context_barrier_inject_fault = BIT(RCS0); - err = context_barrier_task(ctx, ALL_ENGINES, - NULL, NULL, mock_barrier_task, &counter); + err = context_barrier_task(ctx, ALL_ENGINES, NULL, NULL, NULL, + mock_barrier_task, &counter); context_barrier_inject_fault = 0; if (err == -ENXIO) err = 0; @@ -1966,11 +1963,8 @@ static int mock_context_barrier(void *arg) goto out; counter = 0; - err = context_barrier_task(ctx, ALL_ENGINES, - skip_unused_engines, - NULL, - mock_barrier_task, - &counter); + err = context_barrier_task(ctx, ALL_ENGINES, skip_unused_engines, + NULL, NULL, mock_barrier_task, &counter); if (err) { pr_err("Failed at line %d, err=%d\n", __LINE__, err); goto out; diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c index a49016f8ee0d..e1d50a5a1477 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c @@ -32,46 +32,39 @@ static int __igt_gpu_reloc(struct i915_execbuffer *eb, if (IS_ERR(vma)) return PTR_ERR(vma); - err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_HIGH); + err = i915_gem_object_lock(obj, &eb->ww); + if (err) + return err; + + err = i915_vma_pin_ww(vma, &eb->ww, 0, 0, PIN_USER | PIN_HIGH); if (err) return err; /* 8-Byte aligned */ - if (!__reloc_entry_gpu(eb, vma, - offsets[0] * sizeof(u32), - 0)) { - err = -EIO; - goto unpin_vma; - } + err = __reloc_entry_gpu(eb, vma, offsets[0] * sizeof(u32), 0); 
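/*
 * Note the changed convention in this selftest: __reloc_entry_gpu() now
 * returns an int rather than a bool. A negative value is propagated as an
 * error, zero is mapped to -EIO at the reloc_err label below, and only a
 * positive return lets the test continue.
 */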
+ if (err <= 0) + goto reloc_err; /* !8-Byte aligned */ - if (!__reloc_entry_gpu(eb, vma, - offsets[1] * sizeof(u32), - 1)) { - err = -EIO; - goto unpin_vma; - } + err = __reloc_entry_gpu(eb, vma, offsets[1] * sizeof(u32), 1); + if (err <= 0) + goto reloc_err; /* Skip to the end of the cmd page */ - i = PAGE_SIZE / sizeof(u32) - RELOC_TAIL - 1; + i = PAGE_SIZE / sizeof(u32) - 1; i -= eb->reloc_cache.rq_size; memset32(eb->reloc_cache.rq_cmd + eb->reloc_cache.rq_size, MI_NOOP, i); eb->reloc_cache.rq_size += i; - /* Force batch chaining */ - if (!__reloc_entry_gpu(eb, vma, - offsets[2] * sizeof(u32), - 2)) { - err = -EIO; - goto unpin_vma; - } + /* Force next batch */ + err = __reloc_entry_gpu(eb, vma, offsets[2] * sizeof(u32), 2); + if (err <= 0) + goto reloc_err; GEM_BUG_ON(!eb->reloc_cache.rq); rq = i915_request_get(eb->reloc_cache.rq); - err = reloc_gpu_flush(&eb->reloc_cache); - if (err) - goto put_rq; + reloc_gpu_flush(eb, &eb->reloc_cache); GEM_BUG_ON(eb->reloc_cache.rq); err = i915_gem_object_wait(obj, I915_WAIT_INTERRUPTIBLE, HZ / 2); @@ -103,6 +96,11 @@ static int __igt_gpu_reloc(struct i915_execbuffer *eb, unpin_vma: i915_vma_unpin(vma); return err; + +reloc_err: + if (!err) + err = -EIO; + goto unpin_vma; } static int igt_gpu_reloc(void *arg) @@ -124,6 +122,8 @@ static int igt_gpu_reloc(void *arg) goto err_scratch; } + intel_gt_pm_get(&eb.i915->gt); + for_each_uabi_engine(eb.engine, eb.i915) { reloc_cache_init(&eb.reloc_cache, eb.i915); memset(map, POISON_INUSE, 4096); @@ -134,15 +134,29 @@ static int igt_gpu_reloc(void *arg) err = PTR_ERR(eb.context); goto err_pm; } + eb.reloc_pool = NULL; + eb.reloc_context = NULL; - err = intel_context_pin(eb.context); - if (err) - goto err_put; + i915_gem_ww_ctx_init(&eb.ww, false); +retry: + err = intel_context_pin_ww(eb.context, &eb.ww); + if (!err) { + err = __igt_gpu_reloc(&eb, scratch); - err = __igt_gpu_reloc(&eb, scratch); + intel_context_unpin(eb.context); + } + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&eb.ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&eb.ww); + + if (eb.reloc_pool) + intel_gt_buffer_pool_put(eb.reloc_pool); + if (eb.reloc_context) + intel_context_put(eb.reloc_context); - intel_context_unpin(eb.context); -err_put: intel_context_put(eb.context); err_pm: intel_engine_pm_put(eb.engine); @@ -153,6 +167,7 @@ static int igt_gpu_reloc(void *arg) if (igt_flush_test(eb.i915)) err = -EIO; + intel_gt_pm_put(&eb.i915->gt); err_scratch: i915_gem_object_put(scratch); return err; diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c index 9c7402ce5bf9..d27d87a678c8 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c @@ -103,7 +103,7 @@ static int check_partial_mapping(struct drm_i915_gem_object *obj, GEM_BUG_ON(i915_gem_object_get_tiling(obj) != tile->tiling); GEM_BUG_ON(i915_gem_object_get_stride(obj) != tile->stride); - i915_gem_object_lock(obj); + i915_gem_object_lock(obj, NULL); err = i915_gem_object_set_to_gtt_domain(obj, true); i915_gem_object_unlock(obj); if (err) { @@ -188,7 +188,7 @@ static int check_partial_mappings(struct drm_i915_gem_object *obj, GEM_BUG_ON(i915_gem_object_get_tiling(obj) != tile->tiling); GEM_BUG_ON(i915_gem_object_get_stride(obj) != tile->stride); - i915_gem_object_lock(obj); + i915_gem_object_lock(obj, NULL); err = i915_gem_object_set_to_gtt_domain(obj, true); i915_gem_object_unlock(obj); if (err) { @@ -528,31 +528,42 @@ static int 
make_obj_busy(struct drm_i915_gem_object *obj) for_each_uabi_engine(engine, i915) { struct i915_request *rq; struct i915_vma *vma; + struct i915_gem_ww_ctx ww; int err; vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); if (IS_ERR(vma)) return PTR_ERR(vma); - err = i915_vma_pin(vma, 0, 0, PIN_USER); + i915_gem_ww_ctx_init(&ww, false); +retry: + err = i915_gem_object_lock(obj, &ww); + if (!err) + err = i915_vma_pin_ww(vma, &ww, 0, 0, PIN_USER); if (err) - return err; + goto err; rq = intel_engine_create_kernel_request(engine); if (IS_ERR(rq)) { - i915_vma_unpin(vma); - return PTR_ERR(rq); + err = PTR_ERR(rq); + goto err_unpin; } - i915_vma_lock(vma); err = i915_request_await_object(rq, vma->obj, true); if (err == 0) err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); - i915_vma_unlock(vma); i915_request_add(rq); +err_unpin: i915_vma_unpin(vma); +err: + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); if (err) return err; } @@ -1123,6 +1134,7 @@ static int __igt_mmap_gpu(struct drm_i915_private *i915, for_each_uabi_engine(engine, i915) { struct i915_request *rq; struct i915_vma *vma; + struct i915_gem_ww_ctx ww; vma = i915_vma_instance(obj, engine->kernel_context->vm, NULL); if (IS_ERR(vma)) { @@ -1130,9 +1142,13 @@ static int __igt_mmap_gpu(struct drm_i915_private *i915, goto out_unmap; } - err = i915_vma_pin(vma, 0, 0, PIN_USER); + i915_gem_ww_ctx_init(&ww, false); +retry: + err = i915_gem_object_lock(obj, &ww); + if (!err) + err = i915_vma_pin_ww(vma, &ww, 0, 0, PIN_USER); if (err) - goto out_unmap; + goto out_ww; rq = i915_request_create(engine->kernel_context); if (IS_ERR(rq)) { @@ -1140,11 +1156,9 @@ static int __igt_mmap_gpu(struct drm_i915_private *i915, goto out_unpin; } - i915_vma_lock(vma); err = i915_request_await_object(rq, vma->obj, false); if (err == 0) err = i915_vma_move_to_active(vma, rq, 0); - i915_vma_unlock(vma); err = engine->emit_bb_start(rq, vma->node.start, 0, 0); i915_request_get(rq); @@ -1166,6 +1180,13 @@ static int __igt_mmap_gpu(struct drm_i915_private *i915, out_unpin: i915_vma_unpin(vma); +out_ww: + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); if (err) goto out_unmap; } diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_phys.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_phys.c index 34932871b3a5..a94243dc4c5c 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_phys.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_phys.c @@ -44,7 +44,7 @@ static int mock_phys_object(void *arg) } /* Make the object dirty so that put_pages must do copy back the data */ - i915_gem_object_lock(obj); + i915_gem_object_lock(obj, NULL); err = i915_gem_object_set_to_gtt_domain(obj, true); i915_gem_object_unlock(obj); if (err) { diff --git a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c index cdc0b9c54305..fd0d24d28763 100644 --- a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c @@ -16,8 +16,10 @@ static inline void gen6_write_pde(const struct gen6_ppgtt *ppgtt, const unsigned int pde, const struct i915_page_table *pt) { + dma_addr_t addr = pt ? 
px_dma(pt) : px_dma(ppgtt->base.vm.scratch[1]); + /* Caller needs to make sure the write completes if necessary */ - iowrite32(GEN6_PDE_ADDR_ENCODE(px_dma(pt)) | GEN6_PDE_VALID, + iowrite32(GEN6_PDE_ADDR_ENCODE(addr) | GEN6_PDE_VALID, ppgtt->pd_addr + pde); } @@ -79,7 +81,7 @@ static void gen6_ppgtt_clear_range(struct i915_address_space *vm, { struct gen6_ppgtt * const ppgtt = to_gen6_ppgtt(i915_vm_to_ppgtt(vm)); const unsigned int first_entry = start / I915_GTT_PAGE_SIZE; - const gen6_pte_t scratch_pte = vm->scratch[0].encode; + const gen6_pte_t scratch_pte = vm->scratch[0]->encode; unsigned int pde = first_entry / GEN6_PTES; unsigned int pte = first_entry % GEN6_PTES; unsigned int num_entries = length / I915_GTT_PAGE_SIZE; @@ -90,8 +92,6 @@ static void gen6_ppgtt_clear_range(struct i915_address_space *vm, const unsigned int count = min(num_entries, GEN6_PTES - pte); gen6_pte_t *vaddr; - GEM_BUG_ON(px_base(pt) == px_base(&vm->scratch[1])); - num_entries -= count; GEM_BUG_ON(count > atomic_read(&pt->used)); @@ -127,7 +127,7 @@ static void gen6_ppgtt_insert_entries(struct i915_address_space *vm, struct sgt_dma iter = sgt_dma(vma); gen6_pte_t *vaddr; - GEM_BUG_ON(pd->entry[act_pt] == &vm->scratch[1]); + GEM_BUG_ON(!pd->entry[act_pt]); vaddr = kmap_atomic_px(i915_pt_entry(pd, act_pt)); do { @@ -177,39 +177,36 @@ static void gen6_flush_pd(struct gen6_ppgtt *ppgtt, u64 start, u64 end) mutex_unlock(&ppgtt->flush); } -static int gen6_alloc_va_range(struct i915_address_space *vm, - u64 start, u64 length) +static void gen6_alloc_va_range(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + u64 start, u64 length) { struct gen6_ppgtt *ppgtt = to_gen6_ppgtt(i915_vm_to_ppgtt(vm)); struct i915_page_directory * const pd = ppgtt->base.pd; - struct i915_page_table *pt, *alloc = NULL; + struct i915_page_table *pt; bool flush = false; u64 from = start; unsigned int pde; - int ret = 0; spin_lock(&pd->lock); gen6_for_each_pde(pt, pd, start, length, pde) { const unsigned int count = gen6_pte_count(start, length); - if (px_base(pt) == px_base(&vm->scratch[1])) { + if (!pt) { spin_unlock(&pd->lock); - pt = fetch_and_zero(&alloc); - if (!pt) - pt = alloc_pt(vm); - if (IS_ERR(pt)) { - ret = PTR_ERR(pt); - goto unwind_out; - } + pt = stash->pt[0]; + __i915_gem_object_pin_pages(pt->base); + i915_gem_object_make_unshrinkable(pt->base); - fill32_px(pt, vm->scratch[0].encode); + fill32_px(pt, vm->scratch[0]->encode); spin_lock(&pd->lock); - if (pd->entry[pde] == &vm->scratch[1]) { + if (!pd->entry[pde]) { + stash->pt[0] = pt->stash; + atomic_set(&pt->used, 0); pd->entry[pde] = pt; } else { - alloc = pt; pt = pd->entry[pde]; } @@ -226,38 +223,32 @@ static int gen6_alloc_va_range(struct i915_address_space *vm, with_intel_runtime_pm(&vm->i915->runtime_pm, wakeref) gen6_flush_pd(ppgtt, from, start); } - - goto out; - -unwind_out: - gen6_ppgtt_clear_range(vm, from, start - from); -out: - if (alloc) - free_px(vm, alloc); - return ret; } static int gen6_ppgtt_init_scratch(struct gen6_ppgtt *ppgtt) { struct i915_address_space * const vm = &ppgtt->base.vm; - struct i915_page_directory * const pd = ppgtt->base.pd; int ret; - ret = setup_scratch_page(vm, __GFP_HIGHMEM); + ret = setup_scratch_page(vm); if (ret) return ret; - vm->scratch[0].encode = - vm->pte_encode(px_dma(&vm->scratch[0]), + vm->scratch[0]->encode = + vm->pte_encode(px_dma(vm->scratch[0]), I915_CACHE_NONE, PTE_READ_ONLY); - if (unlikely(setup_page_dma(vm, px_base(&vm->scratch[1])))) { - cleanup_scratch_page(vm); - return -ENOMEM; + vm->scratch[1] = 
vm->alloc_pt_dma(vm, I915_GTT_PAGE_SIZE_4K); + if (IS_ERR(vm->scratch[1])) + return PTR_ERR(vm->scratch[1]); + + ret = pin_pt_dma(vm, vm->scratch[1]); + if (ret) { + i915_gem_object_put(vm->scratch[1]); + return ret; } - fill32_px(&vm->scratch[1], vm->scratch[0].encode); - memset_p(pd->entry, &vm->scratch[1], I915_PDES); + fill32_px(vm->scratch[1], vm->scratch[0]->encode); return 0; } @@ -265,14 +256,12 @@ static int gen6_ppgtt_init_scratch(struct gen6_ppgtt *ppgtt) static void gen6_ppgtt_free_pd(struct gen6_ppgtt *ppgtt) { struct i915_page_directory * const pd = ppgtt->base.pd; - struct i915_page_dma * const scratch = - px_base(&ppgtt->base.vm.scratch[1]); struct i915_page_table *pt; u32 pde; gen6_for_all_pdes(pt, pd, pde) - if (px_base(pt) != scratch) - free_px(&ppgtt->base.vm, pt); + if (pt) + free_pt(&ppgtt->base.vm, pt); } static void gen6_ppgtt_cleanup(struct i915_address_space *vm) @@ -286,7 +275,8 @@ static void gen6_ppgtt_cleanup(struct i915_address_space *vm) mutex_destroy(&ppgtt->flush); mutex_destroy(&ppgtt->pin_mutex); - kfree(ppgtt->base.pd); + + free_pd(&ppgtt->base.vm, ppgtt->base.pd); } static int pd_vma_set_pages(struct i915_vma *vma) @@ -302,28 +292,26 @@ static void pd_vma_clear_pages(struct i915_vma *vma) vma->pages = NULL; } -static int pd_vma_bind(struct i915_address_space *vm, - struct i915_vma *vma, - enum i915_cache_level cache_level, - u32 unused) +static void pd_vma_bind(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 unused) { struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); struct gen6_ppgtt *ppgtt = vma->private; u32 ggtt_offset = i915_ggtt_offset(vma) / I915_GTT_PAGE_SIZE; - px_base(ppgtt->base.pd)->ggtt_offset = ggtt_offset * sizeof(gen6_pte_t); + ppgtt->pp_dir = ggtt_offset * sizeof(gen6_pte_t) << 10; ppgtt->pd_addr = (gen6_pte_t __iomem *)ggtt->gsm + ggtt_offset; gen6_flush_pd(ppgtt, 0, ppgtt->base.vm.total); - return 0; } static void pd_vma_unbind(struct i915_address_space *vm, struct i915_vma *vma) { struct gen6_ppgtt *ppgtt = vma->private; struct i915_page_directory * const pd = ppgtt->base.pd; - struct i915_page_dma * const scratch = - px_base(&ppgtt->base.vm.scratch[1]); struct i915_page_table *pt; unsigned int pde; @@ -332,11 +320,11 @@ static void pd_vma_unbind(struct i915_address_space *vm, struct i915_vma *vma) /* Free all no longer used page tables */ gen6_for_all_pdes(pt, ppgtt->base.pd, pde) { - if (px_base(pt) == scratch || atomic_read(&pt->used)) + if (!pt || atomic_read(&pt->used)) continue; - free_px(&ppgtt->base.vm, pt); - pd->entry[pde] = scratch; + free_pt(&ppgtt->base.vm, pt); + pd->entry[pde] = NULL; } ppgtt->scan_for_unused_pt = false; @@ -380,7 +368,7 @@ static struct i915_vma *pd_vma_create(struct gen6_ppgtt *ppgtt, int size) return vma; } -int gen6_ppgtt_pin(struct i915_ppgtt *base) +int gen6_ppgtt_pin(struct i915_ppgtt *base, struct i915_gem_ww_ctx *ww) { struct gen6_ppgtt *ppgtt = to_gen6_ppgtt(base); int err; @@ -406,7 +394,7 @@ int gen6_ppgtt_pin(struct i915_ppgtt *base) */ err = 0; if (!atomic_read(&ppgtt->pin_count)) - err = i915_ggtt_pin(ppgtt->vma, GEN6_PD_ALIGN, PIN_HIGH); + err = i915_ggtt_pin(ppgtt->vma, ww, GEN6_PD_ALIGN, PIN_HIGH); if (!err) atomic_inc(&ppgtt->pin_count); mutex_unlock(&ppgtt->pin_mutex); @@ -448,6 +436,7 @@ struct i915_ppgtt *gen6_ppgtt_create(struct intel_gt *gt) mutex_init(&ppgtt->pin_mutex); ppgtt_init(&ppgtt->base, gt); + ppgtt->base.vm.pd_shift = ilog2(SZ_4K * SZ_4K / sizeof(gen6_pte_t)); ppgtt->base.vm.top = 1; 
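/*
 * pd_shift above works out to 22: one 4KiB page of 4-byte gen6 PTEs holds
 * 1024 entries, each mapping a 4KiB page, so SZ_4K * SZ_4K /
 * sizeof(gen6_pte_t) = 4MiB of address space per page directory entry and
 * ilog2(4MiB) = 22.
 */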
ppgtt->base.vm.bind_async_flags = I915_VMA_LOCAL_BIND; @@ -456,9 +445,10 @@ struct i915_ppgtt *gen6_ppgtt_create(struct intel_gt *gt) ppgtt->base.vm.insert_entries = gen6_ppgtt_insert_entries; ppgtt->base.vm.cleanup = gen6_ppgtt_cleanup; + ppgtt->base.vm.alloc_pt_dma = alloc_pt_dma; ppgtt->base.vm.pte_encode = ggtt->vm.pte_encode; - ppgtt->base.pd = __alloc_pd(sizeof(*ppgtt->base.pd)); + ppgtt->base.pd = __alloc_pd(I915_PDES); if (!ppgtt->base.pd) { err = -ENOMEM; goto err_free; @@ -479,7 +469,7 @@ struct i915_ppgtt *gen6_ppgtt_create(struct intel_gt *gt) err_scratch: free_scratch(&ppgtt->base.vm); err_pd: - kfree(ppgtt->base.pd); + free_pd(&ppgtt->base.vm, ppgtt->base.pd); err_free: mutex_destroy(&ppgtt->pin_mutex); kfree(ppgtt); diff --git a/drivers/gpu/drm/i915/gt/gen6_ppgtt.h b/drivers/gpu/drm/i915/gt/gen6_ppgtt.h index 72e481806c96..3357228f3304 100644 --- a/drivers/gpu/drm/i915/gt/gen6_ppgtt.h +++ b/drivers/gpu/drm/i915/gt/gen6_ppgtt.h @@ -8,12 +8,15 @@ #include "intel_gtt.h" +struct i915_gem_ww_ctx; + struct gen6_ppgtt { struct i915_ppgtt base; struct mutex flush; struct i915_vma *vma; gen6_pte_t __iomem *pd_addr; + u32 pp_dir; atomic_t pin_count; struct mutex pin_mutex; @@ -66,7 +69,7 @@ static inline struct gen6_ppgtt *to_gen6_ppgtt(struct i915_ppgtt *base) (pt = i915_pt_entry(pd, iter), true); \ ++iter) -int gen6_ppgtt_pin(struct i915_ppgtt *base); +int gen6_ppgtt_pin(struct i915_ppgtt *base, struct i915_gem_ww_ctx *ww); void gen6_ppgtt_unpin(struct i915_ppgtt *base); void gen6_ppgtt_unpin_all(struct i915_ppgtt *base); void gen6_ppgtt_enable(struct intel_gt *gt); diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c index 699125928272..eb64f474a78c 100644 --- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c @@ -181,7 +181,7 @@ static void __gen8_ppgtt_cleanup(struct i915_address_space *vm, } while (pde++, --count); } - free_px(vm, pd); + free_px(vm, &pd->pt, lvl); } static void gen8_ppgtt_cleanup(struct i915_address_space *vm) @@ -199,7 +199,7 @@ static u64 __gen8_ppgtt_clear(struct i915_address_space * const vm, struct i915_page_directory * const pd, u64 start, const u64 end, int lvl) { - const struct i915_page_scratch * const scratch = &vm->scratch[lvl]; + const struct drm_i915_gem_object * const scratch = vm->scratch[lvl]; unsigned int idx, len; GEM_BUG_ON(end > vm->total >> GEN8_PTE_SHIFT); @@ -239,7 +239,7 @@ static u64 __gen8_ppgtt_clear(struct i915_address_space * const vm, vaddr = kmap_atomic_px(pt); memset64(vaddr + gen8_pd_index(start, 0), - vm->scratch[0].encode, + vm->scratch[0]->encode, count); kunmap_atomic(vaddr); @@ -248,7 +248,7 @@ static u64 __gen8_ppgtt_clear(struct i915_address_space * const vm, } if (release_pd_entry(pd, idx, pt, scratch)) - free_px(vm, pt); + free_px(vm, pt, lvl); } while (idx++, --len); return start; @@ -269,14 +269,12 @@ static void gen8_ppgtt_clear(struct i915_address_space *vm, start, start + length, vm->top); } -static int __gen8_ppgtt_alloc(struct i915_address_space * const vm, - struct i915_page_directory * const pd, - u64 * const start, const u64 end, int lvl) +static void __gen8_ppgtt_alloc(struct i915_address_space * const vm, + struct i915_vm_pt_stash *stash, + struct i915_page_directory * const pd, + u64 * const start, const u64 end, int lvl) { - const struct i915_page_scratch * const scratch = &vm->scratch[lvl]; - struct i915_page_table *alloc = NULL; unsigned int idx, len; - int ret = 0; GEM_BUG_ON(end > vm->total >> GEN8_PTE_SHIFT); @@ -297,49 +295,31 @@ 
static int __gen8_ppgtt_alloc(struct i915_address_space * const vm, DBG("%s(%p):{ lvl:%d, idx:%d } allocating new tree\n", __func__, vm, lvl + 1, idx); - pt = fetch_and_zero(&alloc); - if (lvl) { - if (!pt) { - pt = &alloc_pd(vm)->pt; - if (IS_ERR(pt)) { - ret = PTR_ERR(pt); - goto out; - } - } + pt = stash->pt[!!lvl]; + __i915_gem_object_pin_pages(pt->base); + i915_gem_object_make_unshrinkable(pt->base); - fill_px(pt, vm->scratch[lvl].encode); - } else { - if (!pt) { - pt = alloc_pt(vm); - if (IS_ERR(pt)) { - ret = PTR_ERR(pt); - goto out; - } - } - - if (intel_vgpu_active(vm->i915) || - gen8_pt_count(*start, end) < I915_PDES) - fill_px(pt, vm->scratch[lvl].encode); - } + if (lvl || + gen8_pt_count(*start, end) < I915_PDES || + intel_vgpu_active(vm->i915)) + fill_px(pt, vm->scratch[lvl]->encode); spin_lock(&pd->lock); - if (likely(!pd->entry[idx])) + if (likely(!pd->entry[idx])) { + stash->pt[!!lvl] = pt->stash; + atomic_set(&pt->used, 0); set_pd_entry(pd, idx, pt); - else - alloc = pt, pt = pd->entry[idx]; + } else { + pt = pd->entry[idx]; + } } if (lvl) { atomic_inc(&pt->used); spin_unlock(&pd->lock); - ret = __gen8_ppgtt_alloc(vm, as_pd(pt), - start, end, lvl); - if (unlikely(ret)) { - if (release_pd_entry(pd, idx, pt, scratch)) - free_px(vm, pt); - goto out; - } + __gen8_ppgtt_alloc(vm, stash, + as_pd(pt), start, end, lvl); spin_lock(&pd->lock); atomic_dec(&pt->used); @@ -359,18 +339,12 @@ static int __gen8_ppgtt_alloc(struct i915_address_space * const vm, } } while (idx++, --len); spin_unlock(&pd->lock); -out: - if (alloc) - free_px(vm, alloc); - return ret; } -static int gen8_ppgtt_alloc(struct i915_address_space *vm, - u64 start, u64 length) +static void gen8_ppgtt_alloc(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + u64 start, u64 length) { - u64 from; - int err; - GEM_BUG_ON(!IS_ALIGNED(start, BIT_ULL(GEN8_PTE_SHIFT))); GEM_BUG_ON(!IS_ALIGNED(length, BIT_ULL(GEN8_PTE_SHIFT))); GEM_BUG_ON(range_overflows(start, length, vm->total)); @@ -378,25 +352,9 @@ static int gen8_ppgtt_alloc(struct i915_address_space *vm, start >>= GEN8_PTE_SHIFT; length >>= GEN8_PTE_SHIFT; GEM_BUG_ON(length == 0); - from = start; - err = __gen8_ppgtt_alloc(vm, i915_vm_to_ppgtt(vm)->pd, - &start, start + length, vm->top); - if (unlikely(err && from != start)) - __gen8_ppgtt_clear(vm, i915_vm_to_ppgtt(vm)->pd, - from, start, vm->top); - - return err; -} - -static __always_inline void -write_pte(gen8_pte_t *pte, const gen8_pte_t val) -{ - /* Magic delays? Or can we refine these to flush all in one pass? 
*/ - *pte = val; - wmb(); /* cpu to cache */ - clflush(pte); /* cache to memory */ - wmb(); /* visible to all */ + __gen8_ppgtt_alloc(vm, stash, i915_vm_to_ppgtt(vm)->pd, + &start, start + length, vm->top); } static __always_inline u64 @@ -415,8 +373,7 @@ gen8_ppgtt_insert_pte(struct i915_ppgtt *ppgtt, vaddr = kmap_atomic_px(i915_pt_entry(pd, gen8_pd_index(idx, 1))); do { GEM_BUG_ON(iter->sg->length < I915_GTT_PAGE_SIZE); - write_pte(&vaddr[gen8_pd_index(idx, 0)], - pte_encode | iter->dma); + vaddr[gen8_pd_index(idx, 0)] = pte_encode | iter->dma; iter->dma += I915_GTT_PAGE_SIZE; if (iter->dma >= iter->max) { @@ -439,10 +396,12 @@ gen8_ppgtt_insert_pte(struct i915_ppgtt *ppgtt, pd = pdp->entry[gen8_pd_index(idx, 2)]; } + clflush_cache_range(vaddr, PAGE_SIZE); kunmap_atomic(vaddr); vaddr = kmap_atomic_px(i915_pt_entry(pd, gen8_pd_index(idx, 1))); } } while (1); + clflush_cache_range(vaddr, PAGE_SIZE); kunmap_atomic(vaddr); return idx; @@ -498,7 +457,7 @@ static void gen8_ppgtt_insert_huge(struct i915_vma *vma, do { GEM_BUG_ON(iter->sg->length < page_size); - write_pte(&vaddr[index++], encode | iter->dma); + vaddr[index++] = encode | iter->dma; start += page_size; iter->dma += page_size; @@ -523,6 +482,7 @@ static void gen8_ppgtt_insert_huge(struct i915_vma *vma, } } while (rem >= page_size && index < I915_PDES); + clflush_cache_range(vaddr, PAGE_SIZE); kunmap_atomic(vaddr); /* @@ -554,7 +514,7 @@ static void gen8_ppgtt_insert_huge(struct i915_vma *vma, if (I915_SELFTEST_ONLY(vma->vm->scrub_64K)) { u16 i; - encode = vma->vm->scratch[0].encode; + encode = vma->vm->scratch[0]->encode; vaddr = kmap_atomic_px(i915_pt_entry(pd, maybe_64K)); for (i = 1; i < index; i += 16) @@ -608,27 +568,37 @@ static int gen8_init_scratch(struct i915_address_space *vm) GEM_BUG_ON(!clone->has_read_only); vm->scratch_order = clone->scratch_order; - memcpy(vm->scratch, clone->scratch, sizeof(vm->scratch)); - px_dma(&vm->scratch[0]) = 0; /* no xfer of ownership */ + for (i = 0; i <= vm->top; i++) + vm->scratch[i] = i915_gem_object_get(clone->scratch[i]); + return 0; } - ret = setup_scratch_page(vm, __GFP_HIGHMEM); + ret = setup_scratch_page(vm); if (ret) return ret; - vm->scratch[0].encode = - gen8_pte_encode(px_dma(&vm->scratch[0]), + vm->scratch[0]->encode = + gen8_pte_encode(px_dma(vm->scratch[0]), I915_CACHE_LLC, vm->has_read_only); for (i = 1; i <= vm->top; i++) { - if (unlikely(setup_page_dma(vm, px_base(&vm->scratch[i])))) + struct drm_i915_gem_object *obj; + + obj = vm->alloc_pt_dma(vm, I915_GTT_PAGE_SIZE_4K); + if (IS_ERR(obj)) goto free_scratch; - fill_px(&vm->scratch[i], vm->scratch[i - 1].encode); - vm->scratch[i].encode = - gen8_pde_encode(px_dma(&vm->scratch[i]), - I915_CACHE_LLC); + ret = pin_pt_dma(vm, obj); + if (ret) { + i915_gem_object_put(obj); + goto free_scratch; + } + + fill_px(obj, vm->scratch[i - 1]->encode); + obj->encode = gen8_pde_encode(px_dma(obj), I915_CACHE_LLC); + + vm->scratch[i] = obj; } return 0; @@ -649,12 +619,20 @@ static int gen8_preallocate_top_level_pdp(struct i915_ppgtt *ppgtt) for (idx = 0; idx < GEN8_3LVL_PDPES; idx++) { struct i915_page_directory *pde; + int err; pde = alloc_pd(vm); if (IS_ERR(pde)) return PTR_ERR(pde); - fill_px(pde, vm->scratch[1].encode); + err = pin_pt_dma(vm, pde->pt.base); + if (err) { + i915_gem_object_put(pde->pt.base); + free_pd(vm, pde); + return err; + } + + fill_px(pde, vm->scratch[1]->encode); set_pd_entry(pd, idx, pde); atomic_inc(px_used(pde)); /* keep pinned */ } @@ -668,21 +646,32 @@ gen8_alloc_top_pd(struct i915_address_space *vm) { const 
unsigned int count = gen8_pd_top_count(vm); struct i915_page_directory *pd; + int err; - GEM_BUG_ON(count > ARRAY_SIZE(pd->entry)); + GEM_BUG_ON(count > I915_PDES); - pd = __alloc_pd(offsetof(typeof(*pd), entry[count])); + pd = __alloc_pd(count); if (unlikely(!pd)) return ERR_PTR(-ENOMEM); - if (unlikely(setup_page_dma(vm, px_base(pd)))) { - kfree(pd); - return ERR_PTR(-ENOMEM); + pd->pt.base = vm->alloc_pt_dma(vm, I915_GTT_PAGE_SIZE_4K); + if (IS_ERR(pd->pt.base)) { + err = PTR_ERR(pd->pt.base); + pd->pt.base = NULL; + goto err_pd; } - fill_page_dma(px_base(pd), vm->scratch[vm->top].encode, count); + err = pin_pt_dma(vm, pd->pt.base); + if (err) + goto err_pd; + + fill_page_dma(px_base(pd), vm->scratch[vm->top]->encode, count); atomic_inc(px_used(pd)); /* mark as pinned */ return pd; + +err_pd: + free_pd(vm, pd); + return ERR_PTR(err); } /* @@ -703,6 +692,7 @@ struct i915_ppgtt *gen8_ppgtt_create(struct intel_gt *gt) ppgtt_init(ppgtt, gt); ppgtt->vm.top = i915_vm_is_4lvl(&ppgtt->vm) ? 3 : 2; + ppgtt->vm.pd_shift = ilog2(SZ_4K * SZ_4K / sizeof(gen8_pte_t)); /* * From bdw, there is hw support for read-only pages in the PPGTT. @@ -714,12 +704,7 @@ struct i915_ppgtt *gen8_ppgtt_create(struct intel_gt *gt) */ ppgtt->vm.has_read_only = !IS_GEN_RANGE(gt->i915, 11, 12); - /* - * There are only few exceptions for gen >=6. chv and bxt. - * And we are not sure about the latter so play safe for now. - */ - if (IS_CHERRYVIEW(gt->i915) || IS_BROXTON(gt->i915)) - ppgtt->vm.pt_kmap_wc = true; + ppgtt->vm.alloc_pt_dma = alloc_pt_dma; err = gen8_init_scratch(&ppgtt->vm); if (err) diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c index 91786310c114..d8b206e53660 100644 --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c @@ -28,6 +28,8 @@ #include "i915_drv.h" #include "i915_trace.h" +#include "intel_breadcrumbs.h" +#include "intel_context.h" #include "intel_gt_pm.h" #include "intel_gt_requests.h" @@ -53,33 +55,65 @@ static void irq_disable(struct intel_engine_cs *engine) spin_unlock(&engine->gt->irq_lock); } +static void __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b) +{ + lockdep_assert_held(&b->irq_lock); + + if (!b->irq_engine || b->irq_armed) + return; + + if (!intel_gt_pm_get_if_awake(b->irq_engine->gt)) + return; + + /* + * The breadcrumb irq will be disarmed on the interrupt after the + * waiters are signaled. This gives us a single interrupt window in + * which we can add a new waiter and avoid the cost of re-enabling + * the irq. + */ + WRITE_ONCE(b->irq_armed, true); + + /* + * Since we are waiting on a request, the GPU should be busy + * and should have its own rpm reference. This is tracked + * by i915->gt.awake, we can forgo holding our own wakref + * for the interrupt as before i915->gt.awake is released (when + * the driver is idle) we disarm the breadcrumbs. 
+ */ + + if (!b->irq_enabled++) + irq_enable(b->irq_engine); +} + static void __intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b) { - struct intel_engine_cs *engine = - container_of(b, struct intel_engine_cs, breadcrumbs); - lockdep_assert_held(&b->irq_lock); + if (!b->irq_engine || !b->irq_armed) + return; + GEM_BUG_ON(!b->irq_enabled); if (!--b->irq_enabled) - irq_disable(engine); + irq_disable(b->irq_engine); WRITE_ONCE(b->irq_armed, false); - intel_gt_pm_put_async(engine->gt); + intel_gt_pm_put_async(b->irq_engine->gt); } -void intel_engine_disarm_breadcrumbs(struct intel_engine_cs *engine) +static void add_signaling_context(struct intel_breadcrumbs *b, + struct intel_context *ce) { - struct intel_breadcrumbs *b = &engine->breadcrumbs; - unsigned long flags; + intel_context_get(ce); + list_add_tail(&ce->signal_link, &b->signalers); + if (list_is_first(&ce->signal_link, &b->signalers)) + __intel_breadcrumbs_arm_irq(b); +} - if (!READ_ONCE(b->irq_armed)) - return; - - spin_lock_irqsave(&b->irq_lock, flags); - if (b->irq_armed) - __intel_breadcrumbs_disarm_irq(b); - spin_unlock_irqrestore(&b->irq_lock, flags); +static void remove_signaling_context(struct intel_breadcrumbs *b, + struct intel_context *ce) +{ + list_del(&ce->signal_link); + intel_context_put(ce); } static inline bool __request_completed(const struct i915_request *rq) @@ -90,6 +124,9 @@ static inline bool __request_completed(const struct i915_request *rq) __maybe_unused static bool check_signal_order(struct intel_context *ce, struct i915_request *rq) { + if (rq->context != ce) + return false; + if (!list_is_last(&rq->signal_link, &ce->signals) && i915_seqno_passed(rq->fence.seqno, list_next_entry(rq, signal_link)->fence.seqno)) @@ -133,25 +170,21 @@ __dma_fence_signal__notify(struct dma_fence *fence, static void add_retire(struct intel_breadcrumbs *b, struct intel_timeline *tl) { - struct intel_engine_cs *engine = - container_of(b, struct intel_engine_cs, breadcrumbs); - - if (unlikely(intel_engine_is_virtual(engine))) - engine = intel_virtual_engine_get_sibling(engine, 0); - - intel_engine_add_retire(engine, tl); + if (b->irq_engine) + intel_engine_add_retire(b->irq_engine, tl); } -static void __signal_request(struct i915_request *rq, struct list_head *signals) +static bool __signal_request(struct i915_request *rq, struct list_head *signals) { - GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags)); clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags); - if (!__dma_fence_signal(&rq->fence)) - return; + if (!__dma_fence_signal(&rq->fence)) { + i915_request_put(rq); + return false; + } - i915_request_get(rq); list_add_tail(&rq->signal_link, signals); + return true; } static void signal_irq_work(struct irq_work *work) @@ -164,7 +197,7 @@ static void signal_irq_work(struct irq_work *work) spin_lock(&b->irq_lock); - if (b->irq_armed && list_empty(&b->signalers)) + if (list_empty(&b->signalers)) __intel_breadcrumbs_disarm_irq(b); list_splice_init(&b->signaled_requests, &signal); @@ -197,8 +230,8 @@ static void signal_irq_work(struct irq_work *work) /* Advance the list to the first incomplete request */ __list_del_many(&ce->signals, pos); if (&ce->signals == pos) { /* now empty */ - list_del_init(&ce->signal_link); add_retire(b, ce->timeline); + remove_signaling_context(b, ce); } } } @@ -220,116 +253,89 @@ static void signal_irq_work(struct irq_work *work) } } -static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b) +struct intel_breadcrumbs * +intel_breadcrumbs_create(struct intel_engine_cs 
*irq_engine) { - struct intel_engine_cs *engine = - container_of(b, struct intel_engine_cs, breadcrumbs); + struct intel_breadcrumbs *b; - lockdep_assert_held(&b->irq_lock); - if (b->irq_armed) - return true; - - if (!intel_gt_pm_get_if_awake(engine->gt)) - return false; - - /* - * The breadcrumb irq will be disarmed on the interrupt after the - * waiters are signaled. This gives us a single interrupt window in - * which we can add a new waiter and avoid the cost of re-enabling - * the irq. - */ - WRITE_ONCE(b->irq_armed, true); - - /* - * Since we are waiting on a request, the GPU should be busy - * and should have its own rpm reference. This is tracked - * by i915->gt.awake, we can forgo holding our own wakref - * for the interrupt as before i915->gt.awake is released (when - * the driver is idle) we disarm the breadcrumbs. - */ - - if (!b->irq_enabled++) - irq_enable(engine); - - return true; -} - -void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine) -{ - struct intel_breadcrumbs *b = &engine->breadcrumbs; + b = kzalloc(sizeof(*b), GFP_KERNEL); + if (!b) + return NULL; spin_lock_init(&b->irq_lock); INIT_LIST_HEAD(&b->signalers); INIT_LIST_HEAD(&b->signaled_requests); init_irq_work(&b->irq_work, signal_irq_work); + + b->irq_engine = irq_engine; + + return b; } -void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine) +void intel_breadcrumbs_reset(struct intel_breadcrumbs *b) { - struct intel_breadcrumbs *b = &engine->breadcrumbs; unsigned long flags; + if (!b->irq_engine) + return; + spin_lock_irqsave(&b->irq_lock, flags); if (b->irq_enabled) - irq_enable(engine); + irq_enable(b->irq_engine); else - irq_disable(engine); + irq_disable(b->irq_engine); spin_unlock_irqrestore(&b->irq_lock, flags); } -void intel_engine_transfer_stale_breadcrumbs(struct intel_engine_cs *engine, - struct intel_context *ce) +void intel_breadcrumbs_park(struct intel_breadcrumbs *b) { - struct intel_breadcrumbs *b = &engine->breadcrumbs; unsigned long flags; + if (!READ_ONCE(b->irq_armed)) + return; + spin_lock_irqsave(&b->irq_lock, flags); - if (!list_empty(&ce->signals)) { - struct i915_request *rq, *next; - - /* Queue for executing the signal callbacks in the irq_work */ - list_for_each_entry_safe(rq, next, &ce->signals, signal_link) { - GEM_BUG_ON(rq->engine != engine); - GEM_BUG_ON(!__request_completed(rq)); - - __signal_request(rq, &b->signaled_requests); - } - - INIT_LIST_HEAD(&ce->signals); - list_del_init(&ce->signal_link); - - irq_work_queue(&b->irq_work); - } + __intel_breadcrumbs_disarm_irq(b); spin_unlock_irqrestore(&b->irq_lock, flags); + + if (!list_empty(&b->signalers)) + irq_work_queue(&b->irq_work); } -void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine) +void intel_breadcrumbs_free(struct intel_breadcrumbs *b) { + kfree(b); } -bool i915_request_enable_breadcrumb(struct i915_request *rq) +static void insert_breadcrumb(struct i915_request *rq, + struct intel_breadcrumbs *b) { - lockdep_assert_held(&rq->lock); + struct intel_context *ce = rq->context; + struct list_head *pos; - if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags)) - return true; + if (test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags)) + return; - if (test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) { - struct intel_breadcrumbs *b = &rq->engine->breadcrumbs; - struct intel_context *ce = rq->context; - struct list_head *pos; + i915_request_get(rq); - spin_lock(&b->irq_lock); - - if (test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags)) - goto unlock; - - if 
(!__intel_breadcrumbs_arm_irq(b)) - goto unlock; + /* + * If the request is already completed, we can transfer it + * straight onto a signaled list, and queue the irq worker for + * its signal completion. + */ + if (__request_completed(rq)) { + if (__signal_request(rq, &b->signaled_requests)) + irq_work_queue(&b->irq_work); + return; + } + if (list_empty(&ce->signals)) { + add_signaling_context(b, ce); + pos = &ce->signals; + } else { /* * We keep the seqno in retirement order, so we can break * inside intel_engine_signal_breadcrumbs as soon as we've @@ -351,24 +357,75 @@ bool i915_request_enable_breadcrumb(struct i915_request *rq) if (i915_seqno_passed(rq->fence.seqno, it->fence.seqno)) break; } - list_add(&rq->signal_link, pos); - if (pos == &ce->signals) /* catch transitions from empty list */ - list_move_tail(&ce->signal_link, &b->signalers); - GEM_BUG_ON(!check_signal_order(ce, rq)); + } + list_add(&rq->signal_link, pos); + GEM_BUG_ON(!check_signal_order(ce, rq)); + set_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags); - set_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags); -unlock: + /* Check after attaching to irq, interrupt may have already fired. */ + if (__request_completed(rq)) + irq_work_queue(&b->irq_work); +} + +bool i915_request_enable_breadcrumb(struct i915_request *rq) +{ + struct intel_breadcrumbs *b; + + /* Serialises with i915_request_retire() using rq->lock */ + if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags)) + return true; + + /* + * Peek at i915_request_submit()/i915_request_unsubmit() status. + * + * If the request is not yet active (and not signaled), we will + * attach the breadcrumb later. + */ + if (!test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) + return true; + + /* + * rq->engine is locked by rq->engine->active.lock. That however + * is not known until after rq->engine has been dereferenced and + * the lock acquired. Hence we acquire the lock and then validate + * that rq->engine still matches the lock we hold for it. + * + * Here, we are using the breadcrumb lock as a proxy for the + * rq->engine->active.lock, and we know that since the breadcrumb + * will be serialised within i915_request_submit/i915_request_unsubmit, + * the engine cannot change while active as long as we hold the + * breadcrumb lock on that engine. + * + * From the dma_fence_enable_signaling() path, we are outside of the + * request submit/unsubmit path, and so we must be more careful to + * acquire the right lock. + */ + b = READ_ONCE(rq->engine)->breadcrumbs; + spin_lock(&b->irq_lock); + while (unlikely(b != READ_ONCE(rq->engine)->breadcrumbs)) { spin_unlock(&b->irq_lock); + b = READ_ONCE(rq->engine)->breadcrumbs; + spin_lock(&b->irq_lock); } - return !__request_completed(rq); + /* + * Now that we are finally serialised with request submit/unsubmit, + * [with b->irq_lock] and with i915_request_retire() [via checking + * SIGNALED with rq->lock] confirm the request is indeed active. If + * it is no longer active, the breadcrumb will be attached upon + * i915_request_submit(). 
+ */ + if (test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) + insert_breadcrumb(rq, b); + + spin_unlock(&b->irq_lock); + + return true; } void i915_request_cancel_breadcrumb(struct i915_request *rq) { - struct intel_breadcrumbs *b = &rq->engine->breadcrumbs; - - lockdep_assert_held(&rq->lock); + struct intel_breadcrumbs *b = rq->engine->breadcrumbs; /* * We must wait for b->irq_lock so that we know the interrupt handler @@ -382,23 +439,19 @@ void i915_request_cancel_breadcrumb(struct i915_request *rq) list_del(&rq->signal_link); if (list_empty(&ce->signals)) - list_del_init(&ce->signal_link); + remove_signaling_context(b, ce); clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags); + i915_request_put(rq); } spin_unlock(&b->irq_lock); } -void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine, - struct drm_printer *p) +static void print_signals(struct intel_breadcrumbs *b, struct drm_printer *p) { - struct intel_breadcrumbs *b = &engine->breadcrumbs; struct intel_context *ce; struct i915_request *rq; - if (list_empty(&b->signalers)) - return; - drm_printf(p, "Signals:\n"); spin_lock_irq(&b->irq_lock); @@ -414,3 +467,17 @@ void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine, } spin_unlock_irq(&b->irq_lock); } + +void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine, + struct drm_printer *p) +{ + struct intel_breadcrumbs *b; + + b = engine->breadcrumbs; + if (!b) + return; + + drm_printf(p, "IRQ: %s\n", enableddisabled(b->irq_armed)); + if (!list_empty(&b->signalers)) + print_signals(b, p); +} diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.h b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.h new file mode 100644 index 000000000000..ed3d1deabfbd --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2019 Intel Corporation + */ + +#ifndef __INTEL_BREADCRUMBS__ +#define __INTEL_BREADCRUMBS__ + +#include + +#include "intel_engine_types.h" + +struct drm_printer; +struct i915_request; +struct intel_breadcrumbs; + +struct intel_breadcrumbs * +intel_breadcrumbs_create(struct intel_engine_cs *irq_engine); +void intel_breadcrumbs_free(struct intel_breadcrumbs *b); + +void intel_breadcrumbs_reset(struct intel_breadcrumbs *b); +void intel_breadcrumbs_park(struct intel_breadcrumbs *b); + +static inline void +intel_engine_signal_breadcrumbs(struct intel_engine_cs *engine) +{ + irq_work_queue(&engine->breadcrumbs->irq_work); +} + +void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine, + struct drm_printer *p); + +bool i915_request_enable_breadcrumb(struct i915_request *request); +void i915_request_cancel_breadcrumb(struct i915_request *request); + +#endif /* __INTEL_BREADCRUMBS__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h b/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h new file mode 100644 index 000000000000..8e53b9942695 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2019 Intel Corporation + */ + +#ifndef __INTEL_BREADCRUMBS_TYPES__ +#define __INTEL_BREADCRUMBS_TYPES__ + +#include +#include +#include +#include + +/* + * Rather than have every client wait upon all user interrupts, + * with the herd waking after every interrupt and each doing the + * heavyweight seqno dance, we delegate the task (of being the + * bottom-half of the user interrupt) to the first client. 
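Editor's illustration (not part of the diff): i915_request_enable_breadcrumb() above has to cope with rq->engine changing until the right breadcrumbs lock is held. Expressed as a standalone, hypothetical helper, the lock-then-revalidate step is:

static struct intel_breadcrumbs *lock_request_breadcrumbs(struct i915_request *rq)
{
	struct intel_breadcrumbs *b;

	/* rq->engine is only stable once we hold its breadcrumbs irq_lock. */
	b = READ_ONCE(rq->engine)->breadcrumbs;
	spin_lock(&b->irq_lock);
	while (unlikely(b != READ_ONCE(rq->engine)->breadcrumbs)) {
		spin_unlock(&b->irq_lock);
		b = READ_ONCE(rq->engine)->breadcrumbs;
		spin_lock(&b->irq_lock);
	}

	return b;	/* caller releases b->irq_lock */
}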
After + * every interrupt, we wake up one client, who does the heavyweight + * coherent seqno read and either goes back to sleep (if incomplete), + * or wakes up all the completed clients in parallel, before then + * transferring the bottom-half status to the next client in the queue. + * + * Compared to walking the entire list of waiters in a single dedicated + * bottom-half, we reduce the latency of the first waiter by avoiding + * a context switch, but incur additional coherent seqno reads when + * following the chain of request breadcrumbs. Since it is most likely + * that we have a single client waiting on each seqno, then reducing + * the overhead of waking that client is much preferred. + */ +struct intel_breadcrumbs { + spinlock_t irq_lock; /* protects the lists used in hardirq context */ + + /* Not all breadcrumbs are attached to physical HW */ + struct intel_engine_cs *irq_engine; + + struct list_head signalers; + struct list_head signaled_requests; + + struct irq_work irq_work; /* for use from inside irq_lock */ + + unsigned int irq_enabled; + + bool irq_armed; +}; + +#endif /* __INTEL_BREADCRUMBS_TYPES__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index 52db2bde44a3..d301dda1b261 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -93,85 +93,12 @@ static void intel_context_active_release(struct intel_context *ce) i915_active_release(&ce->active); } -int __intel_context_do_pin(struct intel_context *ce) -{ - int err; - - if (unlikely(!test_bit(CONTEXT_ALLOC_BIT, &ce->flags))) { - err = intel_context_alloc_state(ce); - if (err) - return err; - } - - err = i915_active_acquire(&ce->active); - if (err) - return err; - - if (mutex_lock_interruptible(&ce->pin_mutex)) { - err = -EINTR; - goto out_release; - } - - if (unlikely(intel_context_is_closed(ce))) { - err = -ENOENT; - goto out_unlock; - } - - if (likely(!atomic_add_unless(&ce->pin_count, 1, 0))) { - err = intel_context_active_acquire(ce); - if (unlikely(err)) - goto out_unlock; - - err = ce->ops->pin(ce); - if (unlikely(err)) - goto err_active; - - CE_TRACE(ce, "pin ring:{start:%08x, head:%04x, tail:%04x}\n", - i915_ggtt_offset(ce->ring->vma), - ce->ring->head, ce->ring->tail); - - smp_mb__before_atomic(); /* flush pin before it is visible */ - atomic_inc(&ce->pin_count); - } - - GEM_BUG_ON(!intel_context_is_pinned(ce)); /* no overflow! */ - GEM_BUG_ON(i915_active_is_idle(&ce->active)); - goto out_unlock; - -err_active: - intel_context_active_release(ce); -out_unlock: - mutex_unlock(&ce->pin_mutex); -out_release: - i915_active_release(&ce->active); - return err; -} - -void intel_context_unpin(struct intel_context *ce) -{ - if (!atomic_dec_and_test(&ce->pin_count)) - return; - - CE_TRACE(ce, "unpin\n"); - ce->ops->unpin(ce); - - /* - * Once released, we may asynchronously drop the active reference. - * As that may be the only reference keeping the context alive, - * take an extra now so that it is not freed before we finish - * dereferencing it. 
- */ - intel_context_get(ce); - intel_context_active_release(ce); - intel_context_put(ce); -} - -static int __context_pin_state(struct i915_vma *vma) +static int __context_pin_state(struct i915_vma *vma, struct i915_gem_ww_ctx *ww) { unsigned int bias = i915_ggtt_pin_bias(vma) | PIN_OFFSET_BIAS; int err; - err = i915_ggtt_pin(vma, 0, bias | PIN_HIGH); + err = i915_ggtt_pin(vma, ww, 0, bias | PIN_HIGH); if (err) return err; @@ -200,11 +127,12 @@ static void __context_unpin_state(struct i915_vma *vma) __i915_vma_unpin(vma); } -static int __ring_active(struct intel_ring *ring) +static int __ring_active(struct intel_ring *ring, + struct i915_gem_ww_ctx *ww) { int err; - err = intel_ring_pin(ring); + err = intel_ring_pin(ring, ww); if (err) return err; @@ -225,6 +153,173 @@ static void __ring_retire(struct intel_ring *ring) intel_ring_unpin(ring); } +static int intel_context_pre_pin(struct intel_context *ce, + struct i915_gem_ww_ctx *ww) +{ + int err; + + CE_TRACE(ce, "active\n"); + + err = __ring_active(ce->ring, ww); + if (err) + return err; + + err = intel_timeline_pin(ce->timeline, ww); + if (err) + goto err_ring; + + if (!ce->state) + return 0; + + err = __context_pin_state(ce->state, ww); + if (err) + goto err_timeline; + + + return 0; + +err_timeline: + intel_timeline_unpin(ce->timeline); +err_ring: + __ring_retire(ce->ring); + return err; +} + +static void intel_context_post_unpin(struct intel_context *ce) +{ + if (ce->state) + __context_unpin_state(ce->state); + + intel_timeline_unpin(ce->timeline); + __ring_retire(ce->ring); +} + +int __intel_context_do_pin_ww(struct intel_context *ce, + struct i915_gem_ww_ctx *ww) +{ + bool handoff = false; + void *vaddr; + int err = 0; + + if (unlikely(!test_bit(CONTEXT_ALLOC_BIT, &ce->flags))) { + err = intel_context_alloc_state(ce); + if (err) + return err; + } + + /* + * We always pin the context/ring/timeline here, to ensure a pin + * refcount for __intel_context_active(), which prevent a lock + * inversion of ce->pin_mutex vs dma_resv_lock(). + */ + + err = i915_gem_object_lock(ce->timeline->hwsp_ggtt->obj, ww); + if (!err && ce->ring->vma->obj) + err = i915_gem_object_lock(ce->ring->vma->obj, ww); + if (!err && ce->state) + err = i915_gem_object_lock(ce->state->obj, ww); + if (!err) + err = intel_context_pre_pin(ce, ww); + if (err) + return err; + + err = i915_active_acquire(&ce->active); + if (err) + goto err_ctx_unpin; + + err = ce->ops->pre_pin(ce, ww, &vaddr); + if (err) + goto err_release; + + err = mutex_lock_interruptible(&ce->pin_mutex); + if (err) + goto err_post_unpin; + + if (unlikely(intel_context_is_closed(ce))) { + err = -ENOENT; + goto err_unlock; + } + + if (likely(!atomic_add_unless(&ce->pin_count, 1, 0))) { + err = intel_context_active_acquire(ce); + if (unlikely(err)) + goto err_unlock; + + err = ce->ops->pin(ce, vaddr); + if (err) { + intel_context_active_release(ce); + goto err_unlock; + } + + CE_TRACE(ce, "pin ring:{start:%08x, head:%04x, tail:%04x}\n", + i915_ggtt_offset(ce->ring->vma), + ce->ring->head, ce->ring->tail); + + handoff = true; + smp_mb__before_atomic(); /* flush pin before it is visible */ + atomic_inc(&ce->pin_count); + } + + GEM_BUG_ON(!intel_context_is_pinned(ce)); /* no overflow! */ + +err_unlock: + mutex_unlock(&ce->pin_mutex); +err_post_unpin: + if (!handoff) + ce->ops->post_unpin(ce); +err_release: + i915_active_release(&ce->active); +err_ctx_unpin: + intel_context_post_unpin(ce); + + /* + * Unlock the hwsp_ggtt object since it's shared. 
+ * In principle we can unlock all the global state locked above + * since it's pinned and doesn't need fencing, and will + * thus remain resident until it is explicitly unpinned. + */ + i915_gem_ww_unlock_single(ce->timeline->hwsp_ggtt->obj); + + return err; +} + +int __intel_context_do_pin(struct intel_context *ce) +{ + struct i915_gem_ww_ctx ww; + int err; + + i915_gem_ww_ctx_init(&ww, true); +retry: + err = __intel_context_do_pin_ww(ce, &ww); + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); + return err; +} + +void intel_context_unpin(struct intel_context *ce) +{ + if (!atomic_dec_and_test(&ce->pin_count)) + return; + + CE_TRACE(ce, "unpin\n"); + ce->ops->unpin(ce); + ce->ops->post_unpin(ce); + + /* + * Once released, we may asynchronously drop the active reference. + * As that may be the only reference keeping the context alive, + * take an extra now so that it is not freed before we finish + * dereferencing it. + */ + intel_context_get(ce); + intel_context_active_release(ce); + intel_context_put(ce); +} + __i915_active_call static void __intel_context_retire(struct i915_active *active) { @@ -235,48 +330,29 @@ static void __intel_context_retire(struct i915_active *active) intel_context_get_avg_runtime_ns(ce)); set_bit(CONTEXT_VALID_BIT, &ce->flags); - if (ce->state) - __context_unpin_state(ce->state); - - intel_timeline_unpin(ce->timeline); - __ring_retire(ce->ring); - + intel_context_post_unpin(ce); intel_context_put(ce); } static int __intel_context_active(struct i915_active *active) { struct intel_context *ce = container_of(active, typeof(*ce), active); - int err; - - CE_TRACE(ce, "active\n"); intel_context_get(ce); - err = __ring_active(ce->ring); - if (err) - goto err_put; + /* everything should already be activated by intel_context_pre_pin() */ + GEM_WARN_ON(!i915_active_acquire_if_busy(&ce->ring->vma->active)); + __intel_ring_pin(ce->ring); - err = intel_timeline_pin(ce->timeline); - if (err) - goto err_ring; + __intel_timeline_pin(ce->timeline); - if (!ce->state) - return 0; - - err = __context_pin_state(ce->state); - if (err) - goto err_timeline; + if (ce->state) { + GEM_WARN_ON(!i915_active_acquire_if_busy(&ce->state->active)); + __i915_vma_pin(ce->state); + i915_vma_make_unshrinkable(ce->state); + } return 0; - -err_timeline: - intel_timeline_unpin(ce->timeline); -err_ring: - __ring_retire(ce->ring); -err_put: - intel_context_put(ce); - return err; } void @@ -382,15 +458,37 @@ int intel_context_prepare_remote_request(struct intel_context *ce, struct i915_request *intel_context_create_request(struct intel_context *ce) { + struct i915_gem_ww_ctx ww; struct i915_request *rq; int err; - err = intel_context_pin(ce); - if (unlikely(err)) - return ERR_PTR(err); + i915_gem_ww_ctx_init(&ww, true); +retry: + err = intel_context_pin_ww(ce, &ww); + if (!err) { + rq = i915_request_create(ce); + intel_context_unpin(ce); + } else if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } else { + rq = ERR_PTR(err); + } - rq = i915_request_create(ce); - intel_context_unpin(ce); + i915_gem_ww_ctx_fini(&ww); + + if (IS_ERR(rq)) + return rq; + + /* + * timeline->mutex should be the inner lock, but is used as outer lock. + * Hack around this to shut up lockdep in selftests.. 
+ */ + lockdep_unpin_lock(&ce->timeline->mutex, rq->cookie); + mutex_release(&ce->timeline->mutex.dep_map, _RET_IP_); + mutex_acquire(&ce->timeline->mutex.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_); + rq->cookie = lockdep_pin_lock(&ce->timeline->mutex); return rq; } diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h index 07be021882cc..fda2eba81e22 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.h +++ b/drivers/gpu/drm/i915/gt/intel_context.h @@ -25,6 +25,8 @@ ##__VA_ARGS__); \ } while (0) +struct i915_gem_ww_ctx; + void intel_context_init(struct intel_context *ce, struct intel_engine_cs *engine); void intel_context_fini(struct intel_context *ce); @@ -81,6 +83,8 @@ static inline void intel_context_unlock_pinned(struct intel_context *ce) } int __intel_context_do_pin(struct intel_context *ce); +int __intel_context_do_pin_ww(struct intel_context *ce, + struct i915_gem_ww_ctx *ww); static inline bool intel_context_pin_if_active(struct intel_context *ce) { @@ -95,6 +99,15 @@ static inline int intel_context_pin(struct intel_context *ce) return __intel_context_do_pin(ce); } +static inline int intel_context_pin_ww(struct intel_context *ce, + struct i915_gem_ww_ctx *ww) +{ + if (likely(intel_context_pin_if_active(ce))) + return 0; + + return __intel_context_do_pin_ww(ce, ww); +} + static inline void __intel_context_pin(struct intel_context *ce) { GEM_BUG_ON(!intel_context_is_pinned(ce)); diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h index 4954b0df4864..552cb57a2e8c 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -23,6 +23,7 @@ DECLARE_EWMA(runtime, 3, 8); struct i915_gem_context; +struct i915_gem_ww_ctx; struct i915_vma; struct intel_context; struct intel_ring; @@ -30,8 +31,10 @@ struct intel_ring; struct intel_context_ops { int (*alloc)(struct intel_context *ce); - int (*pin)(struct intel_context *ce); + int (*pre_pin)(struct intel_context *ce, struct i915_gem_ww_ctx *ww, void **vaddr); + int (*pin)(struct intel_context *ce, void *vaddr); void (*unpin)(struct intel_context *ce); + void (*post_unpin)(struct intel_context *ce); void (*enter)(struct intel_context *ce); void (*exit)(struct intel_context *ce); diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h index a9249a23903a..08e2c000dcc3 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine.h +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -223,26 +223,6 @@ void intel_engine_get_instdone(const struct intel_engine_cs *engine, void intel_engine_init_execlists(struct intel_engine_cs *engine); -void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine); -void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine); - -void intel_engine_disarm_breadcrumbs(struct intel_engine_cs *engine); - -static inline void -intel_engine_signal_breadcrumbs(struct intel_engine_cs *engine) -{ - irq_work_queue(&engine->breadcrumbs.irq_work); -} - -void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine); -void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine); - -void intel_engine_transfer_stale_breadcrumbs(struct intel_engine_cs *engine, - struct intel_context *ce); - -void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine, - struct drm_printer *p); - static inline u32 *__gen8_emit_pipe_control(u32 *batch, u32 flags0, u32 flags1, u32 offset) { memset(batch, 0, 6 * sizeof(u32)); diff --git 
a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 26087dd79782..5bfb5f7ed02c 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -28,6 +28,7 @@ #include "i915_drv.h" +#include "intel_breadcrumbs.h" #include "intel_context.h" #include "intel_engine.h" #include "intel_engine_pm.h" @@ -634,7 +635,7 @@ static int pin_ggtt_status_page(struct intel_engine_cs *engine, else flags = PIN_HIGH; - return i915_ggtt_pin(vma, 0, flags); + return i915_ggtt_pin(vma, NULL, 0, flags); } static int init_status_page(struct intel_engine_cs *engine) @@ -700,8 +701,13 @@ static int engine_setup_common(struct intel_engine_cs *engine) if (err) return err; + engine->breadcrumbs = intel_breadcrumbs_create(engine); + if (!engine->breadcrumbs) { + err = -ENOMEM; + goto err_status; + } + intel_engine_init_active(engine, ENGINE_PHYSICAL); - intel_engine_init_breadcrumbs(engine); intel_engine_init_execlists(engine); intel_engine_init_cmd_parser(engine); intel_engine_init__pm(engine); @@ -716,6 +722,10 @@ static int engine_setup_common(struct intel_engine_cs *engine) intel_engine_init_ctx_wa(engine); return 0; + +err_status: + cleanup_status_page(engine); + return err; } struct measure_breadcrumb { @@ -785,9 +795,11 @@ intel_engine_init_active(struct intel_engine_cs *engine, unsigned int subclass) } static struct intel_context * -create_kernel_context(struct intel_engine_cs *engine) +create_pinned_context(struct intel_engine_cs *engine, + unsigned int hwsp, + struct lock_class_key *key, + const char *name) { - static struct lock_class_key kernel; struct intel_context *ce; int err; @@ -796,6 +808,7 @@ create_kernel_context(struct intel_engine_cs *engine) return ce; __set_bit(CONTEXT_BARRIER_BIT, &ce->flags); + ce->timeline = page_pack_bits(NULL, hwsp); err = intel_context_pin(ce); /* perma-pin so it is always available */ if (err) { @@ -809,11 +822,20 @@ create_kernel_context(struct intel_engine_cs *engine) * should we need to inject GPU operations during their request * construction. */ - lockdep_set_class(&ce->timeline->mutex, &kernel); + lockdep_set_class_and_name(&ce->timeline->mutex, key, name); return ce; } +static struct intel_context * +create_kernel_context(struct intel_engine_cs *engine) +{ + static struct lock_class_key kernel; + + return create_pinned_context(engine, I915_GEM_HWS_SEQNO_ADDR, + &kernel, "kernel_context"); +} + /** * intel_engines_init_common - initialize cengine state which might require hw access * @engine: Engine to initialize. 
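Editor's illustration (not part of the diff): create_pinned_context() above generalises the old create_kernel_context() by taking the HWSP offset, lockdep class and name as parameters. A second perma-pinned context would be created the same way; the HWSP slot and the names below are hypothetical:

/* Hypothetical HWSP slot: a byte offset into the per-engine status page. */
#define EXAMPLE_HWS_ADDR	(0x60 * sizeof(u32))

static struct intel_context *
create_example_context(struct intel_engine_cs *engine)
{
	static struct lock_class_key example;

	return create_pinned_context(engine, EXAMPLE_HWS_ADDR,
				     &example, "example_context");
}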
@@ -902,9 +924,9 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine) tasklet_kill(&engine->execlists.tasklet); /* flush the callback */ cleanup_status_page(engine); + intel_breadcrumbs_free(engine->breadcrumbs); intel_engine_fini_retire(engine); - intel_engine_fini_breadcrumbs(engine); intel_engine_cleanup_cmd_parser(engine); if (engine->default_state) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c index 8ec3eecf3e39..f7b2e07e2229 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c @@ -6,6 +6,7 @@ #include "i915_drv.h" +#include "intel_breadcrumbs.h" #include "intel_context.h" #include "intel_engine.h" #include "intel_engine_heartbeat.h" @@ -247,7 +248,7 @@ static int __engine_park(struct intel_wakeref *wf) call_idle_barriers(engine); /* cleanup after wedging */ intel_engine_park_heartbeat(engine); - intel_engine_disarm_breadcrumbs(engine); + intel_breadcrumbs_park(engine->breadcrumbs); /* Must be reset upon idling, or we may miss the busy wakeup. */ GEM_BUG_ON(engine->execlists.queue_priority_hint != INT_MIN); diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index 8de92fd7d392..c400aaa2287b 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -22,6 +22,7 @@ #include "i915_pmu.h" #include "i915_priolist_types.h" #include "i915_selftest.h" +#include "intel_breadcrumbs_types.h" #include "intel_sseu.h" #include "intel_timeline_types.h" #include "intel_uncore.h" @@ -373,34 +374,8 @@ struct intel_engine_cs { */ struct ewma__engine_latency latency; - /* Rather than have every client wait upon all user interrupts, - * with the herd waking after every interrupt and each doing the - * heavyweight seqno dance, we delegate the task (of being the - * bottom-half of the user interrupt) to the first client. After - * every interrupt, we wake up one client, who does the heavyweight - * coherent seqno read and either goes back to sleep (if incomplete), - * or wakes up all the completed clients in parallel, before then - * transferring the bottom-half status to the next client in the queue. - * - * Compared to walking the entire list of waiters in a single dedicated - * bottom-half, we reduce the latency of the first waiter by avoiding - * a context switch, but incur additional coherent seqno reads when - * following the chain of request breadcrumbs. Since it is most likely - * that we have a single client waiting on each seqno, then reducing - * the overhead of waking that client is much preferred. - */ - struct intel_breadcrumbs { - spinlock_t irq_lock; - struct list_head signalers; - - struct list_head signaled_requests; - - struct irq_work irq_work; /* for use from inside irq_lock */ - - unsigned int irq_enabled; - - bool irq_armed; - } breadcrumbs; + /* Keep track of all the seqno used, a trail of breadcrumbs */ + struct intel_breadcrumbs *breadcrumbs; struct intel_engine_pmu { /** diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c index 99e28d9021e8..81c05f551b9c 100644 --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c @@ -78,8 +78,6 @@ int i915_ggtt_init_hw(struct drm_i915_private *i915) { int ret; - stash_init(&i915->mm.wc_stash); - /* * Note that we use page colouring to enforce a guard page at the * end of the address space. 
This is required as the CS may prefetch @@ -232,7 +230,7 @@ static void gen8_ggtt_insert_entries(struct i915_address_space *vm, /* Fill the allocated but "unused" space beyond the end of the buffer */ while (gte < end) - gen8_set_pte(gte++, vm->scratch[0].encode); + gen8_set_pte(gte++, vm->scratch[0]->encode); /* * We want to flush the TLBs only after we're certain all the PTE @@ -283,7 +281,7 @@ static void gen6_ggtt_insert_entries(struct i915_address_space *vm, /* Fill the allocated but "unused" space beyond the end of the buffer */ while (gte < end) - iowrite32(vm->scratch[0].encode, gte++); + iowrite32(vm->scratch[0]->encode, gte++); /* * We want to flush the TLBs only after we're certain all the PTE @@ -303,7 +301,7 @@ static void gen8_ggtt_clear_range(struct i915_address_space *vm, struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); unsigned int first_entry = start / I915_GTT_PAGE_SIZE; unsigned int num_entries = length / I915_GTT_PAGE_SIZE; - const gen8_pte_t scratch_pte = vm->scratch[0].encode; + const gen8_pte_t scratch_pte = vm->scratch[0]->encode; gen8_pte_t __iomem *gtt_base = (gen8_pte_t __iomem *)ggtt->gsm + first_entry; const int max_entries = ggtt_total_entries(ggtt) - first_entry; @@ -401,7 +399,7 @@ static void gen6_ggtt_clear_range(struct i915_address_space *vm, first_entry, num_entries, max_entries)) num_entries = max_entries; - scratch_pte = vm->scratch[0].encode; + scratch_pte = vm->scratch[0]->encode; for (i = 0; i < num_entries; i++) iowrite32(scratch_pte, &gtt_base[i]); } @@ -436,16 +434,17 @@ static void i915_ggtt_clear_range(struct i915_address_space *vm, intel_gtt_clear_range(start >> PAGE_SHIFT, length >> PAGE_SHIFT); } -static int ggtt_bind_vma(struct i915_address_space *vm, - struct i915_vma *vma, - enum i915_cache_level cache_level, - u32 flags) +static void ggtt_bind_vma(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags) { struct drm_i915_gem_object *obj = vma->obj; u32 pte_flags; if (i915_vma_is_bound(vma, ~flags & I915_VMA_BIND_MASK)) - return 0; + return; /* Applicable to VLV (gen8+ do not support RO in the GGTT) */ pte_flags = 0; @@ -454,8 +453,6 @@ static int ggtt_bind_vma(struct i915_address_space *vm, vm->insert_entries(vm, vma, cache_level, pte_flags); vma->page_sizes.gtt = I915_GTT_PAGE_SIZE; - - return 0; } static void ggtt_unbind_vma(struct i915_address_space *vm, struct i915_vma *vma) @@ -568,31 +565,25 @@ static int init_ggtt(struct i915_ggtt *ggtt) return ret; } -static int aliasing_gtt_bind_vma(struct i915_address_space *vm, - struct i915_vma *vma, - enum i915_cache_level cache_level, - u32 flags) +static void aliasing_gtt_bind_vma(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags) { u32 pte_flags; - int ret; /* Currently applicable only to VLV */ pte_flags = 0; if (i915_gem_object_is_readonly(vma->obj)) pte_flags |= PTE_READ_ONLY; - if (flags & I915_VMA_LOCAL_BIND) { - struct i915_ppgtt *alias = i915_vm_to_ggtt(vm)->alias; - - ret = ppgtt_bind_vma(&alias->vm, vma, cache_level, flags); - if (ret) - return ret; - } + if (flags & I915_VMA_LOCAL_BIND) + ppgtt_bind_vma(&i915_vm_to_ggtt(vm)->alias->vm, + stash, vma, cache_level, flags); if (flags & I915_VMA_GLOBAL_BIND) vm->insert_entries(vm, vma, cache_level, pte_flags); - - return 0; } static void aliasing_gtt_unbind_vma(struct i915_address_space *vm, @@ -607,6 +598,7 @@ static void aliasing_gtt_unbind_vma(struct
i915_address_space *vm, static int init_aliasing_ppgtt(struct i915_ggtt *ggtt) { + struct i915_vm_pt_stash stash = {}; struct i915_ppgtt *ppgtt; int err; @@ -619,15 +611,21 @@ static int init_aliasing_ppgtt(struct i915_ggtt *ggtt) goto err_ppgtt; } + err = i915_vm_alloc_pt_stash(&ppgtt->vm, &stash, ggtt->vm.total); + if (err) + goto err_ppgtt; + + err = i915_vm_pin_pt_stash(&ppgtt->vm, &stash); + if (err) + goto err_stash; + /* * Note we only pre-allocate as far as the end of the global * GTT. On 48b / 4-level page-tables, the difference is very, * very significant! We have to preallocate as GVT/vgpu does * not like the page directory disappearing. */ - err = ppgtt->vm.allocate_va_range(&ppgtt->vm, 0, ggtt->vm.total); - if (err) - goto err_ppgtt; + ppgtt->vm.allocate_va_range(&ppgtt->vm, &stash, 0, ggtt->vm.total); ggtt->alias = ppgtt; ggtt->vm.bind_async_flags |= ppgtt->vm.bind_async_flags; @@ -638,8 +636,11 @@ static int init_aliasing_ppgtt(struct i915_ggtt *ggtt) GEM_BUG_ON(ggtt->vm.vma_ops.unbind_vma != ggtt_unbind_vma); ggtt->vm.vma_ops.unbind_vma = aliasing_gtt_unbind_vma; + i915_vm_free_pt_stash(&ppgtt->vm, &stash); return 0; +err_stash: + i915_vm_free_pt_stash(&ppgtt->vm, &stash); err_ppgtt: i915_vm_put(&ppgtt->vm); return err; @@ -715,18 +716,11 @@ static void ggtt_cleanup_hw(struct i915_ggtt *ggtt) void i915_ggtt_driver_release(struct drm_i915_private *i915) { struct i915_ggtt *ggtt = &i915->ggtt; - struct pagevec *pvec; fini_aliasing_ppgtt(ggtt); intel_ggtt_fini_fences(ggtt); ggtt_cleanup_hw(ggtt); - - pvec = &i915->mm.wc_stash.pvec; - if (pvec->nr) { - set_pages_array_wb(pvec->pages, pvec->nr); - __pagevec_release(pvec); - } } static unsigned int gen6_get_total_gtt_size(u16 snb_gmch_ctl) @@ -789,7 +783,7 @@ static int ggtt_probe_common(struct i915_ggtt *ggtt, u64 size) return -ENOMEM; } - ret = setup_scratch_page(&ggtt->vm, GFP_DMA32); + ret = setup_scratch_page(&ggtt->vm); if (ret) { drm_err(&i915->drm, "Scratch setup failed\n"); /* iounmap will also get called at remove, but meh */ @@ -797,8 +791,8 @@ static int ggtt_probe_common(struct i915_ggtt *ggtt, u64 size) return ret; } - ggtt->vm.scratch[0].encode = - ggtt->vm.pte_encode(px_dma(&ggtt->vm.scratch[0]), + ggtt->vm.scratch[0]->encode = + ggtt->vm.pte_encode(px_dma(ggtt->vm.scratch[0]), I915_CACHE_NONE, 0); return 0; @@ -824,7 +818,7 @@ static void gen6_gmch_remove(struct i915_address_space *vm) struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); iounmap(ggtt->gsm); - cleanup_scratch_page(vm); + free_scratch(vm); } static struct resource pci_resource(struct pci_dev *pdev, int bar) @@ -852,6 +846,8 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt) else size = gen8_get_total_gtt_size(snb_gmch_ctl); + ggtt->vm.alloc_pt_dma = alloc_pt_dma; + ggtt->vm.total = (size / sizeof(gen8_pte_t)) * I915_GTT_PAGE_SIZE; ggtt->vm.cleanup = gen6_gmch_remove; ggtt->vm.insert_page = gen8_ggtt_insert_page; @@ -1000,6 +996,8 @@ static int gen6_gmch_probe(struct i915_ggtt *ggtt) size = gen6_get_total_gtt_size(snb_gmch_ctl); ggtt->vm.total = (size / sizeof(gen6_pte_t)) * I915_GTT_PAGE_SIZE; + ggtt->vm.alloc_pt_dma = alloc_pt_dma; + ggtt->vm.clear_range = nop_clear_range; if (!HAS_FULL_PPGTT(i915) || intel_scanout_needs_vtd_wa(i915)) ggtt->vm.clear_range = gen6_ggtt_clear_range; @@ -1050,6 +1048,8 @@ static int i915_gmch_probe(struct i915_ggtt *ggtt) ggtt->gmadr = (struct resource)DEFINE_RES_MEM(gmadr_base, ggtt->mappable_end); + ggtt->vm.alloc_pt_dma = alloc_pt_dma; + ggtt->do_idle_maps = needs_idle_maps(i915); ggtt->vm.insert_page = 
i915_ggtt_insert_page; ggtt->vm.insert_entries = i915_ggtt_insert_entries; @@ -1165,11 +1165,6 @@ void i915_ggtt_disable_guc(struct i915_ggtt *ggtt) ggtt->invalidate(ggtt); } -static unsigned int clear_bind(struct i915_vma *vma) -{ - return atomic_fetch_and(~I915_VMA_BIND_MASK, &vma->flags); -} - void i915_ggtt_resume(struct i915_ggtt *ggtt) { struct i915_vma *vma; @@ -1187,11 +1182,13 @@ void i915_ggtt_resume(struct i915_ggtt *ggtt) /* clflush objects bound into the GGTT and rebind them. */ list_for_each_entry(vma, &ggtt->vm.bound_list, vm_link) { struct drm_i915_gem_object *obj = vma->obj; - unsigned int was_bound = clear_bind(vma); + unsigned int was_bound = + atomic_read(&vma->flags) & I915_VMA_BIND_MASK; - WARN_ON(i915_vma_bind(vma, - obj ? obj->cache_level : 0, - was_bound, NULL)); + GEM_BUG_ON(!was_bound); + vma->ops->bind_vma(&ggtt->vm, NULL, vma, + obj ? obj->cache_level : 0, + was_bound); if (obj) { /* only used during resume => exclusive access */ flush |= fetch_and_zero(&obj->write_domain); obj->read_domains |= I915_GEM_DOMAIN_GTT; diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index e0755f1a904b..39b428c5049c 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -356,7 +356,7 @@ static int intel_gt_init_scratch(struct intel_gt *gt, unsigned int size) goto err_unref; } - ret = i915_ggtt_pin(vma, 0, PIN_HIGH); + ret = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH); if (ret) goto err_unref; @@ -406,21 +406,20 @@ static int __engines_record_defaults(struct intel_gt *gt) /* We must be able to switch to something! */ GEM_BUG_ON(!engine->kernel_context); - err = intel_renderstate_init(&so, engine); - if (err) - goto out; - ce = intel_context_create(engine); if (IS_ERR(ce)) { err = PTR_ERR(ce); goto out; } - rq = intel_context_create_request(ce); + err = intel_renderstate_init(&so, ce); + if (err) + goto err; + + rq = i915_request_create(ce); if (IS_ERR(rq)) { err = PTR_ERR(rq); - intel_context_put(ce); - goto out; + goto err_fini; } err = intel_engine_emit_ctx_wa(rq); @@ -434,9 +433,13 @@ static int __engines_record_defaults(struct intel_gt *gt) err_rq: requests[id] = i915_request_get(rq); i915_request_add(rq); - intel_renderstate_fini(&so); - if (err) +err_fini: + intel_renderstate_fini(&so, ce); +err: + if (err) { + intel_context_put(ce); goto out; + } } /* Flush the default context image to memory, and enable powersaving. 
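Editor's illustration (not part of the diff): init_aliasing_ppgtt() earlier in this patch shows the new two-phase page-table preallocation. Condensed into a hypothetical helper, the i915_vm_pt_stash lifecycle is: allocate the stash for the range, pin its backing store, hand it to allocate_va_range() (which can no longer fail), then return whatever was not consumed:

static int example_preallocate_range(struct i915_ppgtt *ppgtt,
				     u64 start, u64 length)
{
	struct i915_vm_pt_stash stash = {};
	int err;

	err = i915_vm_alloc_pt_stash(&ppgtt->vm, &stash, length);
	if (err)
		return err;

	err = i915_vm_pin_pt_stash(&ppgtt->vm, &stash);
	if (!err)
		ppgtt->vm.allocate_va_range(&ppgtt->vm, &stash, start, length);

	/* Unused page tables (or everything, on error) go back here. */
	i915_vm_free_pt_stash(&ppgtt->vm, &stash);
	return err;
}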
*/ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c index 418ae184cecf..4b7671ac5dca 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c @@ -35,39 +35,65 @@ static void node_free(struct intel_gt_buffer_pool_node *node) { i915_gem_object_put(node->obj); i915_active_fini(&node->active); - kfree(node); + kfree_rcu(node, rcu); +} + +static bool pool_free_older_than(struct intel_gt_buffer_pool *pool, long keep) +{ + struct intel_gt_buffer_pool_node *node, *stale = NULL; + bool active = false; + int n; + + /* Free buffers that have not been used in the past second */ + for (n = 0; n < ARRAY_SIZE(pool->cache_list); n++) { + struct list_head *list = &pool->cache_list[n]; + + if (list_empty(list)) + continue; + + if (spin_trylock_irq(&pool->lock)) { + struct list_head *pos; + + /* Most recent at head; oldest at tail */ + list_for_each_prev(pos, list) { + unsigned long age; + + node = list_entry(pos, typeof(*node), link); + + age = READ_ONCE(node->age); + if (!age || jiffies - age < keep) + break; + + /* Check we are the first to claim this node */ + if (!xchg(&node->age, 0)) + break; + + node->free = stale; + stale = node; + } + if (!list_is_last(pos, list)) + __list_del_many(pos, list); + + spin_unlock_irq(&pool->lock); + } + + active |= !list_empty(list); + } + + while ((node = stale)) { + stale = stale->free; + node_free(node); + } + + return active; } static void pool_free_work(struct work_struct *wrk) { struct intel_gt_buffer_pool *pool = container_of(wrk, typeof(*pool), work.work); - struct intel_gt_buffer_pool_node *node, *next; - unsigned long old = jiffies - HZ; - bool active = false; - LIST_HEAD(stale); - int n; - /* Free buffers that have not been used in the past second */ - spin_lock_irq(&pool->lock); - for (n = 0; n < ARRAY_SIZE(pool->cache_list); n++) { - struct list_head *list = &pool->cache_list[n]; - - /* Most recent at head; oldest at tail */ - list_for_each_entry_safe_reverse(node, next, list, link) { - if (time_before(node->age, old)) - break; - - list_move(&node->link, &stale); - } - active |= !list_empty(list); - } - spin_unlock_irq(&pool->lock); - - list_for_each_entry_safe(node, next, &stale, link) - node_free(node); - - if (active) + if (pool_free_older_than(pool, HZ)) schedule_delayed_work(&pool->work, round_jiffies_up_relative(HZ)); } @@ -109,8 +135,8 @@ static void pool_retire(struct i915_active *ref) i915_gem_object_make_purgeable(node->obj); spin_lock_irqsave(&pool->lock, flags); - node->age = jiffies; - list_add(&node->link, list); + list_add_rcu(&node->link, list); + WRITE_ONCE(node->age, jiffies ?: 1); /* 0 reserved for active nodes */ spin_unlock_irqrestore(&pool->lock, flags); schedule_delayed_work(&pool->work, @@ -151,20 +177,30 @@ intel_gt_get_buffer_pool(struct intel_gt *gt, size_t size) struct intel_gt_buffer_pool *pool = &gt->buffer_pool; struct intel_gt_buffer_pool_node *node; struct list_head *list; - unsigned long flags; int ret; size = PAGE_ALIGN(size); list = bucket_for_size(pool, size); - spin_lock_irqsave(&pool->lock, flags); - list_for_each_entry(node, list, link) { + rcu_read_lock(); + list_for_each_entry_rcu(node, list, link) { + unsigned long age; + if (node->obj->base.size < size) continue; - list_del(&node->link); - break; + + age = READ_ONCE(node->age); + if (!age) + continue; + + if (cmpxchg(&node->age, age, 0) == age) { + spin_lock_irq(&pool->lock); + list_del_rcu(&node->link); + spin_unlock_irq(&pool->lock); + break; + } } -
spin_unlock_irqrestore(&pool->lock, flags); + rcu_read_unlock(); if (&node->link == list) { node = node_create(pool, size); @@ -192,28 +228,13 @@ void intel_gt_init_buffer_pool(struct intel_gt *gt) INIT_DELAYED_WORK(&pool->work, pool_free_work); } -static void pool_free_imm(struct intel_gt_buffer_pool *pool) -{ - int n; - - spin_lock_irq(&pool->lock); - for (n = 0; n < ARRAY_SIZE(pool->cache_list); n++) { - struct intel_gt_buffer_pool_node *node, *next; - struct list_head *list = &pool->cache_list[n]; - - list_for_each_entry_safe(node, next, list, link) - node_free(node); - INIT_LIST_HEAD(list); - } - spin_unlock_irq(&pool->lock); -} - void intel_gt_flush_buffer_pool(struct intel_gt *gt) { struct intel_gt_buffer_pool *pool = &gt->buffer_pool; do { - pool_free_imm(pool); + while (pool_free_older_than(pool, 0)) + ; } while (cancel_delayed_work_sync(&pool->work)); } diff --git a/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool_types.h b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool_types.h index e28bdda771ed..bcf1658c9633 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool_types.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool_types.h @@ -25,7 +25,11 @@ struct intel_gt_buffer_pool_node { struct i915_active active; struct drm_i915_gem_object *obj; struct list_head link; - struct intel_gt_buffer_pool *pool; + union { + struct intel_gt_buffer_pool *pool; + struct intel_gt_buffer_pool_node *free; + struct rcu_head rcu; + }; unsigned long age; }; diff --git a/drivers/gpu/drm/i915/gt/intel_gt_irq.c b/drivers/gpu/drm/i915/gt/intel_gt_irq.c index b05da68e52f4..257063a57101 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_irq.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_irq.c @@ -8,6 +8,7 @@ #include "i915_drv.h" #include "i915_irq.h" +#include "intel_breadcrumbs.h" #include "intel_gt.h" #include "intel_gt_irq.h" #include "intel_uncore.h" diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c index 2a72cce63fd9..3f1114b58b01 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.c +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c @@ -11,160 +11,24 @@ #include "intel_gt.h" #include "intel_gtt.h" -void stash_init(struct pagestash *stash) +struct drm_i915_gem_object *alloc_pt_dma(struct i915_address_space *vm, int sz) { - pagevec_init(&stash->pvec); - spin_lock_init(&stash->lock); -} - -static struct page *stash_pop_page(struct pagestash *stash) -{ - struct page *page = NULL; - - spin_lock(&stash->lock); - if (likely(stash->pvec.nr)) - page = stash->pvec.pages[--stash->pvec.nr]; - spin_unlock(&stash->lock); - - return page; -} - -static void stash_push_pagevec(struct pagestash *stash, struct pagevec *pvec) -{ - unsigned int nr; - - spin_lock_nested(&stash->lock, SINGLE_DEPTH_NESTING); - - nr = min_t(typeof(nr), pvec->nr, pagevec_space(&stash->pvec)); - memcpy(stash->pvec.pages + stash->pvec.nr, - pvec->pages + pvec->nr - nr, - sizeof(pvec->pages[0]) * nr); - stash->pvec.nr += nr; - - spin_unlock(&stash->lock); - - pvec->nr -= nr; -} - -static struct page *vm_alloc_page(struct i915_address_space *vm, gfp_t gfp) -{ - struct pagevec stack; - struct page *page; - if (I915_SELFTEST_ONLY(should_fail(&vm->fault_attr, 1))) i915_gem_shrink_all(vm->i915); - page = stash_pop_page(&vm->free_pages); - if (page) - return page; - - if (!vm->pt_kmap_wc) - return alloc_page(gfp); - - /* Look in our global stash of WC pages... */ - page = stash_pop_page(&vm->i915->mm.wc_stash); - if (page) - return page; - - /* - * Otherwise batch allocate pages to amortize cost of set_pages_wc.
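Editor's illustration (not part of the diff): in the buffer-pool rework above, node->age doubles as an ownership flag. pool_retire() publishes a node with a non-zero age, and intel_gt_get_buffer_pool() (or the reaper in pool_free_older_than()) must win an atomic swap of the age to zero before unlinking it. As a standalone, hypothetical helper:

static bool example_try_claim(struct intel_gt_buffer_pool_node *node)
{
	unsigned long age = READ_ONCE(node->age);

	if (!age)	/* zero is reserved for nodes already claimed or in use */
		return false;

	/* Only the winner of the cmpxchg may take the node off its list. */
	return cmpxchg(&node->age, age, 0) == age;
}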
- * - * We have to be careful as page allocation may trigger the shrinker - * (via direct reclaim) which will fill up the WC stash underneath us. - * So we add our WB pages into a temporary pvec on the stack and merge - * them into the WC stash after all the allocations are complete. - */ - pagevec_init(&stack); - do { - struct page *page; - - page = alloc_page(gfp); - if (unlikely(!page)) - break; - - stack.pages[stack.nr++] = page; - } while (pagevec_space(&stack)); - - if (stack.nr && !set_pages_array_wc(stack.pages, stack.nr)) { - page = stack.pages[--stack.nr]; - - /* Merge spare WC pages to the global stash */ - if (stack.nr) - stash_push_pagevec(&vm->i915->mm.wc_stash, &stack); - - /* Push any surplus WC pages onto the local VM stash */ - if (stack.nr) - stash_push_pagevec(&vm->free_pages, &stack); - } - - /* Return unwanted leftovers */ - if (unlikely(stack.nr)) { - WARN_ON_ONCE(set_pages_array_wb(stack.pages, stack.nr)); - __pagevec_release(&stack); - } - - return page; + return i915_gem_object_create_internal(vm->i915, sz); } -static void vm_free_pages_release(struct i915_address_space *vm, - bool immediate) +int pin_pt_dma(struct i915_address_space *vm, struct drm_i915_gem_object *obj) { - struct pagevec *pvec = &vm->free_pages.pvec; - struct pagevec stack; + int err; - lockdep_assert_held(&vm->free_pages.lock); - GEM_BUG_ON(!pagevec_count(pvec)); + err = i915_gem_object_pin_pages(obj); + if (err) + return err; - if (vm->pt_kmap_wc) { - /* - * When we use WC, first fill up the global stash and then - * only if full immediately free the overflow. - */ - stash_push_pagevec(&vm->i915->mm.wc_stash, pvec); - - /* - * As we have made some room in the VM's free_pages, - * we can wait for it to fill again. Unless we are - * inside i915_address_space_fini() and must - * immediately release the pages! - */ - if (pvec->nr <= (immediate ? 0 : PAGEVEC_SIZE - 1)) - return; - - /* - * We have to drop the lock to allow ourselves to sleep, - * so take a copy of the pvec and clear the stash for - * others to use it as we sleep. - */ - stack = *pvec; - pagevec_reinit(pvec); - spin_unlock(&vm->free_pages.lock); - - pvec = &stack; - set_pages_array_wb(pvec->pages, pvec->nr); - - spin_lock(&vm->free_pages.lock); - } - - __pagevec_release(pvec); -} - -static void vm_free_page(struct i915_address_space *vm, struct page *page) -{ - /* - * On !llc, we need to change the pages back to WB. We only do so - * in bulk, so we rarely need to change the page attributes here, - * but doing so requires a stop_machine() from deep inside arch/x86/mm. - * To make detection of the possible sleep more likely, use an - * unconditional might_sleep() for everybody. 
- */ - might_sleep(); - spin_lock(&vm->free_pages.lock); - while (!pagevec_space(&vm->free_pages.pvec)) - vm_free_pages_release(vm, false); - GEM_BUG_ON(pagevec_count(&vm->free_pages.pvec) >= PAGEVEC_SIZE); - pagevec_add(&vm->free_pages.pvec, page); - spin_unlock(&vm->free_pages.lock); + i915_gem_object_make_unshrinkable(obj); + return 0; } void __i915_vm_close(struct i915_address_space *vm) @@ -194,14 +58,7 @@ void __i915_vm_close(struct i915_address_space *vm) void i915_address_space_fini(struct i915_address_space *vm) { - spin_lock(&vm->free_pages.lock); - if (pagevec_count(&vm->free_pages.pvec)) - vm_free_pages_release(vm, true); - GEM_BUG_ON(pagevec_count(&vm->free_pages.pvec)); - spin_unlock(&vm->free_pages.lock); - drm_mm_takedown(&vm->mm); - mutex_destroy(&vm->mutex); } @@ -246,8 +103,6 @@ void i915_address_space_init(struct i915_address_space *vm, int subclass) drm_mm_init(&vm->mm, 0, vm->total); vm->mm.head_node.color = I915_COLOR_UNEVICTABLE; - stash_init(&vm->free_pages); - INIT_LIST_HEAD(&vm->bound_list); } @@ -264,64 +119,50 @@ void clear_pages(struct i915_vma *vma) memset(&vma->page_sizes, 0, sizeof(vma->page_sizes)); } -static int __setup_page_dma(struct i915_address_space *vm, - struct i915_page_dma *p, - gfp_t gfp) +dma_addr_t __px_dma(struct drm_i915_gem_object *p) { - p->page = vm_alloc_page(vm, gfp | I915_GFP_ALLOW_FAIL); - if (unlikely(!p->page)) - return -ENOMEM; - - p->daddr = dma_map_page_attrs(vm->dma, - p->page, 0, PAGE_SIZE, - PCI_DMA_BIDIRECTIONAL, - DMA_ATTR_SKIP_CPU_SYNC | - DMA_ATTR_NO_WARN); - if (unlikely(dma_mapping_error(vm->dma, p->daddr))) { - vm_free_page(vm, p->page); - return -ENOMEM; - } - - return 0; + GEM_BUG_ON(!i915_gem_object_has_pages(p)); + return sg_dma_address(p->mm.pages->sgl); } -int setup_page_dma(struct i915_address_space *vm, struct i915_page_dma *p) +struct page *__px_page(struct drm_i915_gem_object *p) { - return __setup_page_dma(vm, p, __GFP_HIGHMEM); -} - -void cleanup_page_dma(struct i915_address_space *vm, struct i915_page_dma *p) -{ - dma_unmap_page(vm->dma, p->daddr, PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); - vm_free_page(vm, p->page); + GEM_BUG_ON(!i915_gem_object_has_pages(p)); + return sg_page(p->mm.pages->sgl); } void -fill_page_dma(const struct i915_page_dma *p, const u64 val, unsigned int count) +fill_page_dma(struct drm_i915_gem_object *p, const u64 val, unsigned int count) { - kunmap_atomic(memset64(kmap_atomic(p->page), val, count)); + struct page *page = __px_page(p); + void *vaddr; + + vaddr = kmap(page); + memset64(vaddr, val, count); + clflush_cache_range(vaddr, PAGE_SIZE); + kunmap(page); } -static void poison_scratch_page(struct page *page, unsigned long size) +static void poison_scratch_page(struct drm_i915_gem_object *scratch) { - if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) - return; + struct sgt_iter sgt; + struct page *page; + u8 val; - GEM_BUG_ON(!IS_ALIGNED(size, PAGE_SIZE)); + val = 0; + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) + val = POISON_FREE; - do { + for_each_sgt_page(page, sgt, scratch->mm.pages) { void *vaddr; vaddr = kmap(page); - memset(vaddr, POISON_FREE, PAGE_SIZE); + memset(vaddr, val, PAGE_SIZE); kunmap(page); - - page = pfn_to_page(page_to_pfn(page) + 1); - size -= PAGE_SIZE; - } while (size); + } } -int setup_scratch_page(struct i915_address_space *vm, gfp_t gfp) +int setup_scratch_page(struct i915_address_space *vm) { unsigned long size; @@ -338,21 +179,27 @@ int setup_scratch_page(struct i915_address_space *vm, gfp_t gfp) */ size = I915_GTT_PAGE_SIZE_4K; if (i915_vm_is_4lvl(vm) && - 
HAS_PAGE_SIZES(vm->i915, I915_GTT_PAGE_SIZE_64K)) { + HAS_PAGE_SIZES(vm->i915, I915_GTT_PAGE_SIZE_64K)) size = I915_GTT_PAGE_SIZE_64K; - gfp |= __GFP_NOWARN; - } - gfp |= __GFP_ZERO | __GFP_RETRY_MAYFAIL; do { - unsigned int order = get_order(size); - struct page *page; - dma_addr_t addr; + struct drm_i915_gem_object *obj; - page = alloc_pages(gfp, order); - if (unlikely(!page)) + obj = vm->alloc_pt_dma(vm, size); + if (IS_ERR(obj)) goto skip; + if (pin_pt_dma(vm, obj)) + goto skip_obj; + + /* We need a single contiguous page for our scratch */ + if (obj->mm.page_sizes.sg < size) + goto skip_obj; + + /* And it needs to be correspondingly aligned */ + if (__px_dma(obj) & (size - 1)) + goto skip_obj; + /* * Use a non-zero scratch page for debugging. * @@ -362,61 +209,28 @@ int setup_scratch_page(struct i915_address_space *vm, gfp_t gfp) * should it ever be accidentally used, the effect should be * fairly benign. */ - poison_scratch_page(page, size); + poison_scratch_page(obj); - addr = dma_map_page_attrs(vm->dma, - page, 0, size, - PCI_DMA_BIDIRECTIONAL, - DMA_ATTR_SKIP_CPU_SYNC | - DMA_ATTR_NO_WARN); - if (unlikely(dma_mapping_error(vm->dma, addr))) - goto free_page; - - if (unlikely(!IS_ALIGNED(addr, size))) - goto unmap_page; - - vm->scratch[0].base.page = page; - vm->scratch[0].base.daddr = addr; - vm->scratch_order = order; + vm->scratch[0] = obj; + vm->scratch_order = get_order(size); return 0; -unmap_page: - dma_unmap_page(vm->dma, addr, size, PCI_DMA_BIDIRECTIONAL); -free_page: - __free_pages(page, order); +skip_obj: + i915_gem_object_put(obj); skip: if (size == I915_GTT_PAGE_SIZE_4K) return -ENOMEM; size = I915_GTT_PAGE_SIZE_4K; - gfp &= ~__GFP_NOWARN; } while (1); } -void cleanup_scratch_page(struct i915_address_space *vm) -{ - struct i915_page_dma *p = px_base(&vm->scratch[0]); - unsigned int order = vm->scratch_order; - - dma_unmap_page(vm->dma, p->daddr, BIT(order) << PAGE_SHIFT, - PCI_DMA_BIDIRECTIONAL); - __free_pages(p->page, order); -} - void free_scratch(struct i915_address_space *vm) { int i; - if (!px_dma(&vm->scratch[0])) /* set to 0 on clones */ - return; - - for (i = 1; i <= vm->top; i++) { - if (!px_dma(&vm->scratch[i])) - break; - cleanup_page_dma(vm, px_base(&vm->scratch[i])); - } - - cleanup_scratch_page(vm); + for (i = 0; i <= vm->top; i++) + i915_gem_object_put(vm->scratch[i]); } void gtt_write_workarounds(struct intel_gt *gt) diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h index f2b75078e05f..c13c650ced22 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.h +++ b/drivers/gpu/drm/i915/gt/intel_gtt.h @@ -134,38 +134,29 @@ typedef u64 gen8_pte_t; #define GEN8_PDE_IPS_64K BIT(11) #define GEN8_PDE_PS_2M BIT(7) +enum i915_cache_level; + +struct drm_i915_file_private; +struct drm_i915_gem_object; struct i915_fence_reg; +struct i915_vma; +struct intel_gt; #define for_each_sgt_daddr(__dp, __iter, __sgt) \ __for_each_sgt_daddr(__dp, __iter, __sgt, I915_GTT_PAGE_SIZE) -struct i915_page_dma { - struct page *page; - union { - dma_addr_t daddr; - - /* - * For gen6/gen7 only. 
This is the offset in the GGTT - * where the page directory entries for PPGTT begin - */ - u32 ggtt_offset; - }; -}; - -struct i915_page_scratch { - struct i915_page_dma base; - u64 encode; -}; - struct i915_page_table { - struct i915_page_dma base; - atomic_t used; + struct drm_i915_gem_object *base; + union { + atomic_t used; + struct i915_page_table *stash; + }; }; struct i915_page_directory { struct i915_page_table pt; spinlock_t lock; - void *entry[512]; + void **entry; }; #define __px_choose_expr(x, type, expr, other) \ @@ -176,12 +167,14 @@ struct i915_page_directory { other) #define px_base(px) \ - __px_choose_expr(px, struct i915_page_dma *, __x, \ - __px_choose_expr(px, struct i915_page_scratch *, &__x->base, \ - __px_choose_expr(px, struct i915_page_table *, &__x->base, \ - __px_choose_expr(px, struct i915_page_directory *, &__x->pt.base, \ - (void)0)))) -#define px_dma(px) (px_base(px)->daddr) + __px_choose_expr(px, struct drm_i915_gem_object *, __x, \ + __px_choose_expr(px, struct i915_page_table *, __x->base, \ + __px_choose_expr(px, struct i915_page_directory *, __x->pt.base, \ + (void)0))) + +struct page *__px_page(struct drm_i915_gem_object *p); +dma_addr_t __px_dma(struct drm_i915_gem_object *p); +#define px_dma(px) (__px_dma(px_base(px))) #define px_pt(px) \ __px_choose_expr(px, struct i915_page_table *, __x, \ @@ -189,19 +182,18 @@ struct i915_page_directory { (void)0)) #define px_used(px) (&px_pt(px)->used) -enum i915_cache_level; - -struct drm_i915_file_private; -struct drm_i915_gem_object; -struct i915_vma; -struct intel_gt; +struct i915_vm_pt_stash { + /* preallocated chains of page tables/directories */ + struct i915_page_table *pt[2]; +}; struct i915_vma_ops { /* Map an object into an address space with the given cache flags. */ - int (*bind_vma)(struct i915_address_space *vm, - struct i915_vma *vma, - enum i915_cache_level cache_level, - u32 flags); + void (*bind_vma)(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags); /* * Unmap an object from an address space. This usually consists of * setting the valid PTE entries to a reserved scratch page. @@ -213,13 +205,6 @@ struct i915_vma_ops { void (*clear_pages)(struct i915_vma *vma); }; -struct pagestash { - spinlock_t lock; - struct pagevec pvec; -}; - -void stash_init(struct pagestash *stash); - struct i915_address_space { struct kref ref; struct rcu_work rcu; @@ -256,33 +241,33 @@ struct i915_address_space { #define VM_CLASS_GGTT 0 #define VM_CLASS_PPGTT 1 - struct i915_page_scratch scratch[4]; - unsigned int scratch_order; - unsigned int top; - + struct drm_i915_gem_object *scratch[4]; /** * List of vma currently bound. 
*/ struct list_head bound_list; - struct pagestash free_pages; - /* Global GTT */ bool is_ggtt:1; - /* Some systems require uncached updates of the page directories */ - bool pt_kmap_wc:1; - /* Some systems support read-only mappings for GGTT and/or PPGTT */ bool has_read_only:1; + u8 top; + u8 pd_shift; + u8 scratch_order; + + struct drm_i915_gem_object * + (*alloc_pt_dma)(struct i915_address_space *vm, int sz); + u64 (*pte_encode)(dma_addr_t addr, enum i915_cache_level level, u32 flags); /* Create a valid PTE */ #define PTE_READ_ONLY BIT(0) - int (*allocate_va_range)(struct i915_address_space *vm, - u64 start, u64 length); + void (*allocate_va_range)(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + u64 start, u64 length); void (*clear_range)(struct i915_address_space *vm, u64 start, u64 length); void (*insert_page)(struct i915_address_space *vm, @@ -490,9 +475,9 @@ i915_pd_entry(const struct i915_page_directory * const pdp, static inline dma_addr_t i915_page_dir_dma_addr(const struct i915_ppgtt *ppgtt, const unsigned int n) { - struct i915_page_dma *pt = ppgtt->pd->entry[n]; + struct i915_page_table *pt = ppgtt->pd->entry[n]; - return px_dma(pt ?: px_base(&ppgtt->vm.scratch[ppgtt->vm.top])); + return __px_dma(pt ? px_base(pt) : ppgtt->vm.scratch[ppgtt->vm.top]); } void ppgtt_init(struct i915_ppgtt *ppgtt, struct intel_gt *gt); @@ -517,13 +502,10 @@ struct i915_ppgtt *i915_ppgtt_create(struct intel_gt *gt); void i915_ggtt_suspend(struct i915_ggtt *gtt); void i915_ggtt_resume(struct i915_ggtt *ggtt); -int setup_page_dma(struct i915_address_space *vm, struct i915_page_dma *p); -void cleanup_page_dma(struct i915_address_space *vm, struct i915_page_dma *p); - -#define kmap_atomic_px(px) kmap_atomic(px_base(px)->page) +#define kmap_atomic_px(px) kmap_atomic(__px_page(px_base(px))) void -fill_page_dma(const struct i915_page_dma *p, const u64 val, unsigned int count); +fill_page_dma(struct drm_i915_gem_object *p, const u64 val, unsigned int count); #define fill_px(px, v) fill_page_dma(px_base(px), (v), PAGE_SIZE / sizeof(u64)) #define fill32_px(px, v) do { \ @@ -531,47 +513,51 @@ fill_page_dma(const struct i915_page_dma *p, const u64 val, unsigned int count); fill_px((px), v__ << 32 | v__); \ } while (0) -int setup_scratch_page(struct i915_address_space *vm, gfp_t gfp); -void cleanup_scratch_page(struct i915_address_space *vm); +int setup_scratch_page(struct i915_address_space *vm); void free_scratch(struct i915_address_space *vm); +struct drm_i915_gem_object *alloc_pt_dma(struct i915_address_space *vm, int sz); struct i915_page_table *alloc_pt(struct i915_address_space *vm); struct i915_page_directory *alloc_pd(struct i915_address_space *vm); -struct i915_page_directory *__alloc_pd(size_t sz); +struct i915_page_directory *__alloc_pd(int npde); -void free_pd(struct i915_address_space *vm, struct i915_page_dma *pd); +int pin_pt_dma(struct i915_address_space *vm, struct drm_i915_gem_object *obj); -#define free_px(vm, px) free_pd(vm, px_base(px)) +void free_px(struct i915_address_space *vm, + struct i915_page_table *pt, int lvl); +#define free_pt(vm, px) free_px(vm, px, 0) +#define free_pd(vm, px) free_px(vm, px_pt(px), 1) void __set_pd_entry(struct i915_page_directory * const pd, const unsigned short idx, - struct i915_page_dma * const to, + struct i915_page_table *pt, u64 (*encode)(const dma_addr_t, const enum i915_cache_level)); #define set_pd_entry(pd, idx, to) \ - __set_pd_entry((pd), (idx), px_base(to), gen8_pde_encode) + __set_pd_entry((pd), (idx), px_pt(to), 
gen8_pde_encode) void clear_pd_entry(struct i915_page_directory * const pd, const unsigned short idx, - const struct i915_page_scratch * const scratch); + const struct drm_i915_gem_object * const scratch); bool release_pd_entry(struct i915_page_directory * const pd, const unsigned short idx, struct i915_page_table * const pt, - const struct i915_page_scratch * const scratch); + const struct drm_i915_gem_object * const scratch); void gen6_ggtt_invalidate(struct i915_ggtt *ggtt); int ggtt_set_pages(struct i915_vma *vma); int ppgtt_set_pages(struct i915_vma *vma); void clear_pages(struct i915_vma *vma); -int ppgtt_bind_vma(struct i915_address_space *vm, - struct i915_vma *vma, - enum i915_cache_level cache_level, - u32 flags); +void ppgtt_bind_vma(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags); void ppgtt_unbind_vma(struct i915_address_space *vm, struct i915_vma *vma); @@ -579,6 +565,14 @@ void gtt_write_workarounds(struct intel_gt *gt); void setup_private_pat(struct intel_uncore *uncore); +int i915_vm_alloc_pt_stash(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + u64 size); +int i915_vm_pin_pt_stash(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash); +void i915_vm_free_pt_stash(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash); + static inline struct sgt_dma { struct scatterlist *sg; dma_addr_t dma, max; diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index d03da2f64a49..0412a44f25f2 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -137,6 +137,7 @@ #include "i915_perf.h" #include "i915_trace.h" #include "i915_vgpu.h" +#include "intel_breadcrumbs.h" #include "intel_context.h" #include "intel_engine_pm.h" #include "intel_gt.h" @@ -1148,20 +1149,6 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine) } else { struct intel_engine_cs *owner = rq->context->engine; - /* - * Decouple the virtual breadcrumb before moving it - * back to the virtual engine -- we don't want the - * request to complete in the background and try - * and cancel the breadcrumb on the virtual engine - * (instead of the old engine where it is linked)! - */ - if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, - &rq->fence.flags)) { - spin_lock_nested(&rq->lock, - SINGLE_DEPTH_NESTING); - i915_request_cancel_breadcrumb(rq); - spin_unlock(&rq->lock); - } WRITE_ONCE(rq->engine, owner); owner->submit_request(rq); active = NULL; @@ -1819,16 +1806,31 @@ static bool virtual_matches(const struct virtual_engine *ve, return true; } -static void virtual_xfer_breadcrumbs(struct virtual_engine *ve) +static void virtual_xfer_context(struct virtual_engine *ve, + struct intel_engine_cs *engine) { + unsigned int n; + + if (likely(engine == ve->siblings[0])) + return; + + GEM_BUG_ON(READ_ONCE(ve->context.inflight)); + if (!intel_engine_has_relative_mmio(engine)) + virtual_update_register_offsets(ve->context.lrc_reg_state, + engine); + /* - * All the outstanding signals on ve->siblings[0] must have - * been completed, just pending the interrupt handler. As those - * signals still refer to the old sibling (via rq->engine), we must - * transfer those to the old irq_worker to keep our locking - * consistent. + * Move the bound engine to the top of the list for + * future execution. We then kick this tasklet first + * before checking others, so that we preferentially + * reuse this set of bound registers. 
*/ - intel_engine_transfer_stale_breadcrumbs(ve->siblings[0], &ve->context); + for (n = 1; n < ve->num_siblings; n++) { + if (ve->siblings[n] == engine) { + swap(ve->siblings[n], ve->siblings[0]); + break; + } + } } #define for_each_waiter(p__, rq__) \ @@ -2060,6 +2062,14 @@ static inline void clear_ports(struct i915_request **ports, int count) memset_p((void **)ports, NULL, count); } +static inline void +copy_ports(struct i915_request **dst, struct i915_request **src, int count) +{ + /* A memcpy_p() would be very useful here! */ + while (count--) + WRITE_ONCE(*dst++, *src++); /* avoid write tearing */ +} + static void execlists_dequeue(struct intel_engine_cs *engine) { struct intel_engine_execlists * const execlists = &engine->execlists; @@ -2271,38 +2281,23 @@ static void execlists_dequeue(struct intel_engine_cs *engine) GEM_BUG_ON(!(rq->execution_mask & engine->mask)); WRITE_ONCE(rq->engine, engine); - if (engine != ve->siblings[0]) { - u32 *regs = ve->context.lrc_reg_state; - unsigned int n; - - GEM_BUG_ON(READ_ONCE(ve->context.inflight)); - - if (!intel_engine_has_relative_mmio(engine)) - virtual_update_register_offsets(regs, - engine); - - if (!list_empty(&ve->context.signals)) - virtual_xfer_breadcrumbs(ve); - - /* - * Move the bound engine to the top of the list - * for future execution. We then kick this - * tasklet first before checking others, so that - * we preferentially reuse this set of bound - * registers. - */ - for (n = 1; n < ve->num_siblings; n++) { - if (ve->siblings[n] == engine) { - swap(ve->siblings[n], - ve->siblings[0]); - break; - } - } - - GEM_BUG_ON(ve->siblings[0] != engine); - } - if (__i915_request_submit(rq)) { + /* + * Only after we confirm that we will submit + * this request (i.e. it has not already + * completed), do we want to update the context. + * + * This serves two purposes. It avoids + * unnecessary work if we are resubmitting an + * already completed request after timeslicing. + * But more importantly, it prevents us altering + * ve->siblings[] on an idle context, where + * we may be using ve->siblings[] in + * virtual_context_enter / virtual_context_exit. 
+ */ + virtual_xfer_context(ve, engine); + GEM_BUG_ON(ve->siblings[0] != engine); + submit = true; last = rq; } @@ -2648,10 +2643,9 @@ static void process_csb(struct intel_engine_cs *engine) /* switch pending to inflight */ GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); - memcpy(execlists->inflight, - execlists->pending, - execlists_num_ports(execlists) * - sizeof(*execlists->pending)); + copy_ports(execlists->inflight, + execlists->pending, + execlists_num_ports(execlists)); smp_wmb(); /* complete the seqlock */ WRITE_ONCE(execlists->active, execlists->inflight); @@ -3309,7 +3303,10 @@ static void execlists_context_unpin(struct intel_context *ce) { check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET, ce->engine); +} +static void execlists_context_post_unpin(struct intel_context *ce) +{ i915_gem_object_unpin_map(ce->state->obj); } @@ -3471,20 +3468,24 @@ __execlists_update_reg_state(const struct intel_context *ce, } static int -__execlists_context_pin(struct intel_context *ce, - struct intel_engine_cs *engine) +execlists_context_pre_pin(struct intel_context *ce, + struct i915_gem_ww_ctx *ww, void **vaddr) { - void *vaddr; - GEM_BUG_ON(!ce->state); GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); - vaddr = i915_gem_object_pin_map(ce->state->obj, - i915_coherent_map_type(engine->i915) | + *vaddr = i915_gem_object_pin_map(ce->state->obj, + i915_coherent_map_type(ce->engine->i915) | I915_MAP_OVERRIDE); - if (IS_ERR(vaddr)) - return PTR_ERR(vaddr); + return PTR_ERR_OR_ZERO(*vaddr); +} + +static int +__execlists_context_pin(struct intel_context *ce, + struct intel_engine_cs *engine, + void *vaddr) +{ ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET; __execlists_update_reg_state(ce, engine, ce->ring->tail); @@ -3492,9 +3493,9 @@ __execlists_context_pin(struct intel_context *ce, return 0; } -static int execlists_context_pin(struct intel_context *ce) +static int execlists_context_pin(struct intel_context *ce, void *vaddr) { - return __execlists_context_pin(ce, ce->engine); + return __execlists_context_pin(ce, ce->engine, vaddr); } static int execlists_context_alloc(struct intel_context *ce) @@ -3520,8 +3521,10 @@ static void execlists_context_reset(struct intel_context *ce) static const struct intel_context_ops execlists_context_ops = { .alloc = execlists_context_alloc, + .pre_pin = execlists_context_pre_pin, .pin = execlists_context_pin, .unpin = execlists_context_unpin, + .post_unpin = execlists_context_post_unpin, .enter = intel_context_enter_engine, .exit = intel_context_exit_engine, @@ -3885,7 +3888,7 @@ static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) goto err; } - err = i915_ggtt_pin(vma, 0, PIN_HIGH); + err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH); if (err) goto err; @@ -4126,7 +4129,7 @@ static int execlists_resume(struct intel_engine_cs *engine) { intel_mocs_init_engine(engine); - intel_engine_reset_breadcrumbs(engine); + intel_breadcrumbs_reset(engine->breadcrumbs); if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { struct drm_printer p = drm_debug_printer(__func__); @@ -4757,14 +4760,21 @@ static int gen12_emit_flush(struct i915_request *request, u32 mode) intel_engine_mask_t aux_inv = 0; u32 cmd, *cs; + cmd = 4; + if (mode & EMIT_INVALIDATE) + cmd += 2; if (mode & EMIT_INVALIDATE) aux_inv = request->engine->mask & ~BIT(BCS0); + if (aux_inv) + cmd += 2 * hweight8(aux_inv) + 2; - cs = intel_ring_begin(request, - 4 + (aux_inv ? 
2 * hweight8(aux_inv) + 2 : 0)); + cs = intel_ring_begin(request, cmd); if (IS_ERR(cs)) return PTR_ERR(cs); + if (mode & EMIT_INVALIDATE) + *cs++ = preparser_disable(true); + cmd = MI_FLUSH_DW + 1; /* We always require a command barrier so that subsequent @@ -4797,6 +4807,10 @@ static int gen12_emit_flush(struct i915_request *request, u32 mode) } *cs++ = MI_NOOP; } + + if (mode & EMIT_INVALIDATE) + *cs++ = preparser_disable(false); + intel_ring_advance(request, cs); return 0; @@ -5295,6 +5309,14 @@ populate_lr_context(struct intel_context *ce, return 0; } +static struct intel_timeline *pinned_timeline(struct intel_context *ce) +{ + struct intel_timeline *tl = fetch_and_zero(&ce->timeline); + + return intel_timeline_create_from_engine(ce->engine, + page_unmask_bits(tl)); +} + static int __execlists_context_alloc(struct intel_context *ce, struct intel_engine_cs *engine) { @@ -5325,19 +5347,17 @@ static int __execlists_context_alloc(struct intel_context *ce, goto error_deref_obj; } - if (!ce->timeline) { + if (!page_mask_bits(ce->timeline)) { struct intel_timeline *tl; - struct i915_vma *hwsp; /* * Use the static global HWSP for the kernel context, and * a dynamically allocated cacheline for everyone else. */ - hwsp = NULL; - if (unlikely(intel_context_is_barrier(ce))) - hwsp = engine->status_page.vma; - - tl = intel_timeline_create(engine->gt, hwsp); + if (unlikely(ce->timeline)) + tl = pinned_timeline(ce); + else + tl = intel_timeline_create(engine->gt); if (IS_ERR(tl)) { ret = PTR_ERR(tl); goto error_deref_obj; @@ -5443,12 +5463,12 @@ static int virtual_context_alloc(struct intel_context *ce) return __execlists_context_alloc(ce, ve->siblings[0]); } -static int virtual_context_pin(struct intel_context *ce) +static int virtual_context_pin(struct intel_context *ce, void *vaddr) { struct virtual_engine *ve = container_of(ce, typeof(*ve), context); /* Note: we must use a real engine class for setting up reg state */ - return __execlists_context_pin(ce, ve->siblings[0]); + return __execlists_context_pin(ce, ve->siblings[0], vaddr); } static void virtual_context_enter(struct intel_context *ce) @@ -5476,8 +5496,10 @@ static void virtual_context_exit(struct intel_context *ce) static const struct intel_context_ops virtual_context_ops = { .alloc = virtual_context_alloc, + .pre_pin = execlists_context_pre_pin, .pin = virtual_context_pin, .unpin = execlists_context_unpin, + .post_unpin = execlists_context_post_unpin, .enter = virtual_context_enter, .exit = virtual_context_exit, @@ -5711,9 +5733,7 @@ intel_execlists_create_virtual(struct intel_engine_cs **siblings, snprintf(ve->base.name, sizeof(ve->base.name), "virtual"); intel_engine_init_active(&ve->base, ENGINE_VIRTUAL); - intel_engine_init_breadcrumbs(&ve->base); intel_engine_init_execlists(&ve->base); - ve->base.breadcrumbs.irq_armed = true; /* fake HW, used for irq_work */ ve->base.cops = &virtual_context_ops; ve->base.request_alloc = execlists_request_alloc; @@ -5730,6 +5750,12 @@ intel_execlists_create_virtual(struct intel_engine_cs **siblings, intel_context_init(&ve->context, &ve->base); + ve->base.breadcrumbs = intel_breadcrumbs_create(NULL); + if (!ve->base.breadcrumbs) { + err = -ENOMEM; + goto err_put; + } + for (n = 0; n < count; n++) { struct intel_engine_cs *sibling = siblings[n]; diff --git a/drivers/gpu/drm/i915/gt/intel_ppgtt.c b/drivers/gpu/drm/i915/gt/intel_ppgtt.c index f0862e924d11..46d9aceda64c 100644 --- a/drivers/gpu/drm/i915/gt/intel_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/intel_ppgtt.c @@ -18,7 +18,8 @@ struct i915_page_table 
*alloc_pt(struct i915_address_space *vm) if (unlikely(!pt)) return ERR_PTR(-ENOMEM); - if (unlikely(setup_page_dma(vm, &pt->base))) { + pt->base = vm->alloc_pt_dma(vm, I915_GTT_PAGE_SIZE_4K); + if (IS_ERR(pt->base)) { kfree(pt); return ERR_PTR(-ENOMEM); } @@ -27,14 +28,20 @@ struct i915_page_table *alloc_pt(struct i915_address_space *vm) return pt; } -struct i915_page_directory *__alloc_pd(size_t sz) +struct i915_page_directory *__alloc_pd(int count) { struct i915_page_directory *pd; - pd = kzalloc(sz, I915_GFP_ALLOW_FAIL); + pd = kzalloc(sizeof(*pd), I915_GFP_ALLOW_FAIL); if (unlikely(!pd)) return NULL; + pd->entry = kcalloc(count, sizeof(*pd->entry), I915_GFP_ALLOW_FAIL); + if (unlikely(!pd->entry)) { + kfree(pd); + return NULL; + } + spin_lock_init(&pd->lock); return pd; } @@ -43,11 +50,13 @@ struct i915_page_directory *alloc_pd(struct i915_address_space *vm) { struct i915_page_directory *pd; - pd = __alloc_pd(sizeof(*pd)); + pd = __alloc_pd(I915_PDES); if (unlikely(!pd)) return ERR_PTR(-ENOMEM); - if (unlikely(setup_page_dma(vm, px_base(pd)))) { + pd->pt.base = vm->alloc_pt_dma(vm, I915_GTT_PAGE_SIZE_4K); + if (IS_ERR(pd->pt.base)) { + kfree(pd->entry); kfree(pd); return ERR_PTR(-ENOMEM); } @@ -55,41 +64,52 @@ struct i915_page_directory *alloc_pd(struct i915_address_space *vm) return pd; } -void free_pd(struct i915_address_space *vm, struct i915_page_dma *pd) +void free_px(struct i915_address_space *vm, struct i915_page_table *pt, int lvl) { - cleanup_page_dma(vm, pd); - kfree(pd); + BUILD_BUG_ON(offsetof(struct i915_page_directory, pt)); + + if (lvl) { + struct i915_page_directory *pd = + container_of(pt, typeof(*pd), pt); + kfree(pd->entry); + } + + if (pt->base) + i915_gem_object_put(pt->base); + + kfree(pt); } static inline void -write_dma_entry(struct i915_page_dma * const pdma, +write_dma_entry(struct drm_i915_gem_object * const pdma, const unsigned short idx, const u64 encoded_entry) { - u64 * const vaddr = kmap_atomic(pdma->page); + u64 * const vaddr = kmap_atomic(__px_page(pdma)); vaddr[idx] = encoded_entry; + clflush_cache_range(&vaddr[idx], sizeof(u64)); kunmap_atomic(vaddr); } void __set_pd_entry(struct i915_page_directory * const pd, const unsigned short idx, - struct i915_page_dma * const to, + struct i915_page_table * const to, u64 (*encode)(const dma_addr_t, const enum i915_cache_level)) { /* Each thread pre-pins the pd, and we may have a thread per pde. 
*/ - GEM_BUG_ON(atomic_read(px_used(pd)) > NALLOC * ARRAY_SIZE(pd->entry)); + GEM_BUG_ON(atomic_read(px_used(pd)) > NALLOC * I915_PDES); atomic_inc(px_used(pd)); pd->entry[idx] = to; - write_dma_entry(px_base(pd), idx, encode(to->daddr, I915_CACHE_LLC)); + write_dma_entry(px_base(pd), idx, encode(px_dma(to), I915_CACHE_LLC)); } void clear_pd_entry(struct i915_page_directory * const pd, const unsigned short idx, - const struct i915_page_scratch * const scratch) + const struct drm_i915_gem_object * const scratch) { GEM_BUG_ON(atomic_read(px_used(pd)) == 0); @@ -102,7 +122,7 @@ bool release_pd_entry(struct i915_page_directory * const pd, const unsigned short idx, struct i915_page_table * const pt, - const struct i915_page_scratch * const scratch) + const struct drm_i915_gem_object * const scratch) { bool free = false; @@ -155,19 +175,16 @@ struct i915_ppgtt *i915_ppgtt_create(struct intel_gt *gt) return ppgtt; } -int ppgtt_bind_vma(struct i915_address_space *vm, - struct i915_vma *vma, - enum i915_cache_level cache_level, - u32 flags) +void ppgtt_bind_vma(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags) { u32 pte_flags; - int err; if (!test_bit(I915_VMA_ALLOC_BIT, __i915_vma_flags(vma))) { - err = vm->allocate_va_range(vm, vma->node.start, vma->size); - if (err) - return err; - + vm->allocate_va_range(vm, stash, vma->node.start, vma->size); set_bit(I915_VMA_ALLOC_BIT, __i915_vma_flags(vma)); } @@ -178,8 +195,6 @@ int ppgtt_bind_vma(struct i915_address_space *vm, vm->insert_entries(vm, vma, cache_level, pte_flags); wmb(); - - return 0; } void ppgtt_unbind_vma(struct i915_address_space *vm, struct i915_vma *vma) @@ -188,12 +203,93 @@ void ppgtt_unbind_vma(struct i915_address_space *vm, struct i915_vma *vma) vm->clear_range(vm, vma->node.start, vma->size); } +static unsigned long pd_count(u64 size, int shift) +{ + /* Beware later misalignment */ + return (size + 2 * (BIT_ULL(shift) - 1)) >> shift; +} + +int i915_vm_alloc_pt_stash(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + u64 size) +{ + unsigned long count; + int shift, n; + + shift = vm->pd_shift; + if (!shift) + return 0; + + count = pd_count(size, shift); + while (count--) { + struct i915_page_table *pt; + + pt = alloc_pt(vm); + if (IS_ERR(pt)) { + i915_vm_free_pt_stash(vm, stash); + return PTR_ERR(pt); + } + + pt->stash = stash->pt[0]; + stash->pt[0] = pt; + } + + for (n = 1; n < vm->top; n++) { + shift += ilog2(I915_PDES); /* Each PD holds 512 entries */ + count = pd_count(size, shift); + while (count--) { + struct i915_page_directory *pd; + + pd = alloc_pd(vm); + if (IS_ERR(pd)) { + i915_vm_free_pt_stash(vm, stash); + return PTR_ERR(pd); + } + + pd->pt.stash = stash->pt[1]; + stash->pt[1] = &pd->pt; + } + } + + return 0; +} + +int i915_vm_pin_pt_stash(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash) +{ + struct i915_page_table *pt; + int n, err; + + for (n = 0; n < ARRAY_SIZE(stash->pt); n++) { + for (pt = stash->pt[n]; pt; pt = pt->stash) { + err = pin_pt_dma(vm, pt->base); + if (err) + return err; + } + } + + return 0; +} + +void i915_vm_free_pt_stash(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash) +{ + struct i915_page_table *pt; + int n; + + for (n = 0; n < ARRAY_SIZE(stash->pt); n++) { + while ((pt = stash->pt[n])) { + stash->pt[n] = pt->stash; + free_px(vm, pt, n); + } + } +} + int ppgtt_set_pages(struct i915_vma *vma) { GEM_BUG_ON(vma->pages); vma->pages = vma->obj->mm.pages; - 
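The stash helpers above split page-table allocation into two phases: i915_vm_alloc_pt_stash() preallocates worst-case chains of tables and directories while failure is still allowed, and allocate_va_range() later only pops entries off those chains, which is what lets it become a void function running under the ww transaction. pd_count() rounds up for a range misaligned at both ends; for example a 2 MiB + 8 KiB range placed 4 KiB before a 2 MiB boundary touches three PDEs, and (size + 2 * (BIT_ULL(21) - 1)) >> 21 likewise yields 3. Below is a reduced sketch of the chain handling only; pt, pt_stash, stash_prealloc() and stash_pop() are hypothetical stand-ins for the real structures.

#include <linux/slab.h>

struct pt {
	struct pt *stash;	/* single-linked chain while parked in the stash */
	/* ... page-table payload ... */
};

struct pt_stash {
	struct pt *chain[2];	/* [0]: page tables, [1]: page directories */
};

/* Phase 1: preallocate the worst case; -ENOMEM is still allowed here. */
static int stash_prealloc(struct pt_stash *stash, unsigned long count)
{
	while (count--) {
		struct pt *pt = kzalloc(sizeof(*pt), GFP_KERNEL);

		if (!pt)
			return -ENOMEM;	/* caller frees whatever was already chained */

		pt->stash = stash->chain[0];
		stash->chain[0] = pt;
	}
	return 0;
}

/* Phase 2: consumption cannot fail, so it may run under the ww locks. */
static struct pt *stash_pop(struct pt_stash *stash, int lvl)
{
	struct pt *pt = stash->chain[lvl];

	stash->chain[lvl] = pt->stash;	/* stash was sized for the worst case */
	return pt;
}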
vma->page_sizes = vma->obj->mm.page_sizes; return 0; diff --git a/drivers/gpu/drm/i915/gt/intel_renderstate.c b/drivers/gpu/drm/i915/gt/intel_renderstate.c index 1bfad589c63b..ea2a77c7b469 100644 --- a/drivers/gpu/drm/i915/gt/intel_renderstate.c +++ b/drivers/gpu/drm/i915/gt/intel_renderstate.c @@ -27,6 +27,7 @@ #include "i915_drv.h" #include "intel_renderstate.h" +#include "gt/intel_context.h" #include "intel_ring.h" static const struct intel_renderstate_rodata * @@ -157,33 +158,47 @@ static int render_state_setup(struct intel_renderstate *so, #undef OUT_BATCH int intel_renderstate_init(struct intel_renderstate *so, - struct intel_engine_cs *engine) + struct intel_context *ce) { - struct drm_i915_gem_object *obj; + struct intel_engine_cs *engine = ce->engine; + struct drm_i915_gem_object *obj = NULL; int err; memset(so, 0, sizeof(*so)); so->rodata = render_state_get_rodata(engine); - if (!so->rodata) + if (so->rodata) { + if (so->rodata->batch_items * 4 > PAGE_SIZE) + return -EINVAL; + + obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + so->vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); + if (IS_ERR(so->vma)) { + err = PTR_ERR(so->vma); + goto err_obj; + } + } + + i915_gem_ww_ctx_init(&so->ww, true); +retry: + err = intel_context_pin_ww(ce, &so->ww); + if (err) + goto err_fini; + + /* return early if there's nothing to setup */ + if (!err && !so->rodata) return 0; - if (so->rodata->batch_items * 4 > PAGE_SIZE) - return -EINVAL; - - obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE); - if (IS_ERR(obj)) - return PTR_ERR(obj); - - so->vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); - if (IS_ERR(so->vma)) { - err = PTR_ERR(so->vma); - goto err_obj; - } + err = i915_gem_object_lock(so->vma->obj, &so->ww); + if (err) + goto err_context; err = i915_vma_pin(so->vma, 0, 0, PIN_GLOBAL | PIN_HIGH); if (err) - goto err_obj; + goto err_context; err = render_state_setup(so, engine->i915); if (err) @@ -193,8 +208,18 @@ int intel_renderstate_init(struct intel_renderstate *so, err_unpin: i915_vma_unpin(so->vma); +err_context: + intel_context_unpin(ce); +err_fini: + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&so->ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&so->ww); err_obj: - i915_gem_object_put(obj); + if (obj) + i915_gem_object_put(obj); so->vma = NULL; return err; } @@ -208,11 +233,9 @@ int intel_renderstate_emit(struct intel_renderstate *so, if (!so->vma) return 0; - i915_vma_lock(so->vma); err = i915_request_await_object(rq, so->vma->obj, false); if (err == 0) err = i915_vma_move_to_active(so->vma, rq, 0); - i915_vma_unlock(so->vma); if (err) return err; @@ -233,7 +256,17 @@ int intel_renderstate_emit(struct intel_renderstate *so, return 0; } -void intel_renderstate_fini(struct intel_renderstate *so) +void intel_renderstate_fini(struct intel_renderstate *so, + struct intel_context *ce) { - i915_vma_unpin_and_release(&so->vma, 0); + if (so->vma) { + i915_vma_unpin(so->vma); + i915_vma_close(so->vma); + } + + intel_context_unpin(ce); + i915_gem_ww_ctx_fini(&so->ww); + + if (so->vma) + i915_gem_object_put(so->vma->obj); } diff --git a/drivers/gpu/drm/i915/gt/intel_renderstate.h b/drivers/gpu/drm/i915/gt/intel_renderstate.h index 5700be69a05a..713aa1e86c80 100644 --- a/drivers/gpu/drm/i915/gt/intel_renderstate.h +++ b/drivers/gpu/drm/i915/gt/intel_renderstate.h @@ -25,9 +25,10 @@ #define _INTEL_RENDERSTATE_H_ #include +#include "i915_gem.h" struct i915_request; -struct intel_engine_cs; 
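The renderstate conversion above, like several other call sites in this series, follows one common ww transaction shape: initialise the acquire context, take every object lock and pin inside a retry: block, and on -EDEADLK back off (dropping all held locks and sleeping on the contended one) before restarting, so lock acquisition stays deadlock-free. Condensed to its skeleton it looks like the sketch below; do_pins() is a hypothetical placeholder for whatever work the caller performs under the lock.

static int do_pins(struct drm_i915_gem_object *obj)
{
	return 0;	/* hypothetical: pin/map work done while the object lock is held */
}

static int do_with_ww(struct drm_i915_gem_object *obj)
{
	struct i915_gem_ww_ctx ww;
	int err;

	i915_gem_ww_ctx_init(&ww, false);	/* second argument selects interruptible waits */
retry:
	err = i915_gem_object_lock(obj, &ww);
	if (err)
		goto out;

	err = do_pins(obj);
out:
	if (err == -EDEADLK) {
		/* Drop everything held by ww, wait for the contended lock, try again. */
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);	/* releases any object locks still on the ww list */
	return err;
}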
+struct intel_context; struct i915_vma; struct intel_renderstate_rodata { @@ -49,6 +50,7 @@ extern const struct intel_renderstate_rodata gen8_null_state; extern const struct intel_renderstate_rodata gen9_null_state; struct intel_renderstate { + struct i915_gem_ww_ctx ww; const struct intel_renderstate_rodata *rodata; struct i915_vma *vma; u32 batch_offset; @@ -58,9 +60,10 @@ struct intel_renderstate { }; int intel_renderstate_init(struct intel_renderstate *so, - struct intel_engine_cs *engine); + struct intel_context *ce); int intel_renderstate_emit(struct intel_renderstate *so, struct i915_request *rq); -void intel_renderstate_fini(struct intel_renderstate *so); +void intel_renderstate_fini(struct intel_renderstate *so, + struct intel_context *ce); #endif /* _INTEL_RENDERSTATE_H_ */ diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index 46a5ceffc22f..ac36b67fb46b 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -15,6 +15,7 @@ #include "i915_drv.h" #include "i915_gpu_error.h" #include "i915_irq.h" +#include "intel_breadcrumbs.h" #include "intel_engine_pm.h" #include "intel_gt.h" #include "intel_gt_pm.h" diff --git a/drivers/gpu/drm/i915/gt/intel_ring.c b/drivers/gpu/drm/i915/gt/intel_ring.c index bdb324167ef3..4034a4bac7f0 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring.c +++ b/drivers/gpu/drm/i915/gt/intel_ring.c @@ -21,7 +21,13 @@ unsigned int intel_ring_update_space(struct intel_ring *ring) return space; } -int intel_ring_pin(struct intel_ring *ring) +void __intel_ring_pin(struct intel_ring *ring) +{ + GEM_BUG_ON(!atomic_read(&ring->pin_count)); + atomic_inc(&ring->pin_count); +} + +int intel_ring_pin(struct intel_ring *ring, struct i915_gem_ww_ctx *ww) { struct i915_vma *vma = ring->vma; unsigned int flags; @@ -39,7 +45,7 @@ int intel_ring_pin(struct intel_ring *ring) else flags |= PIN_HIGH; - ret = i915_ggtt_pin(vma, 0, flags); + ret = i915_ggtt_pin(vma, ww, 0, flags); if (unlikely(ret)) goto err_unpin; diff --git a/drivers/gpu/drm/i915/gt/intel_ring.h b/drivers/gpu/drm/i915/gt/intel_ring.h index cc0ebca65167..1700579bdc93 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring.h +++ b/drivers/gpu/drm/i915/gt/intel_ring.h @@ -21,7 +21,8 @@ int intel_ring_cacheline_align(struct i915_request *rq); unsigned int intel_ring_update_space(struct intel_ring *ring); -int intel_ring_pin(struct intel_ring *ring); +void __intel_ring_pin(struct intel_ring *ring); +int intel_ring_pin(struct intel_ring *ring, struct i915_gem_ww_ctx *ww); void intel_ring_unpin(struct intel_ring *ring); void intel_ring_reset(struct intel_ring *ring, u32 tail); diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c index 898593ca4889..16b48e72c369 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c @@ -32,6 +32,7 @@ #include "gen6_ppgtt.h" #include "gen7_renderclear.h" #include "i915_drv.h" +#include "intel_breadcrumbs.h" #include "intel_context.h" #include "intel_gt.h" #include "intel_reset.h" @@ -201,16 +202,18 @@ static struct i915_address_space *vm_alias(struct i915_address_space *vm) return vm; } +static u32 pp_dir(struct i915_address_space *vm) +{ + return to_gen6_ppgtt(i915_vm_to_ppgtt(vm))->pp_dir; +} + static void set_pp_dir(struct intel_engine_cs *engine) { struct i915_address_space *vm = vm_alias(engine->gt->vm); if (vm) { - struct i915_ppgtt *ppgtt = i915_vm_to_ppgtt(vm); - ENGINE_WRITE(engine, 
RING_PP_DIR_DCLV, PP_DIR_DCLV_2G); - ENGINE_WRITE(engine, RING_PP_DIR_BASE, - px_base(ppgtt->pd)->ggtt_offset << 10); + ENGINE_WRITE(engine, RING_PP_DIR_BASE, pp_dir(vm)); } } @@ -255,7 +258,7 @@ static int xcs_resume(struct intel_engine_cs *engine) else ring_setup_status_page(engine); - intel_engine_reset_breadcrumbs(engine); + intel_breadcrumbs_reset(engine->breadcrumbs); /* Enforce ordering by reading HEAD register back */ ENGINE_POSTING_READ(engine, RING_HEAD); @@ -474,14 +477,16 @@ static void ring_context_destroy(struct kref *ref) intel_context_free(ce); } -static int __context_pin_ppgtt(struct intel_context *ce) +static int ring_context_pre_pin(struct intel_context *ce, + struct i915_gem_ww_ctx *ww, + void **unused) { struct i915_address_space *vm; int err = 0; vm = vm_alias(ce->vm); if (vm) - err = gen6_ppgtt_pin(i915_vm_to_ppgtt((vm))); + err = gen6_ppgtt_pin(i915_vm_to_ppgtt((vm)), ww); return err; } @@ -496,6 +501,10 @@ static void __context_unpin_ppgtt(struct intel_context *ce) } static void ring_context_unpin(struct intel_context *ce) +{ +} + +static void ring_context_post_unpin(struct intel_context *ce) { __context_unpin_ppgtt(ce); } @@ -584,9 +593,9 @@ static int ring_context_alloc(struct intel_context *ce) return 0; } -static int ring_context_pin(struct intel_context *ce) +static int ring_context_pin(struct intel_context *ce, void *unused) { - return __context_pin_ppgtt(ce); + return 0; } static void ring_context_reset(struct intel_context *ce) @@ -597,8 +606,10 @@ static void ring_context_reset(struct intel_context *ce) static const struct intel_context_ops ring_context_ops = { .alloc = ring_context_alloc, + .pre_pin = ring_context_pre_pin, .pin = ring_context_pin, .unpin = ring_context_unpin, + .post_unpin = ring_context_post_unpin, .enter = intel_context_enter_engine, .exit = intel_context_exit_engine, @@ -608,7 +619,7 @@ static const struct intel_context_ops ring_context_ops = { }; static int load_pd_dir(struct i915_request *rq, - const struct i915_ppgtt *ppgtt, + struct i915_address_space *vm, u32 valid) { const struct intel_engine_cs * const engine = rq->engine; @@ -624,7 +635,7 @@ static int load_pd_dir(struct i915_request *rq, *cs++ = MI_LOAD_REGISTER_IMM(1); *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base)); - *cs++ = px_base(ppgtt->pd)->ggtt_offset << 10; + *cs++ = pp_dir(vm); /* Stall until the page table load is complete? */ *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT; @@ -826,7 +837,7 @@ static int switch_mm(struct i915_request *rq, struct i915_address_space *vm) * post-sync op, this extra pass appears vital before a * mm switch! 
*/ - ret = load_pd_dir(rq, i915_vm_to_ppgtt(vm), PP_DIR_DCLV_2G); + ret = load_pd_dir(rq, vm, PP_DIR_DCLV_2G); if (ret) return ret; @@ -1250,14 +1261,15 @@ int intel_ring_submission_setup(struct intel_engine_cs *engine) return -ENODEV; } - timeline = intel_timeline_create(engine->gt, engine->status_page.vma); + timeline = intel_timeline_create_from_engine(engine, + I915_GEM_HWS_SEQNO_ADDR); if (IS_ERR(timeline)) { err = PTR_ERR(timeline); goto err; } GEM_BUG_ON(timeline->has_initial_breadcrumb); - err = intel_timeline_pin(timeline); + err = intel_timeline_pin(timeline, NULL); if (err) goto err_timeline; @@ -1267,7 +1279,7 @@ int intel_ring_submission_setup(struct intel_engine_cs *engine) goto err_timeline_unpin; } - err = intel_ring_pin(ring); + err = intel_ring_pin(ring, NULL); if (err) goto err_ring; diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c index 97ba14ad52e4..e6a00eea0631 100644 --- a/drivers/gpu/drm/i915/gt/intel_rps.c +++ b/drivers/gpu/drm/i915/gt/intel_rps.c @@ -7,6 +7,7 @@ #include #include "i915_drv.h" +#include "intel_breadcrumbs.h" #include "intel_gt.h" #include "intel_gt_clock_utils.h" #include "intel_gt_irq.h" diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c index 46d20f5f3ddc..a2f74cefe4c3 100644 --- a/drivers/gpu/drm/i915/gt/intel_timeline.c +++ b/drivers/gpu/drm/i915/gt/intel_timeline.c @@ -215,7 +215,8 @@ static void cacheline_free(struct intel_timeline_cacheline *cl) static int intel_timeline_init(struct intel_timeline *timeline, struct intel_gt *gt, - struct i915_vma *hwsp) + struct i915_vma *hwsp, + unsigned int offset) { void *vaddr; @@ -246,8 +247,7 @@ static int intel_timeline_init(struct intel_timeline *timeline, vaddr = page_mask_bits(cl->vaddr); } else { - timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR; - + timeline->hwsp_offset = offset; vaddr = i915_gem_object_pin_map(hwsp->obj, I915_MAP_WB); if (IS_ERR(vaddr)) return PTR_ERR(vaddr); @@ -297,7 +297,9 @@ static void intel_timeline_fini(struct intel_timeline *timeline) } struct intel_timeline * -intel_timeline_create(struct intel_gt *gt, struct i915_vma *global_hwsp) +__intel_timeline_create(struct intel_gt *gt, + struct i915_vma *global_hwsp, + unsigned int offset) { struct intel_timeline *timeline; int err; @@ -306,7 +308,7 @@ intel_timeline_create(struct intel_gt *gt, struct i915_vma *global_hwsp) if (!timeline) return ERR_PTR(-ENOMEM); - err = intel_timeline_init(timeline, gt, global_hwsp); + err = intel_timeline_init(timeline, gt, global_hwsp, offset); if (err) { kfree(timeline); return ERR_PTR(err); @@ -315,14 +317,20 @@ intel_timeline_create(struct intel_gt *gt, struct i915_vma *global_hwsp) return timeline; } -int intel_timeline_pin(struct intel_timeline *tl) +void __intel_timeline_pin(struct intel_timeline *tl) +{ + GEM_BUG_ON(!atomic_read(&tl->pin_count)); + atomic_inc(&tl->pin_count); +} + +int intel_timeline_pin(struct intel_timeline *tl, struct i915_gem_ww_ctx *ww) { int err; if (atomic_add_unless(&tl->pin_count, 1, 0)) return 0; - err = i915_ggtt_pin(tl->hwsp_ggtt, 0, PIN_HIGH); + err = i915_ggtt_pin(tl->hwsp_ggtt, ww, 0, PIN_HIGH); if (err) return err; @@ -465,7 +473,7 @@ __intel_timeline_get_seqno(struct intel_timeline *tl, goto err_rollback; } - err = i915_ggtt_pin(vma, 0, PIN_HIGH); + err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH); if (err) { __idle_hwsp_free(vma->private, cacheline); goto err_rollback; @@ -484,7 +492,9 @@ __intel_timeline_get_seqno(struct intel_timeline *tl, * free it after the current 
request is retired, which ensures that * all writes into the cacheline from previous requests are complete. */ - err = i915_active_ref(&tl->hwsp_cacheline->active, tl, &rq->fence); + err = i915_active_ref(&tl->hwsp_cacheline->active, + tl->fence_context, + &rq->fence); if (err) goto err_cacheline; diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.h b/drivers/gpu/drm/i915/gt/intel_timeline.h index 4298b9ac7327..9882cd911d8e 100644 --- a/drivers/gpu/drm/i915/gt/intel_timeline.h +++ b/drivers/gpu/drm/i915/gt/intel_timeline.h @@ -29,10 +29,27 @@ #include "i915_active.h" #include "i915_syncmap.h" -#include "gt/intel_timeline_types.h" +#include "intel_timeline_types.h" struct intel_timeline * -intel_timeline_create(struct intel_gt *gt, struct i915_vma *global_hwsp); +__intel_timeline_create(struct intel_gt *gt, + struct i915_vma *global_hwsp, + unsigned int offset); + +static inline struct intel_timeline * +intel_timeline_create(struct intel_gt *gt) +{ + return __intel_timeline_create(gt, NULL, 0); +} + +static inline struct intel_timeline * +intel_timeline_create_from_engine(struct intel_engine_cs *engine, + unsigned int offset) +{ + return __intel_timeline_create(engine->gt, + engine->status_page.vma, + offset); +} static inline struct intel_timeline * intel_timeline_get(struct intel_timeline *timeline) @@ -71,7 +88,8 @@ static inline bool intel_timeline_sync_is_later(struct intel_timeline *tl, return __intel_timeline_sync_is_later(tl, fence->context, fence->seqno); } -int intel_timeline_pin(struct intel_timeline *tl); +void __intel_timeline_pin(struct intel_timeline *tl); +int intel_timeline_pin(struct intel_timeline *tl, struct i915_gem_ww_ctx *ww); void intel_timeline_enter(struct intel_timeline *tl); int intel_timeline_get_seqno(struct intel_timeline *tl, struct i915_request *rq, diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index be5a4685c991..a3f72b75c61e 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -2088,6 +2088,7 @@ static int engine_wa_list_verify(struct intel_context *ce, const struct i915_wa *wa; struct i915_request *rq; struct i915_vma *vma; + struct i915_gem_ww_ctx ww; unsigned int i; u32 *results; int err; @@ -2100,29 +2101,34 @@ static int engine_wa_list_verify(struct intel_context *ce, return PTR_ERR(vma); intel_engine_pm_get(ce->engine); - rq = intel_context_create_request(ce); - intel_engine_pm_put(ce->engine); + i915_gem_ww_ctx_init(&ww, false); +retry: + err = i915_gem_object_lock(vma->obj, &ww); + if (err == 0) + err = intel_context_pin_ww(ce, &ww); + if (err) + goto err_pm; + + rq = i915_request_create(ce); if (IS_ERR(rq)) { err = PTR_ERR(rq); - goto err_vma; + goto err_unpin; } - i915_vma_lock(vma); err = i915_request_await_object(rq, vma->obj, true); if (err == 0) err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); - i915_vma_unlock(vma); - if (err) { - i915_request_add(rq); - goto err_vma; - } - - err = wa_list_srm(rq, wal, vma); - if (err) - goto err_vma; + if (err == 0) + err = wa_list_srm(rq, wal, vma); i915_request_get(rq); + if (err) + i915_request_set_error_once(rq, err); i915_request_add(rq); + + if (err) + goto err_rq; + if (i915_request_wait(rq, 0, HZ / 5) < 0) { err = -ETIME; goto err_rq; @@ -2147,7 +2153,16 @@ static int engine_wa_list_verify(struct intel_context *ce, err_rq: i915_request_put(rq); -err_vma: +err_unpin: + intel_context_unpin(ce); +err_pm: + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + 
goto retry; + } + i915_gem_ww_ctx_fini(&ww); + intel_engine_pm_put(ce->engine); i915_vma_unpin(vma); i915_vma_put(vma); return err; diff --git a/drivers/gpu/drm/i915/gt/mock_engine.c b/drivers/gpu/drm/i915/gt/mock_engine.c index b8dd3cbc8696..dfd1cfb8a7ec 100644 --- a/drivers/gpu/drm/i915/gt/mock_engine.c +++ b/drivers/gpu/drm/i915/gt/mock_engine.c @@ -131,6 +131,10 @@ static void mock_context_unpin(struct intel_context *ce) { } +static void mock_context_post_unpin(struct intel_context *ce) +{ +} + static void mock_context_destroy(struct kref *ref) { struct intel_context *ce = container_of(ref, typeof(*ce), ref); @@ -152,8 +156,7 @@ static int mock_context_alloc(struct intel_context *ce) if (!ce->ring) return -ENOMEM; - GEM_BUG_ON(ce->timeline); - ce->timeline = intel_timeline_create(ce->engine->gt, NULL); + ce->timeline = intel_timeline_create(ce->engine->gt); if (IS_ERR(ce->timeline)) { kfree(ce->engine); return PTR_ERR(ce->timeline); @@ -164,7 +167,13 @@ static int mock_context_alloc(struct intel_context *ce) return 0; } -static int mock_context_pin(struct intel_context *ce) +static int mock_context_pre_pin(struct intel_context *ce, + struct i915_gem_ww_ctx *ww, void **unused) +{ + return 0; +} + +static int mock_context_pin(struct intel_context *ce, void *unused) { return 0; } @@ -176,8 +185,10 @@ static void mock_context_reset(struct intel_context *ce) static const struct intel_context_ops mock_context_ops = { .alloc = mock_context_alloc, + .pre_pin = mock_context_pre_pin, .pin = mock_context_pin, .unpin = mock_context_unpin, + .post_unpin = mock_context_post_unpin, .enter = intel_context_enter_engine, .exit = intel_context_exit_engine, @@ -261,11 +272,12 @@ static void mock_engine_release(struct intel_engine_cs *engine) GEM_BUG_ON(timer_pending(&mock->hw_delay)); + intel_breadcrumbs_free(engine->breadcrumbs); + intel_context_unpin(engine->kernel_context); intel_context_put(engine->kernel_context); intel_engine_fini_retire(engine); - intel_engine_fini_breadcrumbs(engine); } struct intel_engine_cs *mock_engine(struct drm_i915_private *i915, @@ -323,20 +335,26 @@ int mock_engine_init(struct intel_engine_cs *engine) struct intel_context *ce; intel_engine_init_active(engine, ENGINE_MOCK); - intel_engine_init_breadcrumbs(engine); intel_engine_init_execlists(engine); intel_engine_init__pm(engine); intel_engine_init_retire(engine); + engine->breadcrumbs = intel_breadcrumbs_create(NULL); + if (!engine->breadcrumbs) + return -ENOMEM; + ce = create_kernel_context(engine); if (IS_ERR(ce)) goto err_breadcrumbs; + /* We insist the kernel context is using the status_page */ + engine->status_page.vma = ce->timeline->hwsp_ggtt; + engine->kernel_context = ce; return 0; err_breadcrumbs: - intel_engine_fini_breadcrumbs(engine); + intel_breadcrumbs_free(engine->breadcrumbs); return -ENOMEM; } diff --git a/drivers/gpu/drm/i915/gt/selftest_context.c b/drivers/gpu/drm/i915/gt/selftest_context.c index 52af1cee9a94..1f4020e906a8 100644 --- a/drivers/gpu/drm/i915/gt/selftest_context.c +++ b/drivers/gpu/drm/i915/gt/selftest_context.c @@ -68,6 +68,8 @@ static int context_sync(struct intel_context *ce) } while (!err); mutex_unlock(&tl->mutex); + /* Wait for all barriers to complete (remote CPU) before we check */ + i915_active_unlock_wait(&ce->active); return err; } diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c index 3fc5de961280..95d41c01d0e0 100644 --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c @@ -2729,7 +2729,7 @@ static 
int create_gang(struct intel_engine_cs *engine, i915_gem_object_put(obj); intel_context_put(ce); - rq->client_link.next = &(*prev)->client_link; + rq->mock.link.next = &(*prev)->mock.link; *prev = rq; return 0; @@ -2970,8 +2970,7 @@ static int live_preempt_gang(void *arg) } while (rq) { /* wait for each rq from highest to lowest prio */ - struct i915_request *n = - list_next_entry(rq, client_link); + struct i915_request *n = list_next_entry(rq, mock.link); if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0) { struct drm_printer p = @@ -3090,7 +3089,7 @@ static struct i915_vma *create_global(struct intel_gt *gt, size_t sz) return vma; } - err = i915_ggtt_pin(vma, 0, 0); + err = i915_ggtt_pin(vma, NULL, 0, 0); if (err) { i915_vma_put(vma); return ERR_PTR(err); @@ -4997,6 +4996,7 @@ static int __live_lrc_state(struct intel_engine_cs *engine, { struct intel_context *ce; struct i915_request *rq; + struct i915_gem_ww_ctx ww; enum { RING_START_IDX = 0, RING_TAIL_IDX, @@ -5011,7 +5011,11 @@ static int __live_lrc_state(struct intel_engine_cs *engine, if (IS_ERR(ce)) return PTR_ERR(ce); - err = intel_context_pin(ce); + i915_gem_ww_ctx_init(&ww, false); +retry: + err = i915_gem_object_lock(scratch->obj, &ww); + if (!err) + err = intel_context_pin_ww(ce, &ww); if (err) goto err_put; @@ -5040,11 +5044,9 @@ static int __live_lrc_state(struct intel_engine_cs *engine, *cs++ = i915_ggtt_offset(scratch) + RING_TAIL_IDX * sizeof(u32); *cs++ = 0; - i915_vma_lock(scratch); err = i915_request_await_object(rq, scratch->obj, true); if (!err) err = i915_vma_move_to_active(scratch, rq, EXEC_OBJECT_WRITE); - i915_vma_unlock(scratch); i915_request_get(rq); i915_request_add(rq); @@ -5081,6 +5083,12 @@ static int __live_lrc_state(struct intel_engine_cs *engine, err_unpin: intel_context_unpin(ce); err_put: + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); intel_context_put(ce); return err; } diff --git a/drivers/gpu/drm/i915/gt/selftest_rps.c b/drivers/gpu/drm/i915/gt/selftest_rps.c index 34b403d47840..3540ba9bd459 100644 --- a/drivers/gpu/drm/i915/gt/selftest_rps.c +++ b/drivers/gpu/drm/i915/gt/selftest_rps.c @@ -77,20 +77,20 @@ create_spin_counter(struct intel_engine_cs *engine, vma = i915_vma_instance(obj, vm, NULL); if (IS_ERR(vma)) { - i915_gem_object_put(obj); - return vma; + err = PTR_ERR(vma); + goto err_put; } err = i915_vma_pin(vma, 0, 0, PIN_USER); - if (err) { - i915_vma_put(vma); - return ERR_PTR(err); - } + if (err) + goto err_unlock; + + i915_vma_lock(vma); base = i915_gem_object_pin_map(obj, I915_MAP_WC); if (IS_ERR(base)) { - i915_gem_object_put(obj); - return ERR_CAST(base); + err = PTR_ERR(base); + goto err_unpin; } cs = base; @@ -134,6 +134,14 @@ create_spin_counter(struct intel_engine_cs *engine, *cancel = base + loop; *counter = srm ? 
memset32(base + end, 0, 1) : NULL; return vma; + +err_unpin: + i915_vma_unpin(vma); +err_unlock: + i915_vma_unlock(vma); +err_put: + i915_gem_object_put(obj); + return ERR_PTR(err); } static u8 wait_for_freq(struct intel_rps *rps, u8 freq, int timeout_ms) @@ -639,7 +647,6 @@ int live_rps_frequency_cs(void *arg) goto err_vma; } - i915_vma_lock(vma); err = i915_request_await_object(rq, vma->obj, false); if (!err) err = i915_vma_move_to_active(vma, rq, 0); @@ -647,7 +654,6 @@ int live_rps_frequency_cs(void *arg) err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0); - i915_vma_unlock(vma); i915_request_add(rq); if (err) goto err_vma; @@ -708,6 +714,7 @@ int live_rps_frequency_cs(void *arg) i915_gem_object_flush_map(vma->obj); i915_gem_object_unpin_map(vma->obj); i915_vma_unpin(vma); + i915_vma_unlock(vma); i915_vma_put(vma); st_engine_heartbeat_enable(engine); @@ -781,7 +788,6 @@ int live_rps_frequency_srm(void *arg) goto err_vma; } - i915_vma_lock(vma); err = i915_request_await_object(rq, vma->obj, false); if (!err) err = i915_vma_move_to_active(vma, rq, 0); @@ -789,7 +795,6 @@ int live_rps_frequency_srm(void *arg) err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0); - i915_vma_unlock(vma); i915_request_add(rq); if (err) goto err_vma; @@ -849,6 +854,7 @@ int live_rps_frequency_srm(void *arg) i915_gem_object_flush_map(vma->obj); i915_gem_object_unpin_map(vma->obj); i915_vma_unpin(vma); + i915_vma_unlock(vma); i915_vma_put(vma); st_engine_heartbeat_enable(engine); diff --git a/drivers/gpu/drm/i915/gt/selftest_timeline.c b/drivers/gpu/drm/i915/gt/selftest_timeline.c index 6564c989dbee..96d164a3841d 100644 --- a/drivers/gpu/drm/i915/gt/selftest_timeline.c +++ b/drivers/gpu/drm/i915/gt/selftest_timeline.c @@ -72,7 +72,7 @@ static int __mock_hwsp_timeline(struct mock_hwsp_freelist *state, unsigned long cacheline; int err; - tl = intel_timeline_create(state->gt, NULL); + tl = intel_timeline_create(state->gt); if (IS_ERR(tl)) return PTR_ERR(tl); @@ -455,7 +455,7 @@ tl_write(struct intel_timeline *tl, struct intel_engine_cs *engine, u32 value) struct i915_request *rq; int err; - err = intel_timeline_pin(tl); + err = intel_timeline_pin(tl, NULL); if (err) { rq = ERR_PTR(err); goto out; @@ -487,7 +487,7 @@ checked_intel_timeline_create(struct intel_gt *gt) { struct intel_timeline *tl; - tl = intel_timeline_create(gt, NULL); + tl = intel_timeline_create(gt); if (IS_ERR(tl)) return tl; @@ -660,14 +660,14 @@ static int live_hwsp_wrap(void *arg) * foreign GPU references. */ - tl = intel_timeline_create(gt, NULL); + tl = intel_timeline_create(gt); if (IS_ERR(tl)) return PTR_ERR(tl); if (!tl->has_initial_breadcrumb || !tl->hwsp_cacheline) goto out_free; - err = intel_timeline_pin(tl); + err = intel_timeline_pin(tl, NULL); if (err) goto out_free; diff --git a/drivers/gpu/drm/i915/gt/selftest_workarounds.c b/drivers/gpu/drm/i915/gt/selftest_workarounds.c index febc9e6692ba..61a0532d0f3d 100644 --- a/drivers/gpu/drm/i915/gt/selftest_workarounds.c +++ b/drivers/gpu/drm/i915/gt/selftest_workarounds.c @@ -214,7 +214,7 @@ static int check_whitelist(struct i915_gem_context *ctx, return PTR_ERR(results); err = 0; - i915_gem_object_lock(results); + i915_gem_object_lock(results, NULL); intel_wedge_on_timeout(&wedge, engine->gt, HZ / 5) /* safety net! 
*/ err = i915_gem_object_set_to_cpu_domain(results, false); i915_gem_object_unlock(results); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c index 861657897c0f..942c7c187adb 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c @@ -677,7 +677,7 @@ struct i915_vma *intel_guc_allocate_vma(struct intel_guc *guc, u32 size) goto err; flags = PIN_OFFSET_BIAS | i915_ggtt_pin_bias(vma); - ret = i915_ggtt_pin(vma, 0, flags); + ret = i915_ggtt_pin(vma, NULL, 0, flags); if (ret) { vma = ERR_PTR(ret); goto err; diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c b/drivers/gpu/drm/i915/gvt/cmd_parser.c index f1940939260a..d0a599b51bfe 100644 --- a/drivers/gpu/drm/i915/gvt/cmd_parser.c +++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c @@ -1923,6 +1923,7 @@ static int perform_bb_shadow(struct parser_exec_state *s) if (ret) goto err_unmap; + i915_gem_object_unlock(bb->obj); INIT_LIST_HEAD(&bb->list); list_add(&bb->list, &s->workload->shadow_bb); @@ -2982,7 +2983,7 @@ static int shadow_indirect_ctx(struct intel_shadow_wa_ctx *wa_ctx) goto put_obj; } - i915_gem_object_lock(obj); + i915_gem_object_lock(obj, NULL); ret = i915_gem_object_set_to_cpu_domain(obj, false); i915_gem_object_unlock(obj); if (ret) { diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c b/drivers/gpu/drm/i915/gvt/scheduler.c index 3c3b9842bbbd..1570eb8aa978 100644 --- a/drivers/gpu/drm/i915/gvt/scheduler.c +++ b/drivers/gpu/drm/i915/gvt/scheduler.c @@ -403,6 +403,14 @@ static void release_shadow_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx) wa_ctx->indirect_ctx.shadow_va = NULL; } +static void set_dma_address(struct i915_page_directory *pd, dma_addr_t addr) +{ + struct scatterlist *sg = pd->pt.base->mm.pages->sgl; + + /* This is not a good idea */ + sg->dma_address = addr; +} + static void set_context_ppgtt_from_shadow(struct intel_vgpu_workload *workload, struct intel_context *ce) { @@ -411,7 +419,7 @@ static void set_context_ppgtt_from_shadow(struct intel_vgpu_workload *workload, int i = 0; if (mm->ppgtt_mm.root_entry_type == GTT_TYPE_PPGTT_ROOT_L4_ENTRY) { - px_dma(ppgtt->pd) = mm->ppgtt_mm.shadow_pdps[0]; + set_dma_address(ppgtt->pd, mm->ppgtt_mm.shadow_pdps[0]); } else { for (i = 0; i < GVT_RING_CTX_NR_PDPS; i++) { struct i915_page_directory * const pd = @@ -421,7 +429,8 @@ static void set_context_ppgtt_from_shadow(struct intel_vgpu_workload *workload, shadow ppgtt. 
*/ if (!pd) break; - px_dma(pd) = mm->ppgtt_mm.shadow_pdps[i]; + + set_dma_address(pd, mm->ppgtt_mm.shadow_pdps[i]); } } } @@ -1240,13 +1249,13 @@ i915_context_ppgtt_root_restore(struct intel_vgpu_submission *s, int i; if (i915_vm_is_4lvl(&ppgtt->vm)) { - px_dma(ppgtt->pd) = s->i915_context_pml4; + set_dma_address(ppgtt->pd, s->i915_context_pml4); } else { for (i = 0; i < GEN8_3LVL_PDPES; i++) { struct i915_page_directory * const pd = i915_pd_entry(ppgtt->pd, i); - px_dma(pd) = s->i915_context_pdps[i]; + set_dma_address(pd, s->i915_context_pdps[i]); } } } diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c index 1e1253207425..b0a6522be3d1 100644 --- a/drivers/gpu/drm/i915/i915_active.c +++ b/drivers/gpu/drm/i915/i915_active.c @@ -28,12 +28,14 @@ static struct i915_global_active { } global; struct active_node { + struct rb_node node; struct i915_active_fence base; struct i915_active *ref; - struct rb_node node; u64 timeline; }; +#define fetch_node(x) rb_entry(READ_ONCE(x), typeof(struct active_node), node) + static inline struct active_node * node_from_active(struct i915_active_fence *active) { @@ -128,8 +130,8 @@ static inline void debug_active_assert(struct i915_active *ref) { } static void __active_retire(struct i915_active *ref) { + struct rb_root root = RB_ROOT; struct active_node *it, *n; - struct rb_root root; unsigned long flags; GEM_BUG_ON(i915_active_is_idle(ref)); @@ -141,9 +143,25 @@ __active_retire(struct i915_active *ref) GEM_BUG_ON(rcu_access_pointer(ref->excl.fence)); debug_active_deactivate(ref); - root = ref->tree; - ref->tree = RB_ROOT; - ref->cache = NULL; + /* Even if we have not used the cache, we may still have a barrier */ + if (!ref->cache) + ref->cache = fetch_node(ref->tree.rb_node); + + /* Keep the MRU cached node for reuse */ + if (ref->cache) { + /* Discard all other nodes in the tree */ + rb_erase(&ref->cache->node, &ref->tree); + root = ref->tree; + + /* Rebuild the tree with only the cached node */ + rb_link_node(&ref->cache->node, NULL, &ref->tree.rb_node); + rb_insert_color(&ref->cache->node, &ref->tree); + GEM_BUG_ON(ref->tree.rb_node != &ref->cache->node); + + /* Make the cached node available for reuse with any timeline */ + if (IS_ENABLED(CONFIG_64BIT)) + ref->cache->timeline = 0; /* needs cmpxchg(u64) */ + } spin_unlock_irqrestore(&ref->tree_lock, flags); @@ -154,6 +172,7 @@ __active_retire(struct i915_active *ref) /* ... except if you wait on it, you must manage your own references! */ wake_up_var(ref); + /* Finally free the discarded timeline tree */ rbtree_postorder_for_each_entry_safe(it, n, &root, node) { GEM_BUG_ON(i915_active_fence_isset(&it->base)); kmem_cache_free(global.slab_cache, it); @@ -216,12 +235,11 @@ excl_retire(struct dma_fence *fence, struct dma_fence_cb *cb) active_retire(container_of(cb, struct i915_active, excl.cb)); } -static struct i915_active_fence * -active_instance(struct i915_active *ref, struct intel_timeline *tl) +static struct active_node *__active_lookup(struct i915_active *ref, u64 idx) { - struct active_node *node, *prealloc; - struct rb_node **p, *parent; - u64 idx = tl->fence_context; + struct active_node *it; + + GEM_BUG_ON(idx == 0); /* 0 is the unordered timeline, rsvd for cache */ /* * We track the most recently used timeline to skip a rbtree search @@ -230,8 +248,59 @@ active_instance(struct i915_active *ref, struct intel_timeline *tl) * after the previous activity has been retired, or if it matches the * current timeline. 
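/*
 * Illustrative sketch, not part of the patch: the bare rbtree calls that
 * __active_retire() above uses to keep only the most-recently-used node.
 * "struct item" and prune_all_but() are made-up names; the survivor is
 * relinked as the sole element of the tree and the remainder is handed
 * back for freeing, just as the retire path frees the old nodes after
 * dropping the lock.
 */
#include <linux/rbtree.h>
#include <linux/types.h>

struct item {
	struct rb_node node;
	u64 key;
};

static struct rb_root prune_all_but(struct rb_root *tree, struct item *keep)
{
	struct rb_root old;

	rb_erase(&keep->node, tree);	/* detach the survivor first */
	old = *tree;			/* steal the remaining nodes */

	/* Rebuild the tree with only the surviving node */
	rb_link_node(&keep->node, NULL, &tree->rb_node);
	rb_insert_color(&keep->node, tree);

	return old;			/* caller disposes of these later */
}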
*/ - node = READ_ONCE(ref->cache); - if (node && node->timeline == idx) + it = READ_ONCE(ref->cache); + if (it) { + u64 cached = READ_ONCE(it->timeline); + + /* Once claimed, this slot will only belong to this idx */ + if (cached == idx) + return it; + +#ifdef CONFIG_64BIT /* for cmpxchg(u64) */ + /* + * An unclaimed cache [.timeline=0] can only be claimed once. + * + * If the value is already non-zero, some other thread has + * claimed the cache and we know that is does not match our + * idx. If, and only if, the timeline is currently zero is it + * worth competing to claim it atomically for ourselves (for + * only the winner of that race will cmpxchg return the old + * value of 0). + */ + if (!cached && !cmpxchg(&it->timeline, 0, idx)) + return it; +#endif + } + + BUILD_BUG_ON(offsetof(typeof(*it), node)); + + /* While active, the tree can only be built; not destroyed */ + GEM_BUG_ON(i915_active_is_idle(ref)); + + it = fetch_node(ref->tree.rb_node); + while (it) { + if (it->timeline < idx) { + it = fetch_node(it->node.rb_right); + } else if (it->timeline > idx) { + it = fetch_node(it->node.rb_left); + } else { + WRITE_ONCE(ref->cache, it); + break; + } + } + + /* NB: If the tree rotated beneath us, we may miss our target. */ + return it; +} + +static struct i915_active_fence * +active_instance(struct i915_active *ref, u64 idx) +{ + struct active_node *node, *prealloc; + struct rb_node **p, *parent; + + node = __active_lookup(ref, idx); + if (likely(node)) return &node->base; /* Preallocate a replacement, just in case */ @@ -268,10 +337,9 @@ active_instance(struct i915_active *ref, struct intel_timeline *tl) rb_insert_color(&node->node, &ref->tree); out: - ref->cache = node; + WRITE_ONCE(ref->cache, node); spin_unlock_irq(&ref->tree_lock); - BUILD_BUG_ON(offsetof(typeof(*node), base)); return &node->base; } @@ -353,69 +421,116 @@ __active_del_barrier(struct i915_active *ref, struct active_node *node) return ____active_del_barrier(ref, node, barrier_to_engine(node)); } -int i915_active_ref(struct i915_active *ref, - struct intel_timeline *tl, - struct dma_fence *fence) +static bool +replace_barrier(struct i915_active *ref, struct i915_active_fence *active) +{ + if (!is_barrier(active)) /* proto-node used by our idle barrier? */ + return false; + + /* + * This request is on the kernel_context timeline, and so + * we can use it to substitute for the pending idle-barrer + * request that we want to emit on the kernel_context. + */ + __active_del_barrier(ref, node_from_active(active)); + return true; +} + +int i915_active_ref(struct i915_active *ref, u64 idx, struct dma_fence *fence) { struct i915_active_fence *active; int err; - lockdep_assert_held(&tl->mutex); - /* Prevent reaping in case we malloc/wait while building the tree */ err = i915_active_acquire(ref); if (err) return err; - active = active_instance(ref, tl); + active = active_instance(ref, idx); if (!active) { err = -ENOMEM; goto out; } - if (is_barrier(active)) { /* proto-node used by our idle barrier */ - /* - * This request is on the kernel_context timeline, and so - * we can use it to substitute for the pending idle-barrer - * request that we want to emit on the kernel_context. 
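/*
 * Illustrative sketch, not part of the patch: the claim-once idiom that
 * __active_lookup() above applies to the unordered cache slot. Zero means
 * "unclaimed" and only the winner of the cmpxchg() observes the old value
 * of 0; everyone else falls back to the tree walk. claim_slot() is a
 * made-up helper, and as in the patch a u64 cmpxchg() is only assumed to
 * exist on CONFIG_64BIT.
 */
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/types.h>

static bool claim_slot(u64 *slot, u64 idx)
{
	u64 cur = READ_ONCE(*slot);

	if (cur == idx)		/* slot already belongs to this idx */
		return true;

#ifdef CONFIG_64BIT
	/* Race to claim an empty slot; losers see a non-zero old value */
	if (!cur && !cmpxchg(slot, 0, idx))
		return true;
#endif
	return false;
}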
- */ - __active_del_barrier(ref, node_from_active(active)); + if (replace_barrier(ref, active)) { RCU_INIT_POINTER(active->fence, NULL); atomic_dec(&ref->count); } if (!__i915_active_fence_set(active, fence)) - atomic_inc(&ref->count); + __i915_active_acquire(ref); out: i915_active_release(ref); return err; } -struct dma_fence * -i915_active_set_exclusive(struct i915_active *ref, struct dma_fence *f) +static struct dma_fence * +__i915_active_set_fence(struct i915_active *ref, + struct i915_active_fence *active, + struct dma_fence *fence) { struct dma_fence *prev; - /* We expect the caller to manage the exclusive timeline ordering */ - GEM_BUG_ON(i915_active_is_idle(ref)); + if (replace_barrier(ref, active)) { + RCU_INIT_POINTER(active->fence, fence); + return NULL; + } rcu_read_lock(); - prev = __i915_active_fence_set(&ref->excl, f); + prev = __i915_active_fence_set(active, fence); if (prev) prev = dma_fence_get_rcu(prev); else - atomic_inc(&ref->count); + __i915_active_acquire(ref); rcu_read_unlock(); return prev; } +static struct i915_active_fence * +__active_fence(struct i915_active *ref, u64 idx) +{ + struct active_node *it; + + it = __active_lookup(ref, idx); + if (unlikely(!it)) { /* Contention with parallel tree builders! */ + spin_lock_irq(&ref->tree_lock); + it = __active_lookup(ref, idx); + spin_unlock_irq(&ref->tree_lock); + } + GEM_BUG_ON(!it); /* slot must be preallocated */ + + return &it->base; +} + +struct dma_fence * +__i915_active_ref(struct i915_active *ref, u64 idx, struct dma_fence *fence) +{ + /* Only valid while active, see i915_active_acquire_for_context() */ + return __i915_active_set_fence(ref, __active_fence(ref, idx), fence); +} + +struct dma_fence * +i915_active_set_exclusive(struct i915_active *ref, struct dma_fence *f) +{ + /* We expect the caller to manage the exclusive timeline ordering */ + return __i915_active_set_fence(ref, &ref->excl, f); +} + bool i915_active_acquire_if_busy(struct i915_active *ref) { debug_active_assert(ref); return atomic_add_unless(&ref->count, 1, 0); } +static void __i915_active_activate(struct i915_active *ref) +{ + spin_lock_irq(&ref->tree_lock); /* __active_retire() */ + if (!atomic_fetch_inc(&ref->count)) + debug_active_activate(ref); + spin_unlock_irq(&ref->tree_lock); +} + int i915_active_acquire(struct i915_active *ref) { int err; @@ -423,19 +538,19 @@ int i915_active_acquire(struct i915_active *ref) if (i915_active_acquire_if_busy(ref)) return 0; + if (!ref->active) { + __i915_active_activate(ref); + return 0; + } + err = mutex_lock_interruptible(&ref->mutex); if (err) return err; if (likely(!i915_active_acquire_if_busy(ref))) { - if (ref->active) - err = ref->active(ref); - if (!err) { - spin_lock_irq(&ref->tree_lock); /* __active_retire() */ - debug_active_activate(ref); - atomic_inc(&ref->count); - spin_unlock_irq(&ref->tree_lock); - } + err = ref->active(ref); + if (!err) + __i915_active_activate(ref); } mutex_unlock(&ref->mutex); @@ -443,6 +558,24 @@ int i915_active_acquire(struct i915_active *ref) return err; } +int i915_active_acquire_for_context(struct i915_active *ref, u64 idx) +{ + struct i915_active_fence *active; + int err; + + err = i915_active_acquire(ref); + if (err) + return err; + + active = active_instance(ref, idx); + if (!active) { + i915_active_release(ref); + return -ENOMEM; + } + + return 0; /* return with active ref */ +} + void i915_active_release(struct i915_active *ref) { debug_active_assert(ref); @@ -651,16 +784,16 @@ int i915_sw_fence_await_active(struct i915_sw_fence *fence, return 
await_active(ref, flags, sw_await_fence, fence, fence); } -#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) void i915_active_fini(struct i915_active *ref) { debug_active_fini(ref); GEM_BUG_ON(atomic_read(&ref->count)); GEM_BUG_ON(work_pending(&ref->work)); - GEM_BUG_ON(!RB_EMPTY_ROOT(&ref->tree)); mutex_destroy(&ref->mutex); + + if (ref->cache) + kmem_cache_free(global.slab_cache, ref->cache); } -#endif static inline bool is_idle_barrier(struct active_node *node, u64 idx) { @@ -674,7 +807,6 @@ static struct active_node *reuse_idle_barrier(struct i915_active *ref, u64 idx) if (RB_EMPTY_ROOT(&ref->tree)) return NULL; - spin_lock_irq(&ref->tree_lock); GEM_BUG_ON(i915_active_is_idle(ref)); /* @@ -700,9 +832,9 @@ static struct active_node *reuse_idle_barrier(struct i915_active *ref, u64 idx) prev = p; if (node->timeline < idx) - p = p->rb_right; + p = READ_ONCE(p->rb_right); else - p = p->rb_left; + p = READ_ONCE(p->rb_left); } /* @@ -739,14 +871,13 @@ static struct active_node *reuse_idle_barrier(struct i915_active *ref, u64 idx) goto match; } - spin_unlock_irq(&ref->tree_lock); - return NULL; match: + spin_lock_irq(&ref->tree_lock); rb_erase(p, &ref->tree); /* Hide from waits and sibling allocations */ if (p == &ref->cache->node) - ref->cache = NULL; + WRITE_ONCE(ref->cache, NULL); spin_unlock_irq(&ref->tree_lock); return rb_entry(p, struct active_node, node); @@ -777,7 +908,9 @@ int i915_active_acquire_preallocate_barrier(struct i915_active *ref, struct llist_node *prev = first; struct active_node *node; + rcu_read_lock(); node = reuse_idle_barrier(ref, idx); + rcu_read_unlock(); if (!node) { node = kmem_cache_alloc(global.slab_cache, GFP_KERNEL); if (!node) @@ -801,7 +934,7 @@ int i915_active_acquire_preallocate_barrier(struct i915_active *ref, */ RCU_INIT_POINTER(node->base.fence, ERR_PTR(-EAGAIN)); node->base.cb.node.prev = (void *)engine; - atomic_inc(&ref->count); + __i915_active_acquire(ref); } GEM_BUG_ON(rcu_access_pointer(node->base.fence) != ERR_PTR(-EAGAIN)); diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h index cf4058150966..fb165d3f01cf 100644 --- a/drivers/gpu/drm/i915/i915_active.h +++ b/drivers/gpu/drm/i915/i915_active.h @@ -163,14 +163,16 @@ void __i915_active_init(struct i915_active *ref, __i915_active_init(ref, active, retire, &__mkey, &__wkey); \ } while (0) -int i915_active_ref(struct i915_active *ref, - struct intel_timeline *tl, - struct dma_fence *fence); +struct dma_fence * +__i915_active_ref(struct i915_active *ref, u64 idx, struct dma_fence *fence); +int i915_active_ref(struct i915_active *ref, u64 idx, struct dma_fence *fence); static inline int i915_active_add_request(struct i915_active *ref, struct i915_request *rq) { - return i915_active_ref(ref, i915_request_timeline(rq), &rq->fence); + return i915_active_ref(ref, + i915_request_timeline(rq)->fence_context, + &rq->fence); } struct dma_fence * @@ -198,7 +200,9 @@ int i915_request_await_active(struct i915_request *rq, #define I915_ACTIVE_AWAIT_BARRIER BIT(2) int i915_active_acquire(struct i915_active *ref); +int i915_active_acquire_for_context(struct i915_active *ref, u64 idx); bool i915_active_acquire_if_busy(struct i915_active *ref); + void i915_active_release(struct i915_active *ref); static inline void __i915_active_acquire(struct i915_active *ref) @@ -213,11 +217,7 @@ i915_active_is_idle(const struct i915_active *ref) return !atomic_read(&ref->count); } -#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) void i915_active_fini(struct i915_active *ref); -#else -static inline void 
i915_active_fini(struct i915_active *ref) { } -#endif int i915_active_acquire_preallocate_barrier(struct i915_active *ref, struct intel_engine_cs *engine); @@ -231,4 +231,19 @@ struct i915_active *i915_active_create(void); struct i915_active *i915_active_get(struct i915_active *ref); void i915_active_put(struct i915_active *ref); +static inline int __i915_request_await_exclusive(struct i915_request *rq, + struct i915_active *active) +{ + struct dma_fence *fence; + int err = 0; + + fence = i915_active_fence_get(&active->excl); + if (fence) { + err = i915_request_await_dma_fence(rq, fence); + dma_fence_put(fence); + } + + return err; +} + #endif /* _I915_ACTIVE_H_ */ diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index e6918c7c0709..00292a849c34 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -1075,6 +1075,7 @@ static void i915_driver_release(struct drm_device *dev) intel_memory_regions_driver_release(dev_priv); i915_ggtt_driver_release(dev_priv); + i915_gem_drain_freed_objects(dev_priv); i915_driver_mmio_release(dev_priv); @@ -1119,7 +1120,6 @@ static void i915_driver_postclose(struct drm_device *dev, struct drm_file *file) struct drm_i915_file_private *file_priv = file->driver_priv; i915_gem_context_close(file); - i915_gem_release(dev, file); kfree_rcu(file_priv, rcu); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 1314e0e92c41..ab17084af0ff 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -203,11 +203,6 @@ struct drm_i915_file_private { struct rcu_head rcu; }; - struct { - spinlock_t lock; - struct list_head request_list; - } mm; - struct xarray context_xa; struct xarray vm_xa; @@ -592,11 +587,6 @@ struct i915_gem_mm { */ atomic_t free_count; - /** - * Small stash of WC pages - */ - struct pagestash wc_stash; - /** * tmpfs instance used for shmem backed objects */ @@ -1826,11 +1816,18 @@ static inline void i915_gem_drain_workqueue(struct drm_i915_private *i915) } struct i915_vma * __must_check +i915_gem_object_ggtt_pin_ww(struct drm_i915_gem_object *obj, + struct i915_gem_ww_ctx *ww, + const struct i915_ggtt_view *view, + u64 size, u64 alignment, u64 flags); + +static inline struct i915_vma * __must_check i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj, const struct i915_ggtt_view *view, - u64 size, - u64 alignment, - u64 flags); + u64 size, u64 alignment, u64 flags) +{ + return i915_gem_object_ggtt_pin_ww(obj, NULL, view, size, alignment, flags); +} int i915_gem_object_unbind(struct drm_i915_gem_object *obj, unsigned long flags); @@ -1867,7 +1864,6 @@ void i915_gem_suspend_late(struct drm_i915_private *dev_priv); void i915_gem_resume(struct drm_i915_private *dev_priv); int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file); -void i915_gem_release(struct drm_device *dev, struct drm_file *file); int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj, enum i915_cache_level cache_level); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 9aa3066cb75d..bb0c12975f38 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -335,12 +335,20 @@ i915_gem_shmem_pread(struct drm_i915_gem_object *obj, u64 remain; int ret; - ret = i915_gem_object_prepare_read(obj, &needs_clflush); + ret = i915_gem_object_lock_interruptible(obj, NULL); if (ret) return ret; + ret = i915_gem_object_prepare_read(obj, &needs_clflush); + if (ret) { + i915_gem_object_unlock(obj); + 
return ret; + } + fence = i915_gem_object_lock_fence(obj); i915_gem_object_finish_access(obj); + i915_gem_object_unlock(obj); + if (!fence) return -ENOMEM; @@ -420,7 +428,7 @@ i915_gem_gtt_pread(struct drm_i915_gem_object *obj, GEM_BUG_ON(!drm_mm_node_allocated(&node)); } - ret = i915_gem_object_lock_interruptible(obj); + ret = i915_gem_object_lock_interruptible(obj, NULL); if (ret) goto out_unpin; @@ -619,7 +627,7 @@ i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj, GEM_BUG_ON(!drm_mm_node_allocated(&node)); } - ret = i915_gem_object_lock_interruptible(obj); + ret = i915_gem_object_lock_interruptible(obj, NULL); if (ret) goto out_unpin; @@ -734,12 +742,20 @@ i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj, u64 remain; int ret; - ret = i915_gem_object_prepare_write(obj, &needs_clflush); + ret = i915_gem_object_lock_interruptible(obj, NULL); if (ret) return ret; + ret = i915_gem_object_prepare_write(obj, &needs_clflush); + if (ret) { + i915_gem_object_unlock(obj); + return ret; + } + fence = i915_gem_object_lock_fence(obj); i915_gem_object_finish_access(obj); + i915_gem_object_unlock(obj); + if (!fence) return -ENOMEM; @@ -946,11 +962,10 @@ static void discard_ggtt_vma(struct i915_vma *vma) } struct i915_vma * -i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj, - const struct i915_ggtt_view *view, - u64 size, - u64 alignment, - u64 flags) +i915_gem_object_ggtt_pin_ww(struct drm_i915_gem_object *obj, + struct i915_gem_ww_ctx *ww, + const struct i915_ggtt_view *view, + u64 size, u64 alignment, u64 flags) { struct drm_i915_private *i915 = to_i915(obj->base.dev); struct i915_ggtt *ggtt = &i915->ggtt; @@ -1016,7 +1031,7 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj, return ERR_PTR(ret); } - ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL); + ret = i915_vma_pin_ww(vma, ww, size, alignment, flags | PIN_GLOBAL); if (ret) return ERR_PTR(ret); @@ -1290,7 +1305,7 @@ int i915_gem_freeze_late(struct drm_i915_private *i915) i915_gem_drain_freed_objects(i915); list_for_each_entry(obj, &i915->mm.shrink_list, mm.link) { - i915_gem_object_lock(obj); + i915_gem_object_lock(obj, NULL); drm_WARN_ON(&i915->drm, i915_gem_object_set_to_cpu_domain(obj, true)); i915_gem_object_unlock(obj); @@ -1301,21 +1316,6 @@ int i915_gem_freeze_late(struct drm_i915_private *i915) return 0; } -void i915_gem_release(struct drm_device *dev, struct drm_file *file) -{ - struct drm_i915_file_private *file_priv = file->driver_priv; - struct i915_request *request; - - /* Clean up our request list when the client is going away, so that - * later retire_requests won't dereference our soon-to-be-gone - * file_priv. 
- */ - spin_lock(&file_priv->mm.lock); - list_for_each_entry(request, &file_priv->mm.request_list, client_link) - request->file_priv = NULL; - spin_unlock(&file_priv->mm.lock); -} - int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file) { struct drm_i915_file_private *file_priv; @@ -1331,9 +1331,6 @@ int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file) file_priv->dev_priv = i915; file_priv->file = file; - spin_lock_init(&file_priv->mm.lock); - INIT_LIST_HEAD(&file_priv->mm.request_list); - file_priv->bsd_engine = -1; file_priv->hang_timestamp = jiffies; @@ -1344,6 +1341,58 @@ int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file) return ret; } +void i915_gem_ww_ctx_init(struct i915_gem_ww_ctx *ww, bool intr) +{ + ww_acquire_init(&ww->ctx, &reservation_ww_class); + INIT_LIST_HEAD(&ww->obj_list); + ww->intr = intr; + ww->contended = NULL; +} + +static void i915_gem_ww_ctx_unlock_all(struct i915_gem_ww_ctx *ww) +{ + struct drm_i915_gem_object *obj; + + while ((obj = list_first_entry_or_null(&ww->obj_list, struct drm_i915_gem_object, obj_link))) { + list_del(&obj->obj_link); + i915_gem_object_unlock(obj); + } +} + +void i915_gem_ww_unlock_single(struct drm_i915_gem_object *obj) +{ + list_del(&obj->obj_link); + i915_gem_object_unlock(obj); +} + +void i915_gem_ww_ctx_fini(struct i915_gem_ww_ctx *ww) +{ + i915_gem_ww_ctx_unlock_all(ww); + WARN_ON(ww->contended); + ww_acquire_fini(&ww->ctx); +} + +int __must_check i915_gem_ww_ctx_backoff(struct i915_gem_ww_ctx *ww) +{ + int ret = 0; + + if (WARN_ON(!ww->contended)) + return -EINVAL; + + i915_gem_ww_ctx_unlock_all(ww); + if (ww->intr) + ret = dma_resv_lock_slow_interruptible(ww->contended->base.resv, &ww->ctx); + else + dma_resv_lock_slow(ww->contended->base.resv, &ww->ctx); + + if (!ret) + list_add_tail(&ww->contended->obj_link, &ww->obj_list); + + ww->contended = NULL; + + return ret; +} + #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) #include "selftests/mock_gem_device.c" #include "selftests/i915_gem.c" diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h index f333e88a2b6e..a4cad3f154ca 100644 --- a/drivers/gpu/drm/i915/i915_gem.h +++ b/drivers/gpu/drm/i915/i915_gem.h @@ -116,4 +116,16 @@ static inline bool __tasklet_is_scheduled(struct tasklet_struct *t) return test_bit(TASKLET_STATE_SCHED, &t->state); } +struct i915_gem_ww_ctx { + struct ww_acquire_ctx ctx; + struct list_head obj_list; + bool intr; + struct drm_i915_gem_object *contended; +}; + +void i915_gem_ww_ctx_init(struct i915_gem_ww_ctx *ctx, bool intr); +void i915_gem_ww_ctx_fini(struct i915_gem_ww_ctx *ctx); +int __must_check i915_gem_ww_ctx_backoff(struct i915_gem_ww_ctx *ctx); +void i915_gem_ww_unlock_single(struct drm_i915_gem_object *obj); + #endif /* __I915_GEM_H__ */ diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 1fa67700d8f4..f113fe44572b 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -41,6 +41,7 @@ #include "display/intel_lpe_audio.h" #include "display/intel_psr.h" +#include "gt/intel_breadcrumbs.h" #include "gt/intel_gt.h" #include "gt/intel_gt_irq.h" #include "gt/intel_gt_pm_irq.h" diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index c6f6370283cf..e94976976571 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -1195,24 +1195,39 @@ static struct intel_context *oa_pin_context(struct i915_perf_stream *stream) struct i915_gem_engines_iter it; struct 
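/*
 * Illustrative sketch, not part of the patch: the acquire/backoff loop that
 * callers of the new i915_gem_ww_ctx API follow (the igt_gem_ww_ctx selftest
 * later in this patch exercises exactly this shape). do_work_locked() is a
 * placeholder for whatever needs the object lock.
 */
int do_work_locked(struct drm_i915_gem_object *obj);	/* placeholder */

static int with_object_locked(struct drm_i915_gem_object *obj)
{
	struct i915_gem_ww_ctx ww;
	int err;

	i915_gem_ww_ctx_init(&ww, true);	/* true: interruptible slow path */
retry:
	err = i915_gem_object_lock(obj, &ww);
	if (!err)
		err = do_work_locked(obj);

	if (err == -EDEADLK) {
		/* Drop every held lock, sleep on the contended one, retry */
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);	/* unlocks anything still on ww.obj_list */
	return err;
}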
i915_gem_context *ctx = stream->ctx; struct intel_context *ce; - int err; + struct i915_gem_ww_ctx ww; + int err = -ENODEV; for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) { if (ce->engine != stream->engine) /* first match! */ continue; - /* - * As the ID is the gtt offset of the context's vma we - * pin the vma to ensure the ID remains fixed. - */ - err = intel_context_pin(ce); - if (err == 0) { - stream->pinned_ctx = ce; - break; - } + err = 0; + break; } i915_gem_context_unlock_engines(ctx); + if (err) + return ERR_PTR(err); + + i915_gem_ww_ctx_init(&ww, true); +retry: + /* + * As the ID is the gtt offset of the context's vma we + * pin the vma to ensure the ID remains fixed. + */ + err = intel_context_pin_ww(ce, &ww); + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); + + if (err) + return ERR_PTR(err); + + stream->pinned_ctx = ce; return stream->pinned_ctx; } @@ -1923,15 +1938,22 @@ emit_oa_config(struct i915_perf_stream *stream, { struct i915_request *rq; struct i915_vma *vma; + struct i915_gem_ww_ctx ww; int err; vma = get_oa_vma(stream, oa_config); if (IS_ERR(vma)) return PTR_ERR(vma); - err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH); + i915_gem_ww_ctx_init(&ww, true); +retry: + err = i915_gem_object_lock(vma->obj, &ww); if (err) - goto err_vma_put; + goto err; + + err = i915_vma_pin_ww(vma, &ww, 0, 0, PIN_GLOBAL | PIN_HIGH); + if (err) + goto err; intel_engine_pm_get(ce->engine); rq = i915_request_create(ce); @@ -1953,11 +1975,9 @@ emit_oa_config(struct i915_perf_stream *stream, goto err_add_request; } - i915_vma_lock(vma); err = i915_request_await_object(rq, vma->obj, 0); if (!err) err = i915_vma_move_to_active(vma, rq, 0); - i915_vma_unlock(vma); if (err) goto err_add_request; @@ -1971,7 +1991,14 @@ emit_oa_config(struct i915_perf_stream *stream, i915_request_add(rq); err_vma_unpin: i915_vma_unpin(vma); -err_vma_put: +err: + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + + i915_gem_ww_ctx_fini(&ww); i915_vma_put(vma); return err; } diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 7a05850ca931..11e272422fb7 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -31,6 +31,7 @@ #include #include "gem/i915_gem_context.h" +#include "gt/intel_breadcrumbs.h" #include "gt/intel_context.h" #include "gt/intel_ring.h" #include "gt/intel_rps.h" @@ -186,48 +187,34 @@ static void irq_execute_cb_hook(struct irq_work *wrk) irq_execute_cb(wrk); } -static void __notify_execute_cb(struct i915_request *rq) +static __always_inline void +__notify_execute_cb(struct i915_request *rq, bool (*fn)(struct irq_work *wrk)) { struct execute_cb *cb, *cn; - lockdep_assert_held(&rq->lock); - - GEM_BUG_ON(!i915_request_is_active(rq)); if (llist_empty(&rq->execute_cb)) return; - llist_for_each_entry_safe(cb, cn, rq->execute_cb.first, work.llnode) - irq_work_queue(&cb->work); - - /* - * XXX Rollback on __i915_request_unsubmit() - * - * In the future, perhaps when we have an active time-slicing scheduler, - * it will be interesting to unsubmit parallel execution and remove - * busywaits from the GPU until their master is restarted. This is - * quite hairy, we have to carefully rollback the fence and do a - * preempt-to-idle cycle on the target engine, all the while the - * master execute_cb may refire. 
- */ - init_llist_head(&rq->execute_cb); + llist_for_each_entry_safe(cb, cn, + llist_del_all(&rq->execute_cb), + work.llnode) + fn(&cb->work); } -static inline void -remove_from_client(struct i915_request *request) +static void __notify_execute_cb_irq(struct i915_request *rq) { - struct drm_i915_file_private *file_priv; + __notify_execute_cb(rq, irq_work_queue); +} - if (!READ_ONCE(request->file_priv)) - return; +static bool irq_work_imm(struct irq_work *wrk) +{ + wrk->func(wrk); + return false; +} - rcu_read_lock(); - file_priv = xchg(&request->file_priv, NULL); - if (file_priv) { - spin_lock(&file_priv->mm.lock); - list_del(&request->client_link); - spin_unlock(&file_priv->mm.lock); - } - rcu_read_unlock(); +static void __notify_execute_cb_imm(struct i915_request *rq) +{ + __notify_execute_cb(rq, irq_work_imm); } static void free_capture_list(struct i915_request *request) @@ -274,9 +261,16 @@ static void remove_from_engine(struct i915_request *rq) locked = engine; } list_del_init(&rq->sched.link); + clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); clear_bit(I915_FENCE_FLAG_HOLD, &rq->fence.flags); + + /* Prevent further __await_execution() registering a cb, then flush */ + set_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags); + spin_unlock_irq(&locked->active.lock); + + __notify_execute_cb_imm(rq); } bool i915_request_retire(struct i915_request *rq) @@ -288,6 +282,7 @@ bool i915_request_retire(struct i915_request *rq) GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit)); trace_i915_request_retire(rq); + i915_request_mark_complete(rq); /* * We know the GPU must have read the request to have @@ -305,32 +300,30 @@ bool i915_request_retire(struct i915_request *rq) __i915_request_fill(rq, POISON_FREE); rq->ring->head = rq->postfix; + if (!i915_request_signaled(rq)) { + spin_lock_irq(&rq->lock); + dma_fence_signal_locked(&rq->fence); + spin_unlock_irq(&rq->lock); + } + + if (i915_request_has_waitboost(rq)) { + GEM_BUG_ON(!atomic_read(&rq->engine->gt->rps.num_waiters)); + atomic_dec(&rq->engine->gt->rps.num_waiters); + } + /* * We only loosely track inflight requests across preemption, * and so we may find ourselves attempting to retire a _completed_ * request that we have removed from the HW and put back on a run * queue. + * + * As we set I915_FENCE_FLAG_ACTIVE on the request, this should be + * after removing the breadcrumb and signaling it, so that we do not + * inadvertently attach the breadcrumb to a completed request. 
*/ remove_from_engine(rq); - - spin_lock_irq(&rq->lock); - i915_request_mark_complete(rq); - if (!i915_request_signaled(rq)) - dma_fence_signal_locked(&rq->fence); - if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags)) - i915_request_cancel_breadcrumb(rq); - if (i915_request_has_waitboost(rq)) { - GEM_BUG_ON(!atomic_read(&rq->engine->gt->rps.num_waiters)); - atomic_dec(&rq->engine->gt->rps.num_waiters); - } - if (!test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) { - set_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags); - __notify_execute_cb(rq); - } GEM_BUG_ON(!llist_empty(&rq->execute_cb)); - spin_unlock_irq(&rq->lock); - remove_from_client(rq); __list_del_entry(&rq->link); /* poison neither prev/next (RCU walks) */ intel_context_exit(rq->context); @@ -357,12 +350,6 @@ void i915_request_retire_upto(struct i915_request *rq) } while (i915_request_retire(tmp) && tmp != rq); } -static void __llist_add(struct llist_node *node, struct llist_head *head) -{ - node->next = head->first; - head->first = node; -} - static struct i915_request * const * __engine_active(struct intel_engine_cs *engine) { @@ -388,17 +375,38 @@ static bool __request_in_flight(const struct i915_request *signal) * As we know that there are always preemption points between * requests, we know that only the currently executing request * may be still active even though we have cleared the flag. - * However, we can't rely on our tracking of ELSP[0] to known + * However, we can't rely on our tracking of ELSP[0] to know * which request is currently active and so maybe stuck, as * the tracking maybe an event behind. Instead assume that * if the context is still inflight, then it is still active * even if the active flag has been cleared. + * + * To further complicate matters, if there a pending promotion, the HW + * may either perform a context switch to the second inflight execlists, + * or it may switch to the pending set of execlists. In the case of the + * latter, it may send the ACK and we process the event copying the + * pending[] over top of inflight[], _overwriting_ our *active. Since + * this implies the HW is arbitrating and not struck in *active, we do + * not worry about complete accuracy, but we do require no read/write + * tearing of the pointer [the read of the pointer must be valid, even + * as the array is being overwritten, for which we require the writes + * to avoid tearing.] + * + * Note that the read of *execlists->active may race with the promotion + * of execlists->pending[] to execlists->inflight[], overwritting + * the value at *execlists->active. This is fine. The promotion implies + * that we received an ACK from the HW, and so the context is not + * stuck -- if we do not see ourselves in *active, the inflight status + * is valid. If instead we see ourselves being copied into *active, + * we are inflight and may signal the callback. 
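/*
 * Illustrative sketch, not part of the patch: reading execlists ports that
 * another CPU may be overwriting, as the comment above describes. This
 * simplified helper omits the seqno check done by the real
 * __request_in_flight(); READ_ONCE() only ensures each load of the pointer
 * is a single, untorn read.
 */
static bool context_found_in_ports(struct i915_request * const *port,
				   const struct intel_context *ce)
{
	struct i915_request *rq;

	for (; (rq = READ_ONCE(*port)); port++) {
		if (rq->context == ce)
			return true;
	}

	return false;
}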
*/ if (!intel_context_inflight(signal->context)) return false; rcu_read_lock(); - for (port = __engine_active(signal->engine); (rq = *port); port++) { + for (port = __engine_active(signal->engine); + (rq = READ_ONCE(*port)); /* may race with promotion of pending[] */ + port++) { if (rq->context == signal->context) { inflight = i915_seqno_passed(rq->fence.seqno, signal->fence.seqno); @@ -439,18 +447,24 @@ __await_execution(struct i915_request *rq, cb->work.func = irq_execute_cb_hook; } - spin_lock_irq(&signal->lock); - if (i915_request_is_active(signal) || __request_in_flight(signal)) { - if (hook) { - hook(rq, &signal->fence); - i915_request_put(signal); - } - i915_sw_fence_complete(cb->fence); - kmem_cache_free(global.slab_execute_cbs, cb); - } else { - __llist_add(&cb->work.llnode, &signal->execute_cb); + /* + * Register the callback first, then see if the signaler is already + * active. This ensures that if we race with the + * __notify_execute_cb from i915_request_submit() and we are not + * included in that list, we get a second bite of the cherry and + * execute it ourselves. After this point, a future + * i915_request_submit() will notify us. + * + * In i915_request_retire() we set the ACTIVE bit on a completed + * request (then flush the execute_cb). So by registering the + * callback first, then checking the ACTIVE bit, we serialise with + * the completed/retired request. + */ + if (llist_add(&cb->work.llnode, &signal->execute_cb)) { + if (i915_request_is_active(signal) || + __request_in_flight(signal)) + __notify_execute_cb_imm(signal); } - spin_unlock_irq(&signal->lock); return 0; } @@ -566,18 +580,28 @@ bool __i915_request_submit(struct i915_request *request) clear_bit(I915_FENCE_FLAG_PQUEUE, &request->fence.flags); } + /* + * XXX Rollback bonded-execution on __i915_request_unsubmit()? + * + * In the future, perhaps when we have an active time-slicing scheduler, + * it will be interesting to unsubmit parallel execution and remove + * busywaits from the GPU until their master is restarted. This is + * quite hairy, we have to carefully rollback the fence and do a + * preempt-to-idle cycle on the target engine, all the while the + * master execute_cb may refire. + */ + __notify_execute_cb_irq(request); + /* We may be recursing from the signal callback of another i915 fence */ if (!i915_request_signaled(request)) { spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING); - __notify_execute_cb(request); if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) && !i915_request_enable_breadcrumb(request)) intel_engine_signal_breadcrumbs(engine); spin_unlock(&request->lock); - GEM_BUG_ON(!llist_empty(&request->execute_cb)); } return result; @@ -600,27 +624,27 @@ void __i915_request_unsubmit(struct i915_request *request) { struct intel_engine_cs *engine = request->engine; + /* + * Only unwind in reverse order, required so that the per-context list + * is kept in seqno/ring order. + */ RQ_TRACE(request, "\n"); GEM_BUG_ON(!irqs_disabled()); lockdep_assert_held(&engine->active.lock); /* - * Only unwind in reverse order, required so that the per-context list - * is kept in seqno/ring order. + * Before we remove this breadcrumb from the signal list, we have + * to ensure that a concurrent dma_fence_enable_signaling() does not + * attach itself. We first mark the request as no longer active and + * make sure that is visible to other cores, and then remove the + * breadcrumb if attached. 
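/*
 * Illustrative sketch, not part of the patch: the publish-then-recheck
 * ordering that __await_execution() relies on above, reduced to a generic
 * flag-plus-llist pair. All names are made up, and the memory ordering of
 * the real code (the ACTIVE bit is set before the flush, under the engine
 * lock) is only approximated here.
 */
#include <linux/bitops.h>
#include <linux/llist.h>

struct signal_state {
	unsigned long flags;		/* bit 0: signaler has run */
	struct llist_head cbs;		/* lock-free list of callbacks */
};

struct example_cb {
	struct llist_node llnode;
	void (*func)(struct example_cb *cb);
};

static void run_all(struct signal_state *s)
{
	struct example_cb *pos, *next;

	/* Detach the whole chain atomically, then walk it without a lock */
	llist_for_each_entry_safe(pos, next, llist_del_all(&s->cbs), llnode)
		pos->func(pos);
}

/* Signaler: make the new state visible first, then drain the list. */
static void signal_side(struct signal_state *s)
{
	set_bit(0, &s->flags);
	run_all(s);
}

/* Waiter: publish first, then re-check; one of the two sides runs the cb. */
static void waiter_side(struct signal_state *s, struct example_cb *cb)
{
	llist_add(&cb->llnode, &s->cbs);
	if (test_bit(0, &s->flags))
		run_all(s);
}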
*/ - - /* We may be recursing from the signal callback of another i915 fence */ - spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING); - + GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags)); + clear_bit_unlock(I915_FENCE_FLAG_ACTIVE, &request->fence.flags); if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags)) i915_request_cancel_breadcrumb(request); - GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags)); - clear_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags); - - spin_unlock(&request->lock); - /* We've already spun, don't charge on resubmitting. */ if (request->sched.semaphores && i915_request_started(request)) request->sched.semaphores = 0; @@ -757,7 +781,6 @@ static void __i915_request_ctor(void *arg) dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock, 0, 0); - rq->file_priv = NULL; rq->capture_list = NULL; init_llist_head(&rq->execute_cb); @@ -847,7 +870,6 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp) /* No zalloc, everything must be cleared after use */ rq->batch = NULL; - GEM_BUG_ON(rq->file_priv); GEM_BUG_ON(rq->capture_list); GEM_BUG_ON(!llist_empty(&rq->execute_cb)); @@ -1640,7 +1662,7 @@ static bool busywait_stop(unsigned long timeout, unsigned int cpu) return this_cpu != cpu; } -static bool __i915_spin_request(const struct i915_request * const rq, int state) +static bool __i915_spin_request(struct i915_request * const rq, int state) { unsigned long timeout_ns; unsigned int cpu; @@ -1673,7 +1695,7 @@ static bool __i915_spin_request(const struct i915_request * const rq, int state) timeout_ns = READ_ONCE(rq->engine->props.max_busywait_duration_ns); timeout_ns += local_clock_ns(&cpu); do { - if (i915_request_completed(rq)) + if (dma_fence_is_signaled(&rq->fence)) return true; if (signal_pending_state(state, current)) @@ -1697,7 +1719,7 @@ static void request_wait_wake(struct dma_fence *fence, struct dma_fence_cb *cb) { struct request_wait *wait = container_of(cb, typeof(*wait), cb); - wake_up_process(wait->tsk); + wake_up_process(fetch_and_zero(&wait->tsk)); } /** @@ -1766,10 +1788,8 @@ long i915_request_wait(struct i915_request *rq, * duration, which we currently lack. */ if (IS_ACTIVE(CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT) && - __i915_spin_request(rq, state)) { - dma_fence_signal(&rq->fence); + __i915_spin_request(rq, state)) goto out; - } /* * This client is about to stall waiting for the GPU. In many cases @@ -1790,15 +1810,29 @@ long i915_request_wait(struct i915_request *rq, if (dma_fence_add_callback(&rq->fence, &wait.cb, request_wait_wake)) goto out; + /* + * Flush the submission tasklet, but only if it may help this request. + * + * We sometimes experience some latency between the HW interrupts and + * tasklet execution (mostly due to ksoftirqd latency, but it can also + * be due to lazy CS events), so lets run the tasklet manually if there + * is a chance it may submit this request. If the request is not ready + * to run, as it is waiting for other fences to be signaled, flushing + * the tasklet is busy work without any advantage for this client. + * + * If the HW is being lazy, this is the last chance before we go to + * sleep to catch any pending events. We will check periodically in + * the heartbeat to flush the submission tasklets as a last resort + * for unhappy HW. 
+ */ + if (i915_request_is_ready(rq)) + intel_engine_flush_submission(rq->engine); + for (;;) { set_current_state(state); - if (i915_request_completed(rq)) { - dma_fence_signal(&rq->fence); + if (dma_fence_is_signaled(&rq->fence)) break; - } - - intel_engine_flush_submission(rq->engine); if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; @@ -1814,7 +1848,9 @@ long i915_request_wait(struct i915_request *rq, } __set_current_state(TASK_RUNNING); - dma_fence_remove_callback(&rq->fence, &wait.cb); + if (READ_ONCE(wait.tsk)) + dma_fence_remove_callback(&rq->fence, &wait.cb); + GEM_BUG_ON(!list_empty(&wait.cb.node)); out: mutex_release(&rq->engine->gt->reset.mutex.dep_map, _THIS_IP_); diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index 590762820761..16b721080195 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -284,10 +284,6 @@ struct i915_request { /** timeline->request entry for this request */ struct list_head link; - struct drm_i915_file_private *file_priv; - /** file_priv list entry for this request */ - struct list_head client_link; - I915_SELFTEST_DECLARE(struct { struct list_head link; unsigned long delay; @@ -365,10 +361,6 @@ void i915_request_submit(struct i915_request *request); void __i915_request_unsubmit(struct i915_request *request); void i915_request_unsubmit(struct i915_request *request); -/* Note: part of the intel_breadcrumbs family */ -bool i915_request_enable_breadcrumb(struct i915_request *request); -void i915_request_cancel_breadcrumb(struct i915_request *request); - long i915_request_wait(struct i915_request *rq, unsigned int flags, long timeout) diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c index 295b9829e2da..4cd2038cbe35 100644 --- a/drivers/gpu/drm/i915/i915_sw_fence.c +++ b/drivers/gpu/drm/i915/i915_sw_fence.c @@ -164,9 +164,13 @@ static void __i915_sw_fence_wake_up_all(struct i915_sw_fence *fence, do { list_for_each_entry_safe(pos, next, &x->head, entry) { - pos->func(pos, - TASK_NORMAL, fence->error, - &extra); + int wake_flags; + + wake_flags = fence->error; + if (pos->func == autoremove_wake_function) + wake_flags = 0; + + pos->func(pos, TASK_NORMAL, wake_flags, &extra); } if (list_empty(&extra)) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index bc64f773dcdb..495d28f6d160 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -291,6 +291,8 @@ i915_vma_instance(struct drm_i915_gem_object *obj, struct i915_vma_work { struct dma_fence_work base; + struct i915_address_space *vm; + struct i915_vm_pt_stash stash; struct i915_vma *vma; struct drm_i915_gem_object *pinned; struct i915_sw_dma_fence_cb cb; @@ -302,13 +304,10 @@ static int __vma_bind(struct dma_fence_work *work) { struct i915_vma_work *vw = container_of(work, typeof(*vw), base); struct i915_vma *vma = vw->vma; - int err; - err = vma->ops->bind_vma(vma->vm, vma, vw->cache_level, vw->flags); - if (err) - atomic_or(I915_VMA_ERROR, &vma->flags); - - return err; + vma->ops->bind_vma(vw->vm, &vw->stash, + vma, vw->cache_level, vw->flags); + return 0; } static void __vma_release(struct dma_fence_work *work) @@ -317,6 +316,9 @@ static void __vma_release(struct dma_fence_work *work) if (vw->pinned) __i915_gem_object_unpin_pages(vw->pinned); + + i915_vm_free_pt_stash(vw->vm, &vw->stash); + i915_vm_put(vw->vm); } static const struct dma_fence_work_ops bind_ops = { @@ -376,7 +378,6 @@ int i915_vma_bind(struct 
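/*
 * Illustrative sketch, not part of the patch: why the i915_sw_fence hunk
 * above passes 0 to autoremove_wake_function(). The fence hands its error
 * to waiters through the "flags" argument of the wait-queue callback; a
 * custom callback such as the made-up one below treats it as an errno,
 * whereas a generic sleeper (DEFINE_WAIT() + autoremove_wake_function())
 * expects ordinary wake flags in that slot.
 */
#include <linux/printk.h>
#include <linux/wait.h>

static int my_fence_wake(struct wait_queue_entry *wq_entry,
			 unsigned int mode, int flags, void *key)
{
	int error = flags;	/* i915_sw_fence passes fence->error here */

	if (error)
		pr_debug("sw fence completed with error %d\n", error);

	list_del_init(&wq_entry->entry);
	return 0;
}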
i915_vma *vma, { u32 bind_flags; u32 vma_flags; - int ret; GEM_BUG_ON(!drm_mm_node_allocated(&vma->node)); GEM_BUG_ON(vma->size > vma->node.size); @@ -433,9 +434,7 @@ int i915_vma_bind(struct i915_vma *vma, work->pinned = vma->obj; } } else { - ret = vma->ops->bind_vma(vma->vm, vma, cache_level, bind_flags); - if (ret) - return ret; + vma->ops->bind_vma(vma->vm, NULL, vma, cache_level, bind_flags); } atomic_or(bind_flags, &vma->flags); @@ -853,13 +852,19 @@ static void vma_unbind_pages(struct i915_vma *vma) __vma_put_pages(vma, count | count << I915_VMA_PAGES_BIAS); } -int i915_vma_pin(struct i915_vma *vma, u64 size, u64 alignment, u64 flags) +int i915_vma_pin_ww(struct i915_vma *vma, struct i915_gem_ww_ctx *ww, + u64 size, u64 alignment, u64 flags) { struct i915_vma_work *work = NULL; intel_wakeref_t wakeref = 0; unsigned int bound; int err; +#ifdef CONFIG_PROVE_LOCKING + if (debug_locks && lockdep_is_held(&vma->vm->i915->drm.struct_mutex)) + WARN_ON(!ww); +#endif + BUILD_BUG_ON(PIN_GLOBAL != I915_VMA_GLOBAL_BIND); BUILD_BUG_ON(PIN_USER != I915_VMA_LOCAL_BIND); @@ -873,17 +878,31 @@ int i915_vma_pin(struct i915_vma *vma, u64 size, u64 alignment, u64 flags) if (err) return err; + if (flags & PIN_GLOBAL) + wakeref = intel_runtime_pm_get(&vma->vm->i915->runtime_pm); + if (flags & vma->vm->bind_async_flags) { work = i915_vma_work(); if (!work) { err = -ENOMEM; - goto err_pages; + goto err_rpm; + } + + work->vm = i915_vm_get(vma->vm); + + /* Allocate enough page directories to used PTE */ + if (vma->vm->allocate_va_range) { + i915_vm_alloc_pt_stash(vma->vm, + &work->stash, + vma->size); + + err = i915_vm_pin_pt_stash(vma->vm, + &work->stash); + if (err) + goto err_fence; } } - if (flags & PIN_GLOBAL) - wakeref = intel_runtime_pm_get(&vma->vm->i915->runtime_pm); - /* * Differentiate between user/kernel vma inside the aliasing-ppgtt. * @@ -971,9 +990,9 @@ int i915_vma_pin(struct i915_vma *vma, u64 size, u64 alignment, u64 flags) err_fence: if (work) dma_fence_work_commit_imm(&work->base); +err_rpm: if (wakeref) intel_runtime_pm_put(&vma->vm->i915->runtime_pm, wakeref); -err_pages: vma_put_pages(vma); return err; } @@ -989,7 +1008,8 @@ static void flush_idle_contexts(struct intel_gt *gt) intel_gt_wait_for_idle(gt, MAX_SCHEDULE_TIMEOUT); } -int i915_ggtt_pin(struct i915_vma *vma, u32 align, unsigned int flags) +int i915_ggtt_pin(struct i915_vma *vma, struct i915_gem_ww_ctx *ww, + u32 align, unsigned int flags) { struct i915_address_space *vm = vma->vm; int err; @@ -997,7 +1017,7 @@ int i915_ggtt_pin(struct i915_vma *vma, u32 align, unsigned int flags) GEM_BUG_ON(!i915_vma_is_ggtt(vma)); do { - err = i915_vma_pin(vma, 0, align, flags | PIN_GLOBAL); + err = i915_vma_pin_ww(vma, ww, 0, align, flags | PIN_GLOBAL); if (err != -ENOSPC) { if (!err) { err = i915_vma_wait_for_bind(vma); @@ -1167,6 +1187,12 @@ void i915_vma_revoke_mmap(struct i915_vma *vma) list_del(&vma->obj->userfault_link); } +static int +__i915_request_await_bind(struct i915_request *rq, struct i915_vma *vma) +{ + return __i915_request_await_exclusive(rq, &vma->active); +} + int __i915_vma_move_to_active(struct i915_vma *vma, struct i915_request *rq) { int err; @@ -1174,8 +1200,7 @@ int __i915_vma_move_to_active(struct i915_vma *vma, struct i915_request *rq) GEM_BUG_ON(!i915_vma_is_pinned(vma)); /* Wait for the vma to be bound before we start! 
*/ - err = i915_request_await_active(rq, &vma->active, - I915_ACTIVE_AWAIT_EXCL); + err = __i915_request_await_bind(rq, vma); if (err) return err; diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h index d0d01f909548..5b3a3c653454 100644 --- a/drivers/gpu/drm/i915/i915_vma.h +++ b/drivers/gpu/drm/i915/i915_vma.h @@ -237,8 +237,17 @@ static inline void i915_vma_unlock(struct i915_vma *vma) } int __must_check -i915_vma_pin(struct i915_vma *vma, u64 size, u64 alignment, u64 flags); -int i915_ggtt_pin(struct i915_vma *vma, u32 align, unsigned int flags); +i915_vma_pin_ww(struct i915_vma *vma, struct i915_gem_ww_ctx *ww, + u64 size, u64 alignment, u64 flags); + +static inline int __must_check +i915_vma_pin(struct i915_vma *vma, u64 size, u64 alignment, u64 flags) +{ + return i915_vma_pin_ww(vma, NULL, size, alignment, flags); +} + +int i915_ggtt_pin(struct i915_vma *vma, struct i915_gem_ww_ctx *ww, + u32 align, unsigned int flags); static inline int i915_vma_pin_count(const struct i915_vma *vma) { diff --git a/drivers/gpu/drm/i915/selftests/i915_gem.c b/drivers/gpu/drm/i915/selftests/i915_gem.c index 88d400b9df88..23a6132c5f4e 100644 --- a/drivers/gpu/drm/i915/selftests/i915_gem.c +++ b/drivers/gpu/drm/i915/selftests/i915_gem.c @@ -199,11 +199,52 @@ static int igt_gem_hibernate(void *arg) return err; } +static int igt_gem_ww_ctx(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct drm_i915_gem_object *obj, *obj2; + struct i915_gem_ww_ctx ww; + int err = 0; + + obj = i915_gem_object_create_internal(i915, PAGE_SIZE); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + obj2 = i915_gem_object_create_internal(i915, PAGE_SIZE); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + goto put1; + } + + i915_gem_ww_ctx_init(&ww, true); +retry: + /* Lock the objects, twice for good measure (-EALREADY handling) */ + err = i915_gem_object_lock(obj, &ww); + if (!err) + err = i915_gem_object_lock_interruptible(obj, &ww); + if (!err) + err = i915_gem_object_lock_interruptible(obj2, &ww); + if (!err) + err = i915_gem_object_lock(obj2, &ww); + + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); + i915_gem_object_put(obj2); +put1: + i915_gem_object_put(obj); + return err; +} + int i915_gem_live_selftests(struct drm_i915_private *i915) { static const struct i915_subtest tests[] = { SUBTEST(igt_gem_suspend), SUBTEST(igt_gem_hibernate), + SUBTEST(igt_gem_ww_ctx), }; if (intel_gt_is_wedged(&i915->gt)) diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c index 0016ffc7d914..af8205a2bd8f 100644 --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c @@ -172,35 +172,45 @@ static int igt_ppgtt_alloc(void *arg) /* Check we can allocate the entire range */ for (size = 4096; size <= limit; size <<= 2) { - err = ppgtt->vm.allocate_va_range(&ppgtt->vm, 0, size); + struct i915_vm_pt_stash stash = {}; + + err = i915_vm_alloc_pt_stash(&ppgtt->vm, &stash, size); + if (err) + goto err_ppgtt_cleanup; + + err = i915_vm_pin_pt_stash(&ppgtt->vm, &stash); if (err) { - if (err == -ENOMEM) { - pr_info("[1] Ran out of memory for va_range [0 + %llx] [bit %d]\n", - size, ilog2(size)); - err = 0; /* virtual space too large! 
*/ - } + i915_vm_free_pt_stash(&ppgtt->vm, &stash); goto err_ppgtt_cleanup; } + ppgtt->vm.allocate_va_range(&ppgtt->vm, &stash, 0, size); cond_resched(); ppgtt->vm.clear_range(&ppgtt->vm, 0, size); + + i915_vm_free_pt_stash(&ppgtt->vm, &stash); } /* Check we can incrementally allocate the entire range */ for (last = 0, size = 4096; size <= limit; last = size, size <<= 2) { - err = ppgtt->vm.allocate_va_range(&ppgtt->vm, - last, size - last); + struct i915_vm_pt_stash stash = {}; + + err = i915_vm_alloc_pt_stash(&ppgtt->vm, &stash, size - last); + if (err) + goto err_ppgtt_cleanup; + + err = i915_vm_pin_pt_stash(&ppgtt->vm, &stash); if (err) { - if (err == -ENOMEM) { - pr_info("[2] Ran out of memory for va_range [%llx + %llx] [bit %d]\n", - last, size - last, ilog2(size)); - err = 0; /* virtual space too large! */ - } + i915_vm_free_pt_stash(&ppgtt->vm, &stash); goto err_ppgtt_cleanup; } + ppgtt->vm.allocate_va_range(&ppgtt->vm, &stash, + last, size - last); cond_resched(); + + i915_vm_free_pt_stash(&ppgtt->vm, &stash); } err_ppgtt_cleanup: @@ -284,9 +294,23 @@ static int lowlevel_hole(struct i915_address_space *vm, break; } - if (vm->allocate_va_range && - vm->allocate_va_range(vm, addr, BIT_ULL(size))) - break; + if (vm->allocate_va_range) { + struct i915_vm_pt_stash stash = {}; + + if (i915_vm_alloc_pt_stash(vm, &stash, + BIT_ULL(size))) + break; + + if (i915_vm_pin_pt_stash(vm, &stash)) { + i915_vm_free_pt_stash(vm, &stash); + break; + } + + vm->allocate_va_range(vm, &stash, + addr, BIT_ULL(size)); + + i915_vm_free_pt_stash(vm, &stash); + } mock_vma->pages = obj->mm.pages; mock_vma->node.size = BIT_ULL(size); @@ -1881,6 +1905,7 @@ static int igt_cs_tlb(void *arg) continue; while (!__igt_timeout(end_time, NULL)) { + struct i915_vm_pt_stash stash = {}; struct i915_request *rq; u64 offset; @@ -1888,10 +1913,6 @@ static int igt_cs_tlb(void *arg) 0, vm->total - PAGE_SIZE, chunk_size, PAGE_SIZE); - err = vm->allocate_va_range(vm, offset, chunk_size); - if (err) - goto end; - memset32(result, STACK_MAGIC, PAGE_SIZE / sizeof(u32)); vma = i915_vma_instance(bbe, vm, NULL); @@ -1904,6 +1925,20 @@ static int igt_cs_tlb(void *arg) if (err) goto end; + err = i915_vm_alloc_pt_stash(vm, &stash, chunk_size); + if (err) + goto end; + + err = i915_vm_pin_pt_stash(vm, &stash); + if (err) { + i915_vm_free_pt_stash(vm, &stash); + goto end; + } + + vm->allocate_va_range(vm, &stash, offset, chunk_size); + + i915_vm_free_pt_stash(vm, &stash); + /* Prime the TLB with the dummy pages */ for (i = 0; i < count; i++) { vma->node.start = offset + i * PAGE_SIZE; diff --git a/drivers/gpu/drm/i915/selftests/i915_perf.c b/drivers/gpu/drm/i915/selftests/i915_perf.c index c2d001d9c0ec..debbac660519 100644 --- a/drivers/gpu/drm/i915/selftests/i915_perf.c +++ b/drivers/gpu/drm/i915/selftests/i915_perf.c @@ -307,7 +307,7 @@ static int live_noa_gpr(void *arg) } /* Poison the ce->vm so we detect writes not to the GGTT gt->scratch */ - scratch = kmap(ce->vm->scratch[0].base.page); + scratch = kmap(__px_page(ce->vm->scratch[0])); memset(scratch, POISON_FREE, PAGE_SIZE); rq = intel_context_create_request(ce); @@ -405,7 +405,7 @@ static int live_noa_gpr(void *arg) out_rq: i915_request_put(rq); out_ce: - kunmap(ce->vm->scratch[0].base.page); + kunmap(__px_page(ce->vm->scratch[0])); intel_context_put(ce); out: stream_destroy(stream); diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c index c1dcd4b91bda..3092ca763789 100644 --- a/drivers/gpu/drm/i915/selftests/i915_request.c +++ 
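/*
 * Illustrative sketch, not part of the patch: the two-phase page-table
 * allocation used throughout the hunks above. The directories are reserved
 * and pinned up front, where failing or sleeping is still allowed; after
 * that, allocate_va_range() consumes the stash and its callers above no
 * longer check a return value. prealloc_and_bind() is a made-up wrapper.
 */
static int prealloc_and_bind(struct i915_address_space *vm,
			     u64 start, u64 length)
{
	struct i915_vm_pt_stash stash = {};
	int err;

	err = i915_vm_alloc_pt_stash(vm, &stash, length);
	if (err)
		return err;

	err = i915_vm_pin_pt_stash(vm, &stash);
	if (err)
		goto out_free;

	/* Draws page directories from the stash instead of allocating */
	vm->allocate_va_range(vm, &stash, start, length);

out_free:
	/* Frees whatever the range allocation did not consume */
	i915_vm_free_pt_stash(vm, &stash);
	return err;
}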
b/drivers/gpu/drm/i915/selftests/i915_request.c @@ -862,6 +862,8 @@ static int live_all_engines(void *arg) goto out_free; } + i915_vma_lock(batch); + idx = 0; for_each_uabi_engine(engine, i915) { request[idx] = intel_engine_create_kernel_request(engine); @@ -872,11 +874,9 @@ static int live_all_engines(void *arg) goto out_request; } - i915_vma_lock(batch); err = i915_request_await_object(request[idx], batch->obj, 0); if (err == 0) err = i915_vma_move_to_active(batch, request[idx], 0); - i915_vma_unlock(batch); GEM_BUG_ON(err); err = engine->emit_bb_start(request[idx], @@ -891,6 +891,8 @@ static int live_all_engines(void *arg) idx++; } + i915_vma_unlock(batch); + idx = 0; for_each_uabi_engine(engine, i915) { if (i915_request_completed(request[idx])) { @@ -981,12 +983,13 @@ static int live_sequential_engines(void *arg) goto out_free; } + i915_vma_lock(batch); request[idx] = intel_engine_create_kernel_request(engine); if (IS_ERR(request[idx])) { err = PTR_ERR(request[idx]); pr_err("%s: Request allocation failed for %s with err=%d\n", __func__, engine->name, err); - goto out_request; + goto out_unlock; } if (prev) { @@ -996,16 +999,14 @@ static int live_sequential_engines(void *arg) i915_request_add(request[idx]); pr_err("%s: Request await failed for %s with err=%d\n", __func__, engine->name, err); - goto out_request; + goto out_unlock; } } - i915_vma_lock(batch); err = i915_request_await_object(request[idx], batch->obj, false); if (err == 0) err = i915_vma_move_to_active(batch, request[idx], 0); - i915_vma_unlock(batch); GEM_BUG_ON(err); err = engine->emit_bb_start(request[idx], @@ -1020,6 +1021,11 @@ static int live_sequential_engines(void *arg) prev = request[idx]; idx++; + +out_unlock: + i915_vma_unlock(batch); + if (err) + goto out_request; } idx = 0; diff --git a/drivers/gpu/drm/i915/selftests/i915_vma.c b/drivers/gpu/drm/i915/selftests/i915_vma.c index af89c7fc8f59..88c5e9acb84c 100644 --- a/drivers/gpu/drm/i915/selftests/i915_vma.c +++ b/drivers/gpu/drm/i915/selftests/i915_vma.c @@ -892,7 +892,7 @@ static int igt_vma_remapped_gtt(void *arg) unsigned int x, y; int err; - i915_gem_object_lock(obj); + i915_gem_object_lock(obj, NULL); err = i915_gem_object_set_to_gtt_domain(obj, true); i915_gem_object_unlock(obj); if (err) diff --git a/drivers/gpu/drm/i915/selftests/intel_memory_region.c b/drivers/gpu/drm/i915/selftests/intel_memory_region.c index 6e80d99048e4..93a38a323584 100644 --- a/drivers/gpu/drm/i915/selftests/intel_memory_region.c +++ b/drivers/gpu/drm/i915/selftests/intel_memory_region.c @@ -509,7 +509,7 @@ static int igt_lmem_write_cpu(void *arg) if (err) goto out_unpin; - i915_gem_object_lock(obj); + i915_gem_object_lock(obj, NULL); err = i915_gem_object_set_to_wc_domain(obj, true); i915_gem_object_unlock(obj); if (err) @@ -522,9 +522,9 @@ static int igt_lmem_write_cpu(void *arg) goto out_unpin; } - /* We want to throw in a random width/align */ - bytes[0] = igt_random_offset(&prng, 0, PAGE_SIZE, sizeof(u32), - sizeof(u32)); + /* A random multiple of u32, picked between [64, PAGE_SIZE - 64] */ + bytes[0] = igt_random_offset(&prng, 64, PAGE_SIZE - 64, 0, sizeof(u32)); + GEM_BUG_ON(!IS_ALIGNED(bytes[0], sizeof(u32))); i = 0; do { diff --git a/drivers/gpu/drm/i915/selftests/mock_gtt.c b/drivers/gpu/drm/i915/selftests/mock_gtt.c index b173086411ef..7270fc8ca801 100644 --- a/drivers/gpu/drm/i915/selftests/mock_gtt.c +++ b/drivers/gpu/drm/i915/selftests/mock_gtt.c @@ -38,14 +38,14 @@ static void mock_insert_entries(struct i915_address_space *vm, { } -static int mock_bind_ppgtt(struct 
i915_address_space *vm, - struct i915_vma *vma, - enum i915_cache_level cache_level, - u32 flags) +static void mock_bind_ppgtt(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags) { GEM_BUG_ON(flags & I915_VMA_GLOBAL_BIND); set_bit(I915_VMA_LOCAL_BIND_BIT, __i915_vma_flags(vma)); - return 0; } static void mock_unbind_ppgtt(struct i915_address_space *vm, @@ -74,9 +74,12 @@ struct i915_ppgtt *mock_ppgtt(struct drm_i915_private *i915, const char *name) ppgtt->vm.i915 = i915; ppgtt->vm.total = round_down(U64_MAX, PAGE_SIZE); ppgtt->vm.file = ERR_PTR(-ENODEV); + ppgtt->vm.dma = &i915->drm.pdev->dev; i915_address_space_init(&ppgtt->vm, VM_CLASS_PPGTT); + ppgtt->vm.alloc_pt_dma = alloc_pt_dma; + ppgtt->vm.clear_range = mock_clear_range; ppgtt->vm.insert_page = mock_insert_page; ppgtt->vm.insert_entries = mock_insert_entries; @@ -90,13 +93,12 @@ struct i915_ppgtt *mock_ppgtt(struct drm_i915_private *i915, const char *name) return ppgtt; } -static int mock_bind_ggtt(struct i915_address_space *vm, - struct i915_vma *vma, - enum i915_cache_level cache_level, - u32 flags) +static void mock_bind_ggtt(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags) { - atomic_or(I915_VMA_GLOBAL_BIND | I915_VMA_LOCAL_BIND, &vma->flags); - return 0; } static void mock_unbind_ggtt(struct i915_address_space *vm, @@ -116,6 +118,8 @@ void mock_init_ggtt(struct drm_i915_private *i915, struct i915_ggtt *ggtt) ggtt->mappable_end = resource_size(&ggtt->gmadr); ggtt->vm.total = 4096 * PAGE_SIZE; + ggtt->vm.alloc_pt_dma = alloc_pt_dma; + ggtt->vm.clear_range = mock_clear_range; ggtt->vm.insert_page = mock_insert_page; ggtt->vm.insert_entries = mock_insert_entries;