From 182d919b84902eece162c63ed3d476c8016b4197 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Feb 2015 23:47:31 +0000 Subject: FS-Cache: Count culled objects and objects rejected due to lack of space Count the number of objects that get culled by the cache backend and the number of objects that the cache backend declines to instantiate due to lack of space in the cache. These numbers are made available through /proc/fs/fscache/stats Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- include/linux/fscache-cache.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 771484993ca7..c9dafdaf3347 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -371,6 +371,7 @@ struct fscache_object { #define FSCACHE_OBJECT_IS_LOOKED_UP 4 /* T if object has been looked up */ #define FSCACHE_OBJECT_IS_AVAILABLE 5 /* T if object has become active */ #define FSCACHE_OBJECT_RETIRED 6 /* T if object was retired on relinquishment */ +#define FSCACHE_OBJECT_KILLED_BY_CACHE 7 /* T if object was killed by the cache */ struct list_head cache_link; /* link in cache->object_list */ struct hlist_node cookie_link; /* link in cookie->backing_objects */ @@ -551,4 +552,15 @@ extern enum fscache_checkaux fscache_check_aux(struct fscache_object *object, const void *data, uint16_t datalen); +extern void fscache_object_retrying_stale(struct fscache_object *object); + +enum fscache_why_object_killed { + FSCACHE_OBJECT_IS_STALE, + FSCACHE_OBJECT_NO_SPACE, + FSCACHE_OBJECT_WAS_RETIRED, + FSCACHE_OBJECT_WAS_CULLED, +}; +extern void fscache_object_mark_killed(struct fscache_object *object, + enum fscache_why_object_killed why); + #endif /* _LINUX_FSCACHE_CACHE_H */ -- cgit v1.2.3 From 30ceec6284129662efc3a1e7675b2bd857a046fe Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Feb 2015 10:05:27 +0000 Subject: FS-Cache: When submitting an op, cancel it if the target object is dying When submitting an operation, prefer to cancel the operation immediately rather than queuing it for later processing if the object is marked as dying (ie. the object state machine has reached the KILL_OBJECT state). Whilst we're at it, change the series of related test_bit() calls into a READ_ONCE() and bitwise-AND operators to reduce the number of load instructions (test_bit() has a volatile address). Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- fs/fscache/operation.c | 47 +++++++++++++++++++++++++++---------------- include/linux/fscache-cache.h | 9 +++++++-- 2 files changed, 37 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 68ead8482617..dec6defe3be3 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -120,6 +120,8 @@ static void fscache_report_unexpected_submission(struct fscache_object *object, int fscache_submit_exclusive_op(struct fscache_object *object, struct fscache_operation *op) { + const struct fscache_state *ostate; + unsigned long flags; int ret; _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); @@ -132,8 +134,19 @@ int fscache_submit_exclusive_op(struct fscache_object *object, ASSERTCMP(object->n_ops, >=, object->n_exclusive); ASSERT(list_empty(&op->pend_link)); + ostate = object->state; + smp_rmb(); + op->state = FSCACHE_OP_ST_PENDING; - if (fscache_object_is_active(object)) { + flags = READ_ONCE(object->flags); + if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { + fscache_stat(&fscache_n_op_rejected); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; + } else if (unlikely(fscache_cache_is_broken(object))) { + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -EIO; + } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { op->object = object; object->n_ops++; object->n_exclusive++; /* reads and writes must wait */ @@ -155,7 +168,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object, /* need to issue a new write op after this */ clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); ret = 0; - } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { + } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) { op->object = object; object->n_ops++; object->n_exclusive++; /* reads and writes must wait */ @@ -164,11 +177,9 @@ int fscache_submit_exclusive_op(struct fscache_object *object, fscache_stat(&fscache_n_op_pend); ret = 0; } else { - /* If we're in any other state, there must have been an I/O - * error of some nature. - */ - ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags)); - ret = -EIO; + fscache_report_unexpected_submission(object, op, ostate); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; } spin_unlock(&object->lock); @@ -187,6 +198,7 @@ int fscache_submit_op(struct fscache_object *object, struct fscache_operation *op) { const struct fscache_state *ostate; + unsigned long flags; int ret; _enter("{OBJ%x OP%x},{%u}", @@ -204,7 +216,15 @@ int fscache_submit_op(struct fscache_object *object, smp_rmb(); op->state = FSCACHE_OP_ST_PENDING; - if (fscache_object_is_active(object)) { + flags = READ_ONCE(object->flags); + if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { + fscache_stat(&fscache_n_op_rejected); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; + } else if (unlikely(fscache_cache_is_broken(object))) { + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -EIO; + } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { op->object = object; object->n_ops++; @@ -222,25 +242,18 @@ int fscache_submit_op(struct fscache_object *object, fscache_run_op(object, op); } ret = 0; - } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { + } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) { op->object = object; object->n_ops++; atomic_inc(&op->usage); list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); ret = 0; - } else if (fscache_object_is_dying(object)) { - fscache_stat(&fscache_n_op_rejected); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; - } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) { + } else { fscache_report_unexpected_submission(object, op, ostate); ASSERT(!fscache_object_is_active(object)); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; - } else { - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; } spin_unlock(&object->lock); diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index c9dafdaf3347..2e83a141e465 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -411,17 +411,22 @@ static inline bool fscache_object_is_available(struct fscache_object *object) return test_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags); } +static inline bool fscache_cache_is_broken(struct fscache_object *object) +{ + return test_bit(FSCACHE_IOERROR, &object->cache->flags); +} + static inline bool fscache_object_is_active(struct fscache_object *object) { return fscache_object_is_available(object) && fscache_object_is_live(object) && - !test_bit(FSCACHE_IOERROR, &object->cache->flags); + !fscache_cache_is_broken(object); } static inline bool fscache_object_is_dead(struct fscache_object *object) { return fscache_object_is_dying(object) && - test_bit(FSCACHE_IOERROR, &object->cache->flags); + fscache_cache_is_broken(object); } /** -- cgit v1.2.3 From 87021526300f1a292dd966e141e183630ac95317 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Feb 2015 10:52:51 +0000 Subject: FS-Cache: fscache_object_is_dead() has wrong logic, kill it fscache_object_is_dead() returns true only if the object is marked dead and the cache got an I/O error. This should be a logical OR instead. Since two of the callers got split up into handling for separate subcases, expand the other callers and kill the function. This is probably the right thing to do anyway since one of the subcases isn't about the object at all, but rather about the cache. Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- fs/fscache/cookie.c | 3 ++- fs/fscache/page.c | 6 ++++-- include/linux/fscache-cache.h | 6 ------ 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 89acec742e0b..8de22164f5fb 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -327,7 +327,8 @@ static int fscache_alloc_object(struct fscache_cache *cache, object_already_extant: ret = -ENOBUFS; - if (fscache_object_is_dead(object)) { + if (fscache_object_is_dying(object) || + fscache_cache_is_broken(object)) { spin_unlock(&cookie->lock); goto error; } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index de33b3fccca6..d0805e31361c 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -377,11 +377,13 @@ check_if_dead: _leave(" = -ENOBUFS [cancelled]"); return -ENOBUFS; } - if (unlikely(fscache_object_is_dead(object))) { - pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->state); + if (unlikely(fscache_object_is_dying(object) || + fscache_cache_is_broken(object))) { + enum fscache_operation_state state = op->state; fscache_cancel_op(op, do_cancel); if (stat_object_dead) fscache_stat(stat_object_dead); + _leave(" = -ENOBUFS [obj dead %d]", state); return -ENOBUFS; } return 0; diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 2e83a141e465..ca3d550da11e 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -423,12 +423,6 @@ static inline bool fscache_object_is_active(struct fscache_object *object) !fscache_cache_is_broken(object); } -static inline bool fscache_object_is_dead(struct fscache_object *object) -{ - return fscache_object_is_dying(object) && - fscache_cache_is_broken(object); -} - /** * fscache_object_destroyed - Note destruction of an object in a cache * @cache: The cache from which the object came -- cgit v1.2.3 From 1339ec98e32b4bc8efb6fbb71c006a465130aaba Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 25 Feb 2015 13:26:39 +0000 Subject: FS-Cache: Out of line fscache_operation_init() Out of line fscache_operation_init() so that it can access internal FS-Cache features, such as stats, in a later commit. Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- fs/fscache/operation.c | 22 ++++++++++++++++++++++ include/linux/fscache-cache.h | 24 +++--------------------- 2 files changed, 25 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 67594a8d791a..61a6e78b85fa 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -20,6 +20,28 @@ atomic_t fscache_op_debug_id; EXPORT_SYMBOL(fscache_op_debug_id); +/** + * fscache_operation_init - Do basic initialisation of an operation + * @op: The operation to initialise + * @release: The release function to assign + * + * Do basic initialisation of an operation. The caller must still set flags, + * object and processor if needed. + */ +void fscache_operation_init(struct fscache_operation *op, + fscache_operation_processor_t processor, + fscache_operation_release_t release) +{ + INIT_WORK(&op->work, fscache_op_work_func); + atomic_set(&op->usage, 1); + op->state = FSCACHE_OP_ST_INITIALISED; + op->debug_id = atomic_inc_return(&fscache_op_debug_id); + op->processor = processor; + op->release = release; + INIT_LIST_HEAD(&op->pend_link); +} +EXPORT_SYMBOL(fscache_operation_init); + /** * fscache_enqueue_operation - Enqueue an operation for processing * @op: The operation to enqueue diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index ca3d550da11e..0e26d49972e3 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -119,27 +119,9 @@ extern void fscache_op_work_func(struct work_struct *work); extern void fscache_enqueue_operation(struct fscache_operation *); extern void fscache_op_complete(struct fscache_operation *, bool); extern void fscache_put_operation(struct fscache_operation *); - -/** - * fscache_operation_init - Do basic initialisation of an operation - * @op: The operation to initialise - * @release: The release function to assign - * - * Do basic initialisation of an operation. The caller must still set flags, - * object and processor if needed. - */ -static inline void fscache_operation_init(struct fscache_operation *op, - fscache_operation_processor_t processor, - fscache_operation_release_t release) -{ - INIT_WORK(&op->work, fscache_op_work_func); - atomic_set(&op->usage, 1); - op->state = FSCACHE_OP_ST_INITIALISED; - op->debug_id = atomic_inc_return(&fscache_op_debug_id); - op->processor = processor; - op->release = release; - INIT_LIST_HEAD(&op->pend_link); -} +extern void fscache_operation_init(struct fscache_operation *, + fscache_operation_processor_t, + fscache_operation_release_t); /* * data read operation -- cgit v1.2.3 From d3b97ca4a99e4e6c78f5a21c968eadf5c8ba9971 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Feb 2015 10:05:29 +0000 Subject: FS-Cache: The operation cancellation method needs calling in more places Any time an incomplete operation is cancelled, the operation cancellation function needs to be called to clean up. This is currently being passed directly to some of the functions that might want to call it, but not all. Instead, pass the cancellation method pointer to the fscache_operation_init() and have that cache it in the operation struct. Further, plug in a dummy cancellation handler if the caller declines to set one as this allows us to call the function unconditionally (the extra overhead isn't worth bothering about as we don't expect to be calling this typically). The cancellation method must thence be called everywhere the CANCELLED state is set. Note that we call it *before* setting the CANCELLED state such that the method can use the old state value to guide its operation. fscache_do_cancel_retrieval() needs moving higher up in the sources so that the init function can use it now. Without this, the following oops may be seen: FS-Cache: Assertion failed FS-Cache: 3 == 0 is false ------------[ cut here ]------------ kernel BUG at ../fs/fscache/page.c:261! ... RIP: 0010:[] fscache_release_retrieval_op+0x77/0x100 [] fscache_put_operation+0x114/0x2da [] __fscache_read_or_alloc_pages+0x358/0x3b3 [] __nfs_readpages_from_fscache+0x59/0xbf [nfs] [] nfs_readpages+0x10c/0x185 [nfs] [] ? alloc_pages_current+0x119/0x13e [] ? __page_cache_alloc+0xfb/0x10a [] __do_page_cache_readahead+0x188/0x22c [] ondemand_readahead+0x29e/0x2af [] page_cache_sync_readahead+0x38/0x3a [] generic_file_read_iter+0x1a2/0x55a [] ? nfs_revalidate_mapping+0xd6/0x288 [nfs] [] nfs_file_read+0x49/0x70 [nfs] [] new_sync_read+0x78/0x9c [] __vfs_read+0x13/0x38 [] vfs_read+0x95/0x121 [] SyS_read+0x4c/0x8a [] system_call_fastpath+0x12/0x17 The assertion is showing that the remaining number of pages (n_pages) is not 0 when the operation is being released. Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- fs/fscache/cookie.c | 5 ++--- fs/fscache/internal.h | 7 ++----- fs/fscache/object.c | 3 ++- fs/fscache/operation.c | 31 ++++++++++++++++++++++------- fs/fscache/page.c | 46 +++++++++++++++++++++---------------------- include/linux/fscache-cache.h | 5 +++++ 6 files changed, 57 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 8de22164f5fb..d403c69bee08 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -672,7 +672,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) if (!op) return -ENOMEM; - fscache_operation_init(op, NULL, NULL); + fscache_operation_init(op, NULL, NULL, NULL); op->flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING) | (1 << FSCACHE_OP_UNUSE_COOKIE); @@ -696,8 +696,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) /* the work queue now carries its own ref on the object */ spin_unlock(&cookie->lock); - ret = fscache_wait_for_operation_activation(object, op, - NULL, NULL, NULL); + ret = fscache_wait_for_operation_activation(object, op, NULL, NULL); if (ret == 0) { /* ask the cache to honour the operation */ ret = object->cache->ops->check_consistency(op); diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index a63225116db6..97ec45110957 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -124,9 +124,7 @@ extern int fscache_submit_exclusive_op(struct fscache_object *, struct fscache_operation *); extern int fscache_submit_op(struct fscache_object *, struct fscache_operation *); -extern int fscache_cancel_op(struct fscache_operation *, - void (*)(struct fscache_operation *), - bool); +extern int fscache_cancel_op(struct fscache_operation *, bool); extern void fscache_cancel_all_ops(struct fscache_object *); extern void fscache_abort_object(struct fscache_object *); extern void fscache_start_operations(struct fscache_object *); @@ -139,8 +137,7 @@ extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *); extern int fscache_wait_for_operation_activation(struct fscache_object *, struct fscache_operation *, atomic_t *, - atomic_t *, - void (*)(struct fscache_operation *)); + atomic_t *); extern void fscache_invalidate_writes(struct fscache_cookie *); /* diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 40049f7505f0..9e792e30f4db 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -961,7 +961,8 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj if (!op) goto nomem; - fscache_operation_init(op, object->cache->ops->invalidate_object, NULL); + fscache_operation_init(op, object->cache->ops->invalidate_object, + NULL, NULL); op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE) | (1 << FSCACHE_OP_UNUSE_COOKIE); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index c76c09730768..57d4abb68656 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -20,6 +20,10 @@ atomic_t fscache_op_debug_id; EXPORT_SYMBOL(fscache_op_debug_id); +static void fscache_operation_dummy_cancel(struct fscache_operation *op) +{ +} + /** * fscache_operation_init - Do basic initialisation of an operation * @op: The operation to initialise @@ -30,6 +34,7 @@ EXPORT_SYMBOL(fscache_op_debug_id); */ void fscache_operation_init(struct fscache_operation *op, fscache_operation_processor_t processor, + fscache_operation_cancel_t cancel, fscache_operation_release_t release) { INIT_WORK(&op->work, fscache_op_work_func); @@ -37,6 +42,7 @@ void fscache_operation_init(struct fscache_operation *op, op->state = FSCACHE_OP_ST_INITIALISED; op->debug_id = atomic_inc_return(&fscache_op_debug_id); op->processor = processor; + op->cancel = cancel ?: fscache_operation_dummy_cancel; op->release = release; INIT_LIST_HEAD(&op->pend_link); fscache_stat(&fscache_n_op_initialised); @@ -164,9 +170,11 @@ int fscache_submit_exclusive_op(struct fscache_object *object, flags = READ_ONCE(object->flags); if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { fscache_stat(&fscache_n_op_rejected); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } else if (unlikely(fscache_cache_is_broken(object))) { + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -EIO; } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { @@ -200,10 +208,12 @@ int fscache_submit_exclusive_op(struct fscache_object *object, fscache_stat(&fscache_n_op_pend); ret = 0; } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) { + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } else { fscache_report_unexpected_submission(object, op, ostate); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } @@ -245,9 +255,11 @@ int fscache_submit_op(struct fscache_object *object, flags = READ_ONCE(object->flags); if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { fscache_stat(&fscache_n_op_rejected); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } else if (unlikely(fscache_cache_is_broken(object))) { + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -EIO; } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { @@ -276,11 +288,13 @@ int fscache_submit_op(struct fscache_object *object, fscache_stat(&fscache_n_op_pend); ret = 0; } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) { + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } else { fscache_report_unexpected_submission(object, op, ostate); ASSERT(!fscache_object_is_active(object)); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } @@ -335,7 +349,6 @@ void fscache_start_operations(struct fscache_object *object) * cancel an operation that's pending on an object */ int fscache_cancel_op(struct fscache_operation *op, - void (*do_cancel)(struct fscache_operation *), bool cancel_in_progress_op) { struct fscache_object *object = op->object; @@ -355,9 +368,9 @@ int fscache_cancel_op(struct fscache_operation *op, ASSERT(!list_empty(&op->pend_link)); list_del_init(&op->pend_link); put = true; + fscache_stat(&fscache_n_op_cancelled); - if (do_cancel) - do_cancel(op); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) object->n_exclusive--; @@ -373,8 +386,7 @@ int fscache_cancel_op(struct fscache_operation *op, fscache_start_operations(object); fscache_stat(&fscache_n_op_cancelled); - if (do_cancel) - do_cancel(op); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) object->n_exclusive--; @@ -408,6 +420,7 @@ void fscache_cancel_all_ops(struct fscache_object *object) list_del_init(&op->pend_link); ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) @@ -440,8 +453,12 @@ void fscache_op_complete(struct fscache_operation *op, bool cancelled) spin_lock(&object->lock); - op->state = cancelled ? - FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE; + if (!cancelled) { + op->state = FSCACHE_OP_ST_COMPLETE; + } else { + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + } if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) object->n_exclusive--; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 7db9752252ef..d1b23a67c031 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -213,7 +213,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) return -ENOMEM; } - fscache_operation_init(op, fscache_attr_changed_op, NULL); + fscache_operation_init(op, fscache_attr_changed_op, NULL, NULL); op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE) | (1 << FSCACHE_OP_UNUSE_COOKIE); @@ -248,6 +248,17 @@ nobufs: } EXPORT_SYMBOL(__fscache_attr_changed); +/* + * Handle cancellation of a pending retrieval op + */ +static void fscache_do_cancel_retrieval(struct fscache_operation *_op) +{ + struct fscache_retrieval *op = + container_of(_op, struct fscache_retrieval, op); + + atomic_set(&op->n_pages, 0); +} + /* * release a retrieval op reference */ @@ -285,7 +296,9 @@ static struct fscache_retrieval *fscache_alloc_retrieval( return NULL; } - fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); + fscache_operation_init(&op->op, NULL, + fscache_do_cancel_retrieval, + fscache_release_retrieval_op); op->op.flags = FSCACHE_OP_MYTHREAD | (1UL << FSCACHE_OP_WAITING) | (1UL << FSCACHE_OP_UNUSE_COOKIE); @@ -329,25 +342,13 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) return 0; } -/* - * Handle cancellation of a pending retrieval op - */ -static void fscache_do_cancel_retrieval(struct fscache_operation *_op) -{ - struct fscache_retrieval *op = - container_of(_op, struct fscache_retrieval, op); - - atomic_set(&op->n_pages, 0); -} - /* * wait for an object to become active (or dead) */ int fscache_wait_for_operation_activation(struct fscache_object *object, struct fscache_operation *op, atomic_t *stat_op_waits, - atomic_t *stat_object_dead, - void (*do_cancel)(struct fscache_operation *)) + atomic_t *stat_object_dead) { int ret; @@ -359,7 +360,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, fscache_stat(stat_op_waits); if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, TASK_INTERRUPTIBLE) != 0) { - ret = fscache_cancel_op(op, do_cancel, false); + ret = fscache_cancel_op(op, false); if (ret == 0) return -ERESTARTSYS; @@ -380,7 +381,7 @@ check_if_dead: if (unlikely(fscache_object_is_dying(object) || fscache_cache_is_broken(object))) { enum fscache_operation_state state = op->state; - fscache_cancel_op(op, do_cancel, true); + fscache_cancel_op(op, true); if (stat_object_dead) fscache_stat(stat_object_dead); _leave(" = -ENOBUFS [obj dead %d]", state); @@ -464,8 +465,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, ret = fscache_wait_for_operation_activation( object, &op->op, __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead), - fscache_do_cancel_retrieval); + __fscache_stat(&fscache_n_retrievals_object_dead)); if (ret < 0) goto error; @@ -595,8 +595,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, ret = fscache_wait_for_operation_activation( object, &op->op, __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead), - fscache_do_cancel_retrieval); + __fscache_stat(&fscache_n_retrievals_object_dead)); if (ret < 0) goto error; @@ -702,8 +701,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, ret = fscache_wait_for_operation_activation( object, &op->op, __fscache_stat(&fscache_n_alloc_op_waits), - __fscache_stat(&fscache_n_allocs_object_dead), - fscache_do_cancel_retrieval); + __fscache_stat(&fscache_n_allocs_object_dead)); if (ret < 0) goto error; @@ -946,7 +944,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, if (!op) goto nomem; - fscache_operation_init(&op->op, fscache_write_op, + fscache_operation_init(&op->op, fscache_write_op, NULL, fscache_release_write_op); op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING) | diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 0e26d49972e3..69c6eb10d858 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -74,6 +74,7 @@ extern wait_queue_head_t fscache_cache_cleared_wq; */ typedef void (*fscache_operation_release_t)(struct fscache_operation *op); typedef void (*fscache_operation_processor_t)(struct fscache_operation *op); +typedef void (*fscache_operation_cancel_t)(struct fscache_operation *op); enum fscache_operation_state { FSCACHE_OP_ST_BLANK, /* Op is not yet submitted */ @@ -109,6 +110,9 @@ struct fscache_operation { * the op in a non-pool thread */ fscache_operation_processor_t processor; + /* Operation cancellation cleanup (optional) */ + fscache_operation_cancel_t cancel; + /* operation releaser */ fscache_operation_release_t release; }; @@ -121,6 +125,7 @@ extern void fscache_op_complete(struct fscache_operation *, bool); extern void fscache_put_operation(struct fscache_operation *); extern void fscache_operation_init(struct fscache_operation *, fscache_operation_processor_t, + fscache_operation_cancel_t, fscache_operation_release_t); /* -- cgit v1.2.3 From 4a47132ff472a0c2c5441baeb50cf97f2580bc43 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Feb 2015 10:05:29 +0000 Subject: FS-Cache: Retain the netfs context in the retrieval op earlier Now that the retrieval operation may be disposed of by fscache_put_operation() before we actually set the context, the retrieval-specific cleanup operation can produce a NULL-pointer dereference when it tries to unconditionally clean up the netfs context. Given that it is expected that we'll get at least as far as the place where we currently set the context pointer and it is unlikely we'll go through the error handling paths prior to that point, retain the context right from the point that the retrieval op is allocated. Concomitant to this, we need to retain the cookie pointer in the retrieval op also so that we can call the netfs to release its context in the release method. In addition, we might now get into fscache_release_retrieval_op() with the op only initialised. To this end, set the operation to DEAD only after the release method has been called and skip the n_pages test upon cleanup if the op is still in the INITIALISED state. Without these changes, the following oops might be seen: BUG: unable to handle kernel NULL pointer dereference at 00000000000000b8 ... RIP: 0010:[] fscache_release_retrieval_op+0xae/0x100 ... Call Trace: [] fscache_put_operation+0x117/0x2e0 [] __fscache_read_or_alloc_pages+0x351/0x3ac [] __nfs_readpages_from_fscache+0x59/0xbf [nfs] [] nfs_readpages+0x10c/0x185 [nfs] [] ? alloc_pages_current+0x119/0x13e [] ? __page_cache_alloc+0xfb/0x10a [] __do_page_cache_readahead+0x188/0x22c [] ondemand_readahead+0x29e/0x2af [] page_cache_sync_readahead+0x38/0x3a [] generic_file_read_iter+0x1a2/0x55a [] ? nfs_revalidate_mapping+0xd6/0x288 [nfs] [] nfs_file_read+0x49/0x70 [nfs] [] new_sync_read+0x78/0x9c [] __vfs_read+0x13/0x38 [] vfs_read+0x95/0x121 [] SyS_read+0x4c/0x8a [] system_call_fastpath+0x12/0x17 Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- fs/fscache/operation.c | 2 +- fs/fscache/page.c | 20 ++++++++++---------- include/linux/fscache-cache.h | 1 + 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 57d4abb68656..de67745e1cd7 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -492,7 +492,6 @@ void fscache_put_operation(struct fscache_operation *op) ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED && op->state != FSCACHE_OP_ST_COMPLETE, op->state, ==, FSCACHE_OP_ST_CANCELLED); - op->state = FSCACHE_OP_ST_DEAD; fscache_stat(&fscache_n_op_release); @@ -500,6 +499,7 @@ void fscache_put_operation(struct fscache_operation *op) op->release(op); op->release = NULL; } + op->state = FSCACHE_OP_ST_DEAD; object = op->object; if (likely(object)) { diff --git a/fs/fscache/page.c b/fs/fscache/page.c index d1b23a67c031..483bbc613bf0 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -269,11 +269,12 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op) _enter("{OP%x}", op->op.debug_id); - ASSERTCMP(atomic_read(&op->n_pages), ==, 0); + ASSERTIFCMP(op->op.state != FSCACHE_OP_ST_INITIALISED, + atomic_read(&op->n_pages), ==, 0); fscache_hist(fscache_retrieval_histogram, op->start_time); if (op->context) - fscache_put_context(op->op.object->cookie, op->context); + fscache_put_context(op->cookie, op->context); _leave(""); } @@ -302,11 +303,18 @@ static struct fscache_retrieval *fscache_alloc_retrieval( op->op.flags = FSCACHE_OP_MYTHREAD | (1UL << FSCACHE_OP_WAITING) | (1UL << FSCACHE_OP_UNUSE_COOKIE); + op->cookie = cookie; op->mapping = mapping; op->end_io_func = end_io_func; op->context = context; op->start_time = jiffies; INIT_LIST_HEAD(&op->to_do); + + /* Pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure. + */ + if (context) + fscache_get_context(op->cookie, context); return op; } @@ -456,10 +464,6 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, fscache_stat(&fscache_n_retrieval_ops); - /* pin the netfs read context in case we need to do the actual netfs - * read because we've encountered a cache read failure */ - fscache_get_context(object->cookie, op->context); - /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ ret = fscache_wait_for_operation_activation( @@ -586,10 +590,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, fscache_stat(&fscache_n_retrieval_ops); - /* pin the netfs read context in case we need to do the actual netfs - * read because we've encountered a cache read failure */ - fscache_get_context(object->cookie, op->context); - /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ ret = fscache_wait_for_operation_activation( diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 69c6eb10d858..604e1526cd00 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -133,6 +133,7 @@ extern void fscache_operation_init(struct fscache_operation *, */ struct fscache_retrieval { struct fscache_operation op; + struct fscache_cookie *cookie; /* The netfs cookie */ struct address_space *mapping; /* netfs pages */ fscache_rw_complete_t end_io_func; /* function to call on I/O completion */ void *context; /* netfs read context (pinned) */ -- cgit v1.2.3 From c8a8585431efba0faaf41167f8f7c27c48307ca6 Mon Sep 17 00:00:00 2001 From: Vianney le Clément de Saint-Marcq Date: Mon, 30 Mar 2015 10:34:58 +0200 Subject: iio: core: Introduce IIO_CHAN_INFO_CALIBEMISSIVITY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Contact-less IR temperature sensors measure the temperature of an object by using its thermal radiation. Surfaces with different emissivity ratios emit different amounts of energy at the same temperature. IIO_CHAN_INFO_CALIBEMISSIVITY allows the user to inform the sensor of the emissivity of the object in front of it, in order to effectively measure its temperature. A device providing such setting is Melexis's MLX90614: http://melexis.com/Assets/IR-sensor-thermometer-MLX90614-Datasheet-5152.aspx. Signed-off-by: Vianney le Clément de Saint-Marcq Cc: Arnout Vandecappelle (Essensium/Mind) Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 11 +++++++++++ drivers/iio/industrialio-core.c | 1 + include/linux/iio/iio.h | 1 + 3 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index 3befcb19f414..866b4ec4aab6 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -1364,3 +1364,14 @@ Description: hwfifo_watermak_min but not equal to any of the values in this list, the driver will chose an appropriate value for the hardware fifo watermark level. + +What: /sys/bus/iio/devices/iio:deviceX/in_temp_calibemissivity +What: /sys/bus/iio/devices/iio:deviceX/in_tempX_calibemissivity +What: /sys/bus/iio/devices/iio:deviceX/in_temp_object_calibemissivity +What: /sys/bus/iio/devices/iio:deviceX/in_tempX_object_calibemissivity +KernelVersion: 4.1 +Contact: linux-iio@vger.kernel.org +Description: + The emissivity ratio of the surface in the field of view of the + contactless temperature sensor. Emissivity varies from 0 to 1, + with 1 being the emissivity of a black body. diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 4df97f650e44..7c98bc1504e6 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -128,6 +128,7 @@ static const char * const iio_chan_info_postfix[] = { [IIO_CHAN_INFO_CALIBWEIGHT] = "calibweight", [IIO_CHAN_INFO_DEBOUNCE_COUNT] = "debounce_count", [IIO_CHAN_INFO_DEBOUNCE_TIME] = "debounce_time", + [IIO_CHAN_INFO_CALIBEMISSIVITY] = "calibemissivity", }; /** diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index d86b753e9b30..b1e46ae89aa7 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -43,6 +43,7 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_CALIBWEIGHT, IIO_CHAN_INFO_DEBOUNCE_COUNT, IIO_CHAN_INFO_DEBOUNCE_TIME, + IIO_CHAN_INFO_CALIBEMISSIVITY, }; enum iio_shared_by { -- cgit v1.2.3 From d0e83059a6c9b04f00264a74b8f6439948de4613 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 20 Apr 2015 13:39:03 +0800 Subject: crypto: rng - Convert crypto_rng to new style crypto_type This patch converts the top-level crypto_rng to the "new" style. It was the last algorithm type added before we switched over to the new way of doing things exemplified by shash. All users will automatically switch over to the new interface. Note that this patch does not touch the low-level interface to rng implementations. Signed-off-by: Herbert Xu --- crypto/rng.c | 35 +++++++++++++++++++++++------------ include/crypto/rng.h | 32 ++++++++++---------------------- include/linux/crypto.h | 12 ------------ 3 files changed, 33 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/crypto/rng.c b/crypto/rng.c index e0a25c2456de..87fa2f4933b0 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -24,11 +24,18 @@ #include #include +#include "internal.h" + static DEFINE_MUTEX(crypto_default_rng_lock); struct crypto_rng *crypto_default_rng; EXPORT_SYMBOL_GPL(crypto_default_rng); static int crypto_default_rng_refcnt; +static inline struct crypto_rng *__crypto_rng_cast(struct crypto_tfm *tfm) +{ + return container_of(tfm, struct crypto_rng, base); +} + static int rngapi_reset(struct crypto_rng *tfm, u8 *seed, unsigned int slen) { u8 *buf = NULL; @@ -49,13 +56,13 @@ static int rngapi_reset(struct crypto_rng *tfm, u8 *seed, unsigned int slen) return err; } -static int crypto_init_rng_ops(struct crypto_tfm *tfm, u32 type, u32 mask) +static int crypto_rng_init_tfm(struct crypto_tfm *tfm) { + struct crypto_rng *rng = __crypto_rng_cast(tfm); struct rng_alg *alg = &tfm->__crt_alg->cra_rng; - struct rng_tfm *ops = &tfm->crt_rng; - ops->rng_gen_random = alg->rng_make_random; - ops->rng_reset = rngapi_reset; + rng->generate = alg->rng_make_random; + rng->seed = rngapi_reset; return 0; } @@ -92,22 +99,26 @@ static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg) seq_printf(m, "seedsize : %u\n", alg->cra_rng.seedsize); } -static unsigned int crypto_rng_ctxsize(struct crypto_alg *alg, u32 type, - u32 mask) -{ - return alg->cra_ctxsize; -} - const struct crypto_type crypto_rng_type = { - .ctxsize = crypto_rng_ctxsize, - .init = crypto_init_rng_ops, + .extsize = crypto_alg_extsize, + .init_tfm = crypto_rng_init_tfm, #ifdef CONFIG_PROC_FS .show = crypto_rng_show, #endif .report = crypto_rng_report, + .maskclear = ~CRYPTO_ALG_TYPE_MASK, + .maskset = CRYPTO_ALG_TYPE_MASK, + .type = CRYPTO_ALG_TYPE_RNG, + .tfmsize = offsetof(struct crypto_rng, base), }; EXPORT_SYMBOL_GPL(crypto_rng_type); +struct crypto_rng *crypto_alloc_rng(const char *alg_name, u32 type, u32 mask) +{ + return crypto_alloc_tfm(alg_name, &crypto_rng_type, type, mask); +} +EXPORT_SYMBOL_GPL(crypto_alloc_rng); + int crypto_get_default_rng(void) { struct crypto_rng *rng; diff --git a/include/crypto/rng.h b/include/crypto/rng.h index 6e28ea5be9f1..f13f3faca4d7 100644 --- a/include/crypto/rng.h +++ b/include/crypto/rng.h @@ -15,6 +15,12 @@ #include +struct crypto_rng { + int (*generate)(struct crypto_rng *tfm, u8 *rdata, unsigned int dlen); + int (*seed)(struct crypto_rng *tfm, u8 *seed, unsigned int slen); + struct crypto_tfm base; +}; + extern struct crypto_rng *crypto_default_rng; int crypto_get_default_rng(void); @@ -27,11 +33,6 @@ void crypto_put_default_rng(void); * CRYPTO_ALG_TYPE_RNG (listed as type "rng" in /proc/crypto) */ -static inline struct crypto_rng *__crypto_rng_cast(struct crypto_tfm *tfm) -{ - return (struct crypto_rng *)tfm; -} - /** * crypto_alloc_rng() -- allocate RNG handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the @@ -52,15 +53,7 @@ static inline struct crypto_rng *__crypto_rng_cast(struct crypto_tfm *tfm) * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ -static inline struct crypto_rng *crypto_alloc_rng(const char *alg_name, - u32 type, u32 mask) -{ - type &= ~CRYPTO_ALG_TYPE_MASK; - type |= CRYPTO_ALG_TYPE_RNG; - mask |= CRYPTO_ALG_TYPE_MASK; - - return __crypto_rng_cast(crypto_alloc_base(alg_name, type, mask)); -} +struct crypto_rng *crypto_alloc_rng(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_rng_tfm(struct crypto_rng *tfm) { @@ -80,18 +73,13 @@ static inline struct rng_alg *crypto_rng_alg(struct crypto_rng *tfm) return &crypto_rng_tfm(tfm)->__crt_alg->cra_rng; } -static inline struct rng_tfm *crypto_rng_crt(struct crypto_rng *tfm) -{ - return &crypto_rng_tfm(tfm)->crt_rng; -} - /** * crypto_free_rng() - zeroize and free RNG handle * @tfm: cipher handle to be freed */ static inline void crypto_free_rng(struct crypto_rng *tfm) { - crypto_free_tfm(crypto_rng_tfm(tfm)); + crypto_destroy_tfm(tfm, crypto_rng_tfm(tfm)); } /** @@ -108,7 +96,7 @@ static inline void crypto_free_rng(struct crypto_rng *tfm) static inline int crypto_rng_get_bytes(struct crypto_rng *tfm, u8 *rdata, unsigned int dlen) { - return crypto_rng_crt(tfm)->rng_gen_random(tfm, rdata, dlen); + return tfm->generate(tfm, rdata, dlen); } /** @@ -131,7 +119,7 @@ static inline int crypto_rng_get_bytes(struct crypto_rng *tfm, static inline int crypto_rng_reset(struct crypto_rng *tfm, u8 *seed, unsigned int slen) { - return crypto_rng_crt(tfm)->rng_reset(tfm, seed, slen); + return tfm->seed(tfm, seed, slen); } /** diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 10df5d2d093a..781f7d546020 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -655,19 +655,12 @@ struct compress_tfm { u8 *dst, unsigned int *dlen); }; -struct rng_tfm { - int (*rng_gen_random)(struct crypto_rng *tfm, u8 *rdata, - unsigned int dlen); - int (*rng_reset)(struct crypto_rng *tfm, u8 *seed, unsigned int slen); -}; - #define crt_ablkcipher crt_u.ablkcipher #define crt_aead crt_u.aead #define crt_blkcipher crt_u.blkcipher #define crt_cipher crt_u.cipher #define crt_hash crt_u.hash #define crt_compress crt_u.compress -#define crt_rng crt_u.rng struct crypto_tfm { @@ -680,7 +673,6 @@ struct crypto_tfm { struct cipher_tfm cipher; struct hash_tfm hash; struct compress_tfm compress; - struct rng_tfm rng; } crt_u; void (*exit)(struct crypto_tfm *tfm); @@ -714,10 +706,6 @@ struct crypto_hash { struct crypto_tfm base; }; -struct crypto_rng { - struct crypto_tfm base; -}; - enum { CRYPTOA_UNSPEC, CRYPTOA_ALG, -- cgit v1.2.3 From acec27ff35af9caf34d76d16ee17ff3b292e7d83 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 21 Apr 2015 10:46:38 +0800 Subject: crypto: rng - Convert low-level crypto_rng to new style This patch converts the low-level crypto_rng interface to the "new" style. This allows existing implementations to be converted over one- by-one. Once that is complete we can then remove the old rng interface. Signed-off-by: Herbert Xu --- crypto/rng.c | 56 ++++++++++++++++++++++++++++++++++++++----- include/crypto/internal/rng.h | 3 +++ include/crypto/rng.h | 42 ++++++++++++++++++++++++++++++-- include/linux/crypto.h | 6 ++--- 4 files changed, 96 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/crypto/rng.c b/crypto/rng.c index f1d64948f6fc..5e0425a24657 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -36,10 +36,15 @@ static inline struct crypto_rng *__crypto_rng_cast(struct crypto_tfm *tfm) return container_of(tfm, struct crypto_rng, base); } +static inline struct old_rng_alg *crypto_old_rng_alg(struct crypto_rng *tfm) +{ + return &crypto_rng_tfm(tfm)->__crt_alg->cra_rng; +} + static int generate(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int dlen) { - return crypto_rng_alg(tfm)->rng_make_random(tfm, dst, dlen); + return crypto_old_rng_alg(tfm)->rng_make_random(tfm, dst, dlen); } static int rngapi_reset(struct crypto_rng *tfm, const u8 *seed, @@ -58,7 +63,7 @@ static int rngapi_reset(struct crypto_rng *tfm, const u8 *seed, src = buf; } - err = crypto_rng_alg(tfm)->rng_reset(tfm, src, slen); + err = crypto_old_rng_alg(tfm)->rng_reset(tfm, src, slen); kzfree(buf); return err; @@ -88,13 +93,31 @@ EXPORT_SYMBOL_GPL(crypto_rng_reset); static int crypto_rng_init_tfm(struct crypto_tfm *tfm) { struct crypto_rng *rng = __crypto_rng_cast(tfm); + struct rng_alg *alg = crypto_rng_alg(rng); + struct old_rng_alg *oalg = crypto_old_rng_alg(rng); + + if (oalg->rng_make_random) { + rng->generate = generate; + rng->seed = rngapi_reset; + rng->seedsize = oalg->seedsize; + return 0; + } - rng->generate = generate; - rng->seed = rngapi_reset; + rng->generate = alg->generate; + rng->seed = alg->seed; + rng->seedsize = alg->seedsize; return 0; } +static unsigned int seedsize(struct crypto_alg *alg) +{ + struct rng_alg *ralg = container_of(alg, struct rng_alg, base); + + return alg->cra_rng.rng_make_random ? + alg->cra_rng.seedsize : ralg->seedsize; +} + #ifdef CONFIG_NET static int crypto_rng_report(struct sk_buff *skb, struct crypto_alg *alg) { @@ -102,7 +125,7 @@ static int crypto_rng_report(struct sk_buff *skb, struct crypto_alg *alg) strncpy(rrng.type, "rng", sizeof(rrng.type)); - rrng.seedsize = alg->cra_rng.seedsize; + rrng.seedsize = seedsize(alg); if (nla_put(skb, CRYPTOCFGA_REPORT_RNG, sizeof(struct crypto_report_rng), &rrng)) @@ -124,7 +147,7 @@ static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg) static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg) { seq_printf(m, "type : rng\n"); - seq_printf(m, "seedsize : %u\n", alg->cra_rng.seedsize); + seq_printf(m, "seedsize : %u\n", seedsize(alg)); } const struct crypto_type crypto_rng_type = { @@ -189,5 +212,26 @@ void crypto_put_default_rng(void) } EXPORT_SYMBOL_GPL(crypto_put_default_rng); +int crypto_register_rng(struct rng_alg *alg) +{ + struct crypto_alg *base = &alg->base; + + if (alg->seedsize > PAGE_SIZE / 8) + return -EINVAL; + + base->cra_type = &crypto_rng_type; + base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; + base->cra_flags |= CRYPTO_ALG_TYPE_RNG; + + return crypto_register_alg(base); +} +EXPORT_SYMBOL_GPL(crypto_register_rng); + +void crypto_unregister_rng(struct rng_alg *alg) +{ + crypto_unregister_alg(&alg->base); +} +EXPORT_SYMBOL_GPL(crypto_unregister_rng); + MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Random Number Generator"); diff --git a/include/crypto/internal/rng.h b/include/crypto/internal/rng.h index 896973369573..76f3c9519ba5 100644 --- a/include/crypto/internal/rng.h +++ b/include/crypto/internal/rng.h @@ -18,6 +18,9 @@ extern const struct crypto_type crypto_rng_type; +int crypto_register_rng(struct rng_alg *alg); +void crypto_unregister_rng(struct rng_alg *alg); + static inline void *crypto_rng_ctx(struct crypto_rng *tfm) { return crypto_tfm_ctx(&tfm->base); diff --git a/include/crypto/rng.h b/include/crypto/rng.h index 7fca37144b59..133f0441ad3e 100644 --- a/include/crypto/rng.h +++ b/include/crypto/rng.h @@ -15,11 +15,48 @@ #include +struct crypto_rng; + +/** + * struct rng_alg - random number generator definition + * + * @generate: The function defined by this variable obtains a + * random number. The random number generator transform + * must generate the random number out of the context + * provided with this call, plus any additional data + * if provided to the call. + * @seed: Seed or reseed the random number generator. With the + * invocation of this function call, the random number + * generator shall become ready fo generation. If the + * random number generator requires a seed for setting + * up a new state, the seed must be provided by the + * consumer while invoking this function. The required + * size of the seed is defined with @seedsize . + * @seedsize: The seed size required for a random number generator + * initialization defined with this variable. Some + * random number generators does not require a seed + * as the seeding is implemented internally without + * the need of support by the consumer. In this case, + * the seed size is set to zero. + * @base: Common crypto API algorithm data structure. + */ +struct rng_alg { + int (*generate)(struct crypto_rng *tfm, + const u8 *src, unsigned int slen, + u8 *dst, unsigned int dlen); + int (*seed)(struct crypto_rng *tfm, const u8 *seed, unsigned int slen); + + unsigned int seedsize; + + struct crypto_alg base; +}; + struct crypto_rng { int (*generate)(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int dlen); int (*seed)(struct crypto_rng *tfm, const u8 *seed, unsigned int slen); + unsigned int seedsize; struct crypto_tfm base; }; @@ -72,7 +109,8 @@ static inline struct crypto_tfm *crypto_rng_tfm(struct crypto_rng *tfm) */ static inline struct rng_alg *crypto_rng_alg(struct crypto_rng *tfm) { - return &crypto_rng_tfm(tfm)->__crt_alg->cra_rng; + return container_of(crypto_rng_tfm(tfm)->__crt_alg, + struct rng_alg, base); } /** @@ -156,7 +194,7 @@ int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, */ static inline int crypto_rng_seedsize(struct crypto_rng *tfm) { - return crypto_rng_alg(tfm)->seedsize; + return tfm->seedsize; } #endif diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 781f7d546020..2fa9b05360f7 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -427,7 +427,7 @@ struct compress_alg { }; /** - * struct rng_alg - random number generator definition + * struct old_rng_alg - random number generator definition * @rng_make_random: The function defined by this variable obtains a random * number. The random number generator transform must generate * the random number out of the context provided with this @@ -445,7 +445,7 @@ struct compress_alg { * seeding is implemented internally without the need of support by * the consumer. In this case, the seed size is set to zero. */ -struct rng_alg { +struct old_rng_alg { int (*rng_make_random)(struct crypto_rng *tfm, u8 *rdata, unsigned int dlen); int (*rng_reset)(struct crypto_rng *tfm, u8 *seed, unsigned int slen); @@ -559,7 +559,7 @@ struct crypto_alg { struct blkcipher_alg blkcipher; struct cipher_alg cipher; struct compress_alg compress; - struct rng_alg rng; + struct old_rng_alg rng; } cra_u; int (*cra_init)(struct crypto_tfm *tfm); -- cgit v1.2.3 From 94f1bb15bed84ad6c893916b7e7b9db6f1d7eec6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 21 Apr 2015 10:46:46 +0800 Subject: crypto: rng - Remove old low-level rng interface Now that all rng implementations have switched over to the new interface, we can remove the old low-level interface. Signed-off-by: Herbert Xu --- crypto/rng.c | 57 +++---------------------------------------- include/crypto/internal/rng.h | 3 +-- include/crypto/rng.h | 10 +++----- include/linux/crypto.h | 30 ----------------------- 4 files changed, 8 insertions(+), 92 deletions(-) (limited to 'include/linux') diff --git a/crypto/rng.c b/crypto/rng.c index e98ce1ca527d..055e276427b1 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -4,6 +4,7 @@ * RNG operations. * * Copyright (c) 2008 Neil Horman + * Copyright (c) 2015 Herbert Xu * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -36,39 +37,6 @@ static inline struct crypto_rng *__crypto_rng_cast(struct crypto_tfm *tfm) return container_of(tfm, struct crypto_rng, base); } -static inline struct old_rng_alg *crypto_old_rng_alg(struct crypto_rng *tfm) -{ - return &crypto_rng_tfm(tfm)->__crt_alg->cra_rng; -} - -static int generate(struct crypto_rng *tfm, const u8 *src, unsigned int slen, - u8 *dst, unsigned int dlen) -{ - return crypto_old_rng_alg(tfm)->rng_make_random(tfm, dst, dlen); -} - -static int rngapi_reset(struct crypto_rng *tfm, const u8 *seed, - unsigned int slen) -{ - u8 *buf = NULL; - u8 *src = (u8 *)seed; - int err; - - if (slen) { - buf = kmalloc(slen, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - memcpy(buf, seed, slen); - src = buf; - } - - err = crypto_old_rng_alg(tfm)->rng_reset(tfm, src, slen); - - kzfree(buf); - return err; -} - int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) { u8 *buf = NULL; @@ -83,7 +51,7 @@ int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) seed = buf; } - err = tfm->seed(tfm, seed, slen); + err = crypto_rng_alg(tfm)->seed(tfm, seed, slen); kfree(buf); return err; @@ -92,21 +60,6 @@ EXPORT_SYMBOL_GPL(crypto_rng_reset); static int crypto_rng_init_tfm(struct crypto_tfm *tfm) { - struct crypto_rng *rng = __crypto_rng_cast(tfm); - struct rng_alg *alg = crypto_rng_alg(rng); - struct old_rng_alg *oalg = crypto_old_rng_alg(rng); - - if (oalg->rng_make_random) { - rng->generate = generate; - rng->seed = rngapi_reset; - rng->seedsize = oalg->seedsize; - return 0; - } - - rng->generate = alg->generate; - rng->seed = alg->seed; - rng->seedsize = alg->seedsize; - return 0; } @@ -114,8 +67,7 @@ static unsigned int seedsize(struct crypto_alg *alg) { struct rng_alg *ralg = container_of(alg, struct rng_alg, base); - return alg->cra_rng.rng_make_random ? - alg->cra_rng.seedsize : ralg->seedsize; + return ralg->seedsize; } #ifdef CONFIG_NET @@ -150,7 +102,7 @@ static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg) seq_printf(m, "seedsize : %u\n", seedsize(alg)); } -const struct crypto_type crypto_rng_type = { +static const struct crypto_type crypto_rng_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_rng_init_tfm, #ifdef CONFIG_PROC_FS @@ -162,7 +114,6 @@ const struct crypto_type crypto_rng_type = { .type = CRYPTO_ALG_TYPE_RNG, .tfmsize = offsetof(struct crypto_rng, base), }; -EXPORT_SYMBOL_GPL(crypto_rng_type); struct crypto_rng *crypto_alloc_rng(const char *alg_name, u32 type, u32 mask) { diff --git a/include/crypto/internal/rng.h b/include/crypto/internal/rng.h index 2c9a865c66e2..263f1a5eebc7 100644 --- a/include/crypto/internal/rng.h +++ b/include/crypto/internal/rng.h @@ -2,6 +2,7 @@ * RNG: Random Number Generator algorithms under the crypto API * * Copyright (c) 2008 Neil Horman + * Copyright (c) 2015 Herbert Xu * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -16,8 +17,6 @@ #include #include -extern const struct crypto_type crypto_rng_type; - int crypto_register_rng(struct rng_alg *alg); void crypto_unregister_rng(struct rng_alg *alg); int crypto_register_rngs(struct rng_alg *algs, int count); diff --git a/include/crypto/rng.h b/include/crypto/rng.h index cc22e52a129a..c5d4684429f5 100644 --- a/include/crypto/rng.h +++ b/include/crypto/rng.h @@ -2,6 +2,7 @@ * RNG: Random Number Generator algorithms under the crypto API * * Copyright (c) 2008 Neil Horman + * Copyright (c) 2015 Herbert Xu * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -56,11 +57,6 @@ struct rng_alg { }; struct crypto_rng { - int (*generate)(struct crypto_rng *tfm, - const u8 *src, unsigned int slen, - u8 *dst, unsigned int dlen); - int (*seed)(struct crypto_rng *tfm, const u8 *seed, unsigned int slen); - unsigned int seedsize; struct crypto_tfm base; }; @@ -144,7 +140,7 @@ static inline int crypto_rng_generate(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int dlen) { - return tfm->generate(tfm, src, slen, dst, dlen); + return crypto_rng_alg(tfm)->generate(tfm, src, slen, dst, dlen); } /** @@ -198,7 +194,7 @@ int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, */ static inline int crypto_rng_seedsize(struct crypto_rng *tfm) { - return tfm->seedsize; + return crypto_rng_alg(tfm)->seedsize; } #endif diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 2fa9b05360f7..ee14140f8893 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -138,7 +138,6 @@ struct crypto_async_request; struct crypto_aead; struct crypto_blkcipher; struct crypto_hash; -struct crypto_rng; struct crypto_tfm; struct crypto_type; struct aead_givcrypt_request; @@ -426,40 +425,12 @@ struct compress_alg { unsigned int slen, u8 *dst, unsigned int *dlen); }; -/** - * struct old_rng_alg - random number generator definition - * @rng_make_random: The function defined by this variable obtains a random - * number. The random number generator transform must generate - * the random number out of the context provided with this - * call. - * @rng_reset: Reset of the random number generator by clearing the entire state. - * With the invocation of this function call, the random number - * generator shall completely reinitialize its state. If the random - * number generator requires a seed for setting up a new state, - * the seed must be provided by the consumer while invoking this - * function. The required size of the seed is defined with - * @seedsize . - * @seedsize: The seed size required for a random number generator - * initialization defined with this variable. Some random number - * generators like the SP800-90A DRBG does not require a seed as the - * seeding is implemented internally without the need of support by - * the consumer. In this case, the seed size is set to zero. - */ -struct old_rng_alg { - int (*rng_make_random)(struct crypto_rng *tfm, u8 *rdata, - unsigned int dlen); - int (*rng_reset)(struct crypto_rng *tfm, u8 *seed, unsigned int slen); - - unsigned int seedsize; -}; - #define cra_ablkcipher cra_u.ablkcipher #define cra_aead cra_u.aead #define cra_blkcipher cra_u.blkcipher #define cra_cipher cra_u.cipher #define cra_compress cra_u.compress -#define cra_rng cra_u.rng /** * struct crypto_alg - definition of a cryptograpic cipher algorithm @@ -559,7 +530,6 @@ struct crypto_alg { struct blkcipher_alg blkcipher; struct cipher_alg cipher; struct compress_alg compress; - struct old_rng_alg rng; } cra_u; int (*cra_init)(struct crypto_tfm *tfm); -- cgit v1.2.3 From 4796cf9b02b5bea141632e21d64556a7eb883a65 Mon Sep 17 00:00:00 2001 From: Yingjoe Chen Date: Fri, 10 Apr 2015 21:55:50 +0800 Subject: time: Remove nonexistent function prototype The function clocksource_get_next() was removed in commit 75c5158f70 (timekeeping: Update clocksource with stop_machine), but the prototype was not removed with it. Remove the prototype. Signed-off-by: Yingjoe Chen Cc: Cc: Martin Schwidefsky Cc: Cc: John Stultz Link: http://lkml.kernel.org/r/1428674150-1780-1-git-send-email-yingjoe.chen@mediatek.com Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 135509821c39..a25fc6e873b8 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -181,7 +181,6 @@ static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift) extern int clocksource_unregister(struct clocksource*); extern void clocksource_touch_watchdog(void); -extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_suspend(void); extern void clocksource_resume(void); -- cgit v1.2.3 From 91e5a2170e795989da9f90c18ba18984f23acc5b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 Apr 2015 21:02:22 +0000 Subject: hrtimer: Document hrtimer_forward[_now]() proper Document the calling context conditions. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20150413210035.178751779@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 17 ++++++++++++++++- kernel/time/hrtimer.c | 8 ++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 05f6df1fdf5b..7770676c387a 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -418,7 +418,22 @@ static inline int hrtimer_callback_running(struct hrtimer *timer) extern u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); -/* Forward a hrtimer so it expires after the hrtimer's current now */ +/** + * hrtimer_forward_now - forward the timer expiry so it expires after now + * @timer: hrtimer to forward + * @interval: the interval to forward + * + * Forward the timer expiry so it will expire after the current time + * of the hrtimer clock base. Returns the number of overruns. + * + * Can be safely called from the callback function of @timer. If + * called from other contexts @timer must neither be enqueued nor + * running the callback and the caller needs to take care of + * serialization. + * + * Note: This only updates the timer expiry value and does not requeue + * the timer. + */ static inline u64 hrtimer_forward_now(struct hrtimer *timer, ktime_t interval) { diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 76d4bd962b19..b1a74ee3e99a 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -801,6 +801,14 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) * * Forward the timer expiry so it will expire in the future. * Returns the number of overruns. + * + * Can be safely called from the callback function of @timer. If + * called from other contexts @timer must neither be enqueued nor + * running the callback and the caller needs to take care of + * serialization. + * + * Note: This only updates the timer expiry value and does not requeue + * the timer. */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { -- cgit v1.2.3 From 398ca17fb54b212cdc9da7ff4a17a35c48dd2103 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:27 +0000 Subject: hrtimer: Get rid of the resolution field in hrtimer_clock_base The field has no value because all clock bases have the same resolution. The resolution only changes when we switch to high resolution timer mode. We can evaluate that from a single static variable as well. In the !HIGHRES case its simply a constant. Export the variable, so we can simplify the usage sites. Signed-off-by: Thomas Gleixner Reviewed-by: Preeti U Murthy Acked-by: Peter Zijlstra Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203500.645454122@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 6 ++++-- kernel/time/hrtimer.c | 26 +++++++++----------------- kernel/time/timer_list.c | 8 ++++---- 3 files changed, 17 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 7770676c387a..bc6f91b5443b 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -137,7 +137,6 @@ struct hrtimer_sleeper { * timer to a base on another cpu. * @clockid: clock id for per_cpu support * @active: red black tree root node for the active timers - * @resolution: the resolution of the clock, in nanoseconds * @get_time: function to retrieve the current time of the clock * @softirq_time: the time when running the hrtimer queue in the softirq * @offset: offset of this clock to the monotonic base @@ -147,7 +146,6 @@ struct hrtimer_clock_base { int index; clockid_t clockid; struct timerqueue_head active; - ktime_t resolution; ktime_t (*get_time)(void); ktime_t softirq_time; ktime_t offset; @@ -295,11 +293,15 @@ extern void hrtimer_peek_ahead_timers(void); extern void clock_was_set_delayed(void); +extern unsigned int hrtimer_resolution; + #else # define MONOTONIC_RES_NSEC LOW_RES_NSEC # define KTIME_MONOTONIC_RES KTIME_LOW_RES +#define hrtimer_resolution LOW_RES_NSEC + static inline void hrtimer_peek_ahead_timers(void) { } /* diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9abd50b60e83..965687a1fce6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -66,7 +66,6 @@ */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { - .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), .clock_base = { @@ -74,25 +73,21 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, - .resolution = KTIME_LOW_RES, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, - .resolution = KTIME_LOW_RES, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, - .resolution = KTIME_LOW_RES, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, - .resolution = KTIME_LOW_RES, }, } }; @@ -478,6 +473,8 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) * High resolution timer enabled ? */ static int hrtimer_hres_enabled __read_mostly = 1; +unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; +EXPORT_SYMBOL_GPL(hrtimer_resolution); /* * Enable / Disable high resolution mode @@ -660,7 +657,7 @@ static void retrigger_next_event(void *arg) */ static int hrtimer_switch_to_hres(void) { - int i, cpu = smp_processor_id(); + int cpu = smp_processor_id(); struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; @@ -676,8 +673,7 @@ static int hrtimer_switch_to_hres(void) return 0; } base->hres_active = 1; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) - base->clock_base[i].resolution = KTIME_HIGH_RES; + hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(); /* "Retrigger" the interrupt to get things going */ @@ -820,8 +816,8 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) if (delta.tv64 < 0) return 0; - if (interval.tv64 < timer->base->resolution.tv64) - interval.tv64 = timer->base->resolution.tv64; + if (interval.tv64 < hrtimer_resolution) + interval.tv64 = hrtimer_resolution; if (unlikely(delta.tv64 >= interval.tv64)) { s64 incr = ktime_to_ns(interval); @@ -963,7 +959,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * timeouts. This will go away with the GTOD framework. */ #ifdef CONFIG_TIME_LOW_RES - tim = ktime_add_safe(tim, base->resolution); + tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); #endif } @@ -1193,12 +1189,8 @@ EXPORT_SYMBOL_GPL(hrtimer_init); */ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { - struct hrtimer_cpu_base *cpu_base; - int base = hrtimer_clockid_to_base(which_clock); - - cpu_base = raw_cpu_ptr(&hrtimer_bases); - *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); - + tp->tv_sec = 0; + tp->tv_nsec = hrtimer_resolution; return 0; } EXPORT_SYMBOL_GPL(hrtimer_get_res); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 5960af2196ac..bdd5e987f115 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -127,10 +127,10 @@ static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { SEQ_printf(m, " .base: %pK\n", base); - SEQ_printf(m, " .index: %d\n", - base->index); - SEQ_printf(m, " .resolution: %Lu nsecs\n", - (unsigned long long)ktime_to_ns(base->resolution)); + SEQ_printf(m, " .index: %d\n", base->index); + + SEQ_printf(m, " .resolution: %u nsecs\n", (unsigned) hrtimer_resolution); + SEQ_printf(m, " .get_time: "); print_name_offset(m, base->get_time); SEQ_printf(m, "\n"); -- cgit v1.2.3 From 056a3cacbc46e5aca27b350ce4ecb3b33ebb0700 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:32 +0000 Subject: hrtimer: Get rid of hrtimer_get_res() The resolution is directly accessible now. So its simpler just to fill in the values of the timespec and be done with it. Text size reduction (combined with "hrtimer: Get rid of the resolution field in hrtimer_clock_base"): x8664 -61, i386 -221, ARM -60, power64 -48 Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203500.879888080@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 1 - kernel/time/alarmtimer.c | 6 +++--- kernel/time/hrtimer.c | 16 ---------------- kernel/time/posix-timers.c | 17 ++++++++++++----- 4 files changed, 15 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index bc6f91b5443b..8025156c8fa1 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -385,7 +385,6 @@ static inline int hrtimer_restart(struct hrtimer *timer) /* Query timers: */ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); -extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); extern ktime_t hrtimer_get_next_event(void); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 1b001ed1edb9..0b55a7570c90 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -495,12 +495,12 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, */ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) { - clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; - if (!alarmtimer_get_rtcdev()) return -EINVAL; - return hrtimer_get_res(baseid, tp); + tp->tv_sec = 0; + tp->tv_nsec = hrtimer_resolution; + return 0; } /** diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 965687a1fce6..73131dab787e 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1179,22 +1179,6 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, } EXPORT_SYMBOL_GPL(hrtimer_init); -/** - * hrtimer_get_res - get the timer resolution for a clock - * @which_clock: which clock to query - * @tp: pointer to timespec variable to store the resolution - * - * Store the resolution of the clock selected by @which_clock in the - * variable pointed to by @tp. - */ -int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) -{ - tp->tv_sec = 0; - tp->tv_nsec = hrtimer_resolution; - return 0; -} -EXPORT_SYMBOL_GPL(hrtimer_get_res); - static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) { struct hrtimer_clock_base *base = timer->base; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 31ea01f42e1f..31d11ac9fa47 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -272,13 +272,20 @@ static int posix_get_tai(clockid_t which_clock, struct timespec *tp) return 0; } +static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp) +{ + tp->tv_sec = 0; + tp->tv_nsec = hrtimer_resolution; + return 0; +} + /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ static __init int init_posix_timers(void) { struct k_clock clock_realtime = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_clock_realtime_get, .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, @@ -290,7 +297,7 @@ static __init int init_posix_timers(void) .timer_del = common_timer_del, }; struct k_clock clock_monotonic = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_ktime_get_ts, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, @@ -300,7 +307,7 @@ static __init int init_posix_timers(void) .timer_del = common_timer_del, }; struct k_clock clock_monotonic_raw = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_monotonic_raw, }; struct k_clock clock_realtime_coarse = { @@ -312,7 +319,7 @@ static __init int init_posix_timers(void) .clock_get = posix_get_monotonic_coarse, }; struct k_clock clock_tai = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_tai, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, @@ -322,7 +329,7 @@ static __init int init_posix_timers(void) .timer_del = common_timer_del, }; struct k_clock clock_boottime = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_boottime, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, -- cgit v1.2.3 From a6ffebce7f89f6f97cc22838a5d4383b15d6774f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:34 +0000 Subject: hrtimer: Make the statistics fields smaller No point in having usigned long for /proc/timer_list statistics. Make them unsigned int. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203500.959773467@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 8 ++++---- kernel/time/hrtimer.c | 4 ++-- kernel/time/timer_list.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 8025156c8fa1..d39f2847754c 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -187,10 +187,10 @@ struct hrtimer_cpu_base { int in_hrtirq; int hres_active; int hang_detected; - unsigned long nr_events; - unsigned long nr_retries; - unsigned long nr_hangs; - ktime_t max_hang_time; + unsigned int nr_events; + unsigned int nr_retries; + unsigned int nr_hangs; + unsigned int max_hang_time; #endif struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; }; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 73131dab787e..874e0914eb3c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1327,8 +1327,8 @@ retry: cpu_base->hang_detected = 1; raw_spin_unlock(&cpu_base->lock); delta = ktime_sub(now, entry_time); - if (delta.tv64 > cpu_base->max_hang_time.tv64) - cpu_base->max_hang_time = delta; + if ((unsigned int)delta.tv64 > cpu_base->max_hang_time) + cpu_base->max_hang_time = (unsigned int) delta.tv64; /* * Limit it to a sensible value as we enforce a longer * delay. Give the CPU at least 100ms to catch up. diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index bdd5e987f115..6232fc536185 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -165,7 +165,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P(nr_events); P(nr_retries); P(nr_hangs); - P_ns(max_hang_time); + P(max_hang_time); #endif #undef P #undef P_ns -- cgit v1.2.3 From 21d6d52a1b7028e6a6840bd82e354aefa9a5e203 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:35 +0000 Subject: hrtimer: Get rid of softirq time The softirq time field in the clock bases is an optimization from the early days of hrtimers. It provides a coarse "jiffies" like time mostly for self rearming timers. But that comes with a price: - Larger code size - Extra storage space - Duplicated functions with really small differences The benefit of this is optimization is marginal for contemporary systems. Consolidate everything on the high resolution timer implementation. This makes further optimizations possible. Text size reduction: x8664 -95, i386 -356, ARM -148, ARM64 -40, power64 -16 Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.039977424@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 24 ++------ kernel/time/hrtimer.c | 148 ++++++++++++++++++---------------------------- kernel/time/timekeeping.c | 32 ---------- kernel/time/timekeeping.h | 3 - 4 files changed, 64 insertions(+), 143 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index d39f2847754c..e292830b58f0 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -138,7 +138,6 @@ struct hrtimer_sleeper { * @clockid: clock id for per_cpu support * @active: red black tree root node for the active timers * @get_time: function to retrieve the current time of the clock - * @softirq_time: the time when running the hrtimer queue in the softirq * @offset: offset of this clock to the monotonic base */ struct hrtimer_clock_base { @@ -147,7 +146,6 @@ struct hrtimer_clock_base { clockid_t clockid; struct timerqueue_head active; ktime_t (*get_time)(void); - ktime_t softirq_time; ktime_t offset; }; @@ -260,19 +258,16 @@ static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) return ktime_sub(timer->node.expires, timer->base->get_time()); } -#ifdef CONFIG_HIGH_RES_TIMERS -struct clock_event_device; - -extern void hrtimer_interrupt(struct clock_event_device *dev); - -/* - * In high resolution mode the time reference must be read accurate - */ static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) { return timer->base->get_time(); } +#ifdef CONFIG_HIGH_RES_TIMERS +struct clock_event_device; + +extern void hrtimer_interrupt(struct clock_event_device *dev); + static inline int hrtimer_is_hres_active(struct hrtimer *timer) { return timer->base->cpu_base->hres_active; @@ -304,15 +299,6 @@ extern unsigned int hrtimer_resolution; static inline void hrtimer_peek_ahead_timers(void) { } -/* - * In non high resolution mode the time reference is taken from - * the base softirq time variable. - */ -static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) -{ - return timer->base->softirq_time; -} - static inline int hrtimer_is_hres_active(struct hrtimer *timer) { return 0; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 874e0914eb3c..9e111dd83ca3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -104,27 +104,6 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) return hrtimer_clock_to_base_table[clock_id]; } - -/* - * Get the coarse grained time at the softirq based on xtime and - * wall_to_monotonic. - */ -static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) -{ - ktime_t xtim, mono, boot, tai; - ktime_t off_real, off_boot, off_tai; - - mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); - boot = ktime_add(mono, off_boot); - xtim = ktime_add(mono, off_real); - tai = ktime_add(mono, off_tai); - - base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; - base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; - base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; - base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai; -} - /* * Functions and macros which are different for UP/SMP systems are kept in a * single place @@ -466,6 +445,15 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) } #endif +static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) +{ + ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; + + return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai); +} + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -516,7 +504,12 @@ static inline int hrtimer_hres_active(void) static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { - ktime_t expires_next = __hrtimer_get_next_event(cpu_base); + ktime_t expires_next; + + if (!cpu_base->hres_active) + return; + + expires_next = __hrtimer_get_next_event(cpu_base); if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) return; @@ -625,15 +618,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) base->hres_active = 0; } -static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) -{ - ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; - ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; - ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - - return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai); -} - /* * Retrigger next event is called after clock was set * @@ -1179,10 +1163,10 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, } EXPORT_SYMBOL_GPL(hrtimer_init); -static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) +static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, + struct hrtimer_clock_base *base, + struct hrtimer *timer, ktime_t *now) { - struct hrtimer_clock_base *base = timer->base; - struct hrtimer_cpu_base *cpu_base = base->cpu_base; enum hrtimer_restart (*fn)(struct hrtimer *); int restart; @@ -1219,34 +1203,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) timer->state &= ~HRTIMER_STATE_CALLBACK; } -#ifdef CONFIG_HIGH_RES_TIMERS - -/* - * High resolution timer interrupt - * Called with interrupts disabled - */ -void hrtimer_interrupt(struct clock_event_device *dev) +static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) { - struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - ktime_t expires_next, now, entry_time, delta; - int i, retries = 0; - - BUG_ON(!cpu_base->hres_active); - cpu_base->nr_events++; - dev->next_event.tv64 = KTIME_MAX; - - raw_spin_lock(&cpu_base->lock); - entry_time = now = hrtimer_update_base(cpu_base); -retry: - cpu_base->in_hrtirq = 1; - /* - * We set expires_next to KTIME_MAX here with cpu_base->lock - * held to prevent that a timer is enqueued in our queue via - * the migration code. This does not affect enqueueing of - * timers which run their callback and need to be requeued on - * this CPU. - */ - cpu_base->expires_next.tv64 = KTIME_MAX; + int i; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { struct hrtimer_clock_base *base; @@ -1279,9 +1238,42 @@ retry: if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) break; - __run_hrtimer(timer, &basenow); + __run_hrtimer(cpu_base, base, timer, &basenow); } } +} + +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer interrupt + * Called with interrupts disabled + */ +void hrtimer_interrupt(struct clock_event_device *dev) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t expires_next, now, entry_time, delta; + int retries = 0; + + BUG_ON(!cpu_base->hres_active); + cpu_base->nr_events++; + dev->next_event.tv64 = KTIME_MAX; + + raw_spin_lock(&cpu_base->lock); + entry_time = now = hrtimer_update_base(cpu_base); +retry: + cpu_base->in_hrtirq = 1; + /* + * We set expires_next to KTIME_MAX here with cpu_base->lock + * held to prevent that a timer is enqueued in our queue via + * the migration code. This does not affect enqueueing of + * timers which run their callback and need to be requeued on + * this CPU. + */ + cpu_base->expires_next.tv64 = KTIME_MAX; + + __hrtimer_run_queues(cpu_base, now); + /* Reevaluate the clock bases for the next expiry */ expires_next = __hrtimer_get_next_event(cpu_base); /* @@ -1416,38 +1408,16 @@ void hrtimer_run_pending(void) */ void hrtimer_run_queues(void) { - struct timerqueue_node *node; struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - struct hrtimer_clock_base *base; - int index, gettime = 1; + ktime_t now; if (hrtimer_hres_active()) return; - for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { - base = &cpu_base->clock_base[index]; - if (!timerqueue_getnext(&base->active)) - continue; - - if (gettime) { - hrtimer_get_softirq_time(cpu_base); - gettime = 0; - } - - raw_spin_lock(&cpu_base->lock); - - while ((node = timerqueue_getnext(&base->active))) { - struct hrtimer *timer; - - timer = container_of(node, struct hrtimer, node); - if (base->softirq_time.tv64 <= - hrtimer_get_expires_tv64(timer)) - break; - - __run_hrtimer(timer, &base->softirq_time); - } - raw_spin_unlock(&cpu_base->lock); - } + raw_spin_lock(&cpu_base->lock); + now = hrtimer_update_base(cpu_base); + __hrtimer_run_queues(cpu_base, now); + raw_spin_unlock(&cpu_base->lock); } /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 946acb72179f..dd1efa6a4ea4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1925,37 +1925,6 @@ void do_timer(unsigned long ticks) calc_global_load(ticks); } -/** - * ktime_get_update_offsets_tick - hrtimer helper - * @offs_real: pointer to storage for monotonic -> realtime offset - * @offs_boot: pointer to storage for monotonic -> boottime offset - * @offs_tai: pointer to storage for monotonic -> clock tai offset - * - * Returns monotonic time at last tick and various offsets - */ -ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, - ktime_t *offs_tai) -{ - struct timekeeper *tk = &tk_core.timekeeper; - unsigned int seq; - ktime_t base; - u64 nsecs; - - do { - seq = read_seqcount_begin(&tk_core.seq); - - base = tk->tkr_mono.base; - nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; - - *offs_real = tk->offs_real; - *offs_boot = tk->offs_boot; - *offs_tai = tk->offs_tai; - } while (read_seqcount_retry(&tk_core.seq, seq)); - - return ktime_add_ns(base, nsecs); -} - -#ifdef CONFIG_HIGH_RES_TIMERS /** * ktime_get_update_offsets_now - hrtimer helper * @offs_real: pointer to storage for monotonic -> realtime offset @@ -1986,7 +1955,6 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, return ktime_add_ns(base, nsecs); } -#endif /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 5b57f6c9ae34..4d177fce37d5 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -3,9 +3,6 @@ /* * Internal interfaces for kernel/time/ */ -extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, - ktime_t *offs_boot, - ktime_t *offs_tai); extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai); -- cgit v1.2.3 From 868a3e915f7f5eba8f8cb4f7da2276760807c51c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:37 +0000 Subject: hrtimer: Make offset update smarter On every tick/hrtimer interrupt we update the offset variables of the clock bases. That's silly because these offsets change very seldom. Add a sequence counter to the time keeping code which keeps track of the offset updates (clock_was_set()). Have a sequence cache in the hrtimer cpu bases to evaluate whether the offsets must be updated or not. This allows us later to avoid pointless cacheline pollution. Signed-off-by: Thomas Gleixner Reviewed-by: Preeti U Murthy Acked-by: Peter Zijlstra Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: John Stultz Link: http://lkml.kernel.org/r/20150414203501.132820245@linutronix.de Signed-off-by: Thomas Gleixner Cc: John Stultz --- include/linux/hrtimer.h | 4 ++-- include/linux/timekeeper_internal.h | 2 ++ kernel/time/hrtimer.c | 3 ++- kernel/time/timekeeping.c | 23 ++++++++++++++++------- kernel/time/timekeeping.h | 7 ++++--- 5 files changed, 26 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index e292830b58f0..5e04f8fc26f6 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -163,7 +163,7 @@ enum hrtimer_base_type { * and timers * @cpu: cpu number * @active_bases: Bitfield to mark bases with active timers - * @clock_was_set: Indicates that clock was set from irq context. + * @clock_was_set_seq: Sequence counter of clock was set events * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() * @in_hrtirq: hrtimer_interrupt() is currently executing @@ -179,7 +179,7 @@ struct hrtimer_cpu_base { raw_spinlock_t lock; unsigned int cpu; unsigned int active_bases; - unsigned int clock_was_set; + unsigned int clock_was_set_seq; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; int in_hrtirq; diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index fb86963859c7..6f8276ae579c 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -49,6 +49,7 @@ struct tk_read_base { * @offs_boot: Offset clock monotonic -> clock boottime * @offs_tai: Offset clock monotonic -> clock tai * @tai_offset: The current UTC to TAI offset in seconds + * @clock_was_set_seq: The sequence number of clock was set events * @raw_time: Monotonic raw base time in timespec64 format * @cycle_interval: Number of clock cycles in one NTP interval * @xtime_interval: Number of clock shifted nano seconds in one NTP @@ -85,6 +86,7 @@ struct timekeeper { ktime_t offs_boot; ktime_t offs_tai; s32 tai_offset; + unsigned int clock_was_set_seq; struct timespec64 raw_time; /* The following members are for timekeeping internal use */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9e111dd83ca3..8ce9b3138017 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -451,7 +451,8 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai); + return ktime_get_update_offsets_now(&base->clock_was_set_seq, + offs_real, offs_boot, offs_tai); } /* High resolution timer related functions */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index dd1efa6a4ea4..3365e32dc208 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -602,6 +602,9 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + + if (action & TK_CLOCK_WAS_SET) + tk->clock_was_set_seq++; } /** @@ -1927,15 +1930,19 @@ void do_timer(unsigned long ticks) /** * ktime_get_update_offsets_now - hrtimer helper + * @cwsseq: pointer to check and store the clock was set sequence number * @offs_real: pointer to storage for monotonic -> realtime offset * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset * - * Returns current monotonic time and updates the offsets + * Returns current monotonic time and updates the offsets if the + * sequence number in @cwsseq and timekeeper.clock_was_set_seq are + * different. + * * Called from hrtimer_interrupt() or retrigger_next_event() */ -ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, - ktime_t *offs_tai) +ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, + ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; @@ -1947,10 +1954,12 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); - - *offs_real = tk->offs_real; - *offs_boot = tk->offs_boot; - *offs_tai = tk->offs_tai; + if (*cwsseq != tk->clock_was_set_seq) { + *cwsseq = tk->clock_was_set_seq; + *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; + *offs_tai = tk->offs_tai; + } } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 4d177fce37d5..704f595ce83f 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -3,9 +3,10 @@ /* * Internal interfaces for kernel/time/ */ -extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, - ktime_t *offs_boot, - ktime_t *offs_tai); +extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, + ktime_t *offs_real, + ktime_t *offs_boot, + ktime_t *offs_tai); extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); -- cgit v1.2.3 From e19ffe8be2cd0a1f726b235443eba21e64f6be5e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:39 +0000 Subject: hrtimer: Use bits for various boolean indicators No point in wasting 12 byte storage space. Generates better code as well. Text size reduction: x8664 -64, i386 -16, ARM -132, ARM64 -0, power64 -48 Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.227955358@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 6 +++--- kernel/time/hrtimer.c | 24 ++++++++++++++++-------- 2 files changed, 19 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 5e04f8fc26f6..17a59ddcc79a 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -181,10 +181,10 @@ struct hrtimer_cpu_base { unsigned int active_bases; unsigned int clock_was_set_seq; #ifdef CONFIG_HIGH_RES_TIMERS + unsigned int in_hrtirq : 1, + hres_active : 1, + hang_detected : 1; ktime_t expires_next; - int in_hrtirq; - int hres_active; - int hang_detected; unsigned int nr_events; unsigned int nr_retries; unsigned int nr_hangs; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 8ce9b3138017..9bbfe33f6575 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -492,9 +492,14 @@ static inline int hrtimer_is_hres_enabled(void) /* * Is the high resolution mode active ? */ +static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) +{ + return cpu_base->hres_active; +} + static inline int hrtimer_hres_active(void) { - return __this_cpu_read(hrtimer_bases.hres_active); + return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); } /* @@ -628,7 +633,7 @@ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); - if (!hrtimer_hres_active()) + if (!base->hres_active) return; raw_spin_lock(&base->lock); @@ -685,6 +690,7 @@ void clock_was_set_delayed(void) #else +static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; } static inline int hrtimer_hres_active(void) { return 0; } static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } @@ -862,25 +868,27 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, unsigned long newstate, int reprogram) { + struct hrtimer_cpu_base *cpu_base = base->cpu_base; struct timerqueue_node *next_timer; + if (!(timer->state & HRTIMER_STATE_ENQUEUED)) goto out; next_timer = timerqueue_getnext(&base->active); timerqueue_del(&base->active, &timer->node); if (!timerqueue_getnext(&base->active)) - base->cpu_base->active_bases &= ~(1 << base->index); + cpu_base->active_bases &= ~(1 << base->index); if (&timer->node == next_timer) { #ifdef CONFIG_HIGH_RES_TIMERS /* Reprogram the clock event device. if enabled */ - if (reprogram && hrtimer_hres_active()) { + if (reprogram && cpu_base->hres_active) { ktime_t expires; expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - if (base->cpu_base->expires_next.tv64 == expires.tv64) - hrtimer_force_reprogram(base->cpu_base, 1); + if (cpu_base->expires_next.tv64 == expires.tv64) + hrtimer_force_reprogram(cpu_base, 1); } #endif } @@ -1114,7 +1122,7 @@ ktime_t hrtimer_get_next_event(void) raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (!hrtimer_hres_active()) + if (!__hrtimer_hres_active(cpu_base)) mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), ktime_get()); @@ -1412,7 +1420,7 @@ void hrtimer_run_queues(void) struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t now; - if (hrtimer_hres_active()) + if (__hrtimer_hres_active(cpu_base)) return; raw_spin_lock(&cpu_base->lock); -- cgit v1.2.3 From 6d9a1411393d51f17bee3fe163430b21b2cb2de9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:42 +0000 Subject: hrtimer: Cache line align the hrtimer cpu base We really want that data structure to start at a cache line boundary. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.417597627@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 17a59ddcc79a..0853f52f8ffb 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -191,7 +191,7 @@ struct hrtimer_cpu_base { unsigned int max_hang_time; #endif struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; -}; +} ____cacheline_aligned; static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { -- cgit v1.2.3 From b8e38413ac2c33c497e72895fcd5da709fd1b908 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:44 +0000 Subject: hrtimer: Align the hrtimer clock bases as well We don't use cacheline_align here because that might waste lot of space on 32bit machine with 64 bytes cachelines and on 64bit machines with 128 bytes cachelines. The size of struct hrtimer_clock_base is 64byte on 64bit and 32byte on 32bit machines. So we utilize the cache lines proper. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.498165771@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 0853f52f8ffb..e5c22d611850 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -130,6 +130,12 @@ struct hrtimer_sleeper { struct task_struct *task; }; +#ifdef CONFIG_64BIT +# define HRTIMER_CLOCK_BASE_ALIGN 64 +#else +# define HRTIMER_CLOCK_BASE_ALIGN 32 +#endif + /** * struct hrtimer_clock_base - the timer base for a specific clock * @cpu_base: per cpu clock base @@ -147,7 +153,7 @@ struct hrtimer_clock_base { struct timerqueue_head active; ktime_t (*get_time)(void); ktime_t offset; -}; +} __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN))); enum hrtimer_base_type { HRTIMER_BASE_MONOTONIC, @@ -195,6 +201,8 @@ struct hrtimer_cpu_base { static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { + BUILD_BUG_ON(sizeof(struct hrtimer_clock_base) > HRTIMER_CLOCK_BASE_ALIGN); + timer->node.expires = time; timer->_softexpires = time; } -- cgit v1.2.3 From c320642e1ced3b81592610e374894fea995f475b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:46 +0000 Subject: timerqueue: Let timerqueue_add/del return information The hrtimer code is interested whether the added timer is the first one to expire and whether the removed timer was the last one in the tree. The add/del routines have that information already. So we can return it right away instead of reevaluating it at the call site. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: John Stultz Link: http://lkml.kernel.org/r/20150414203501.579063647@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/timerqueue.h | 8 ++++---- lib/timerqueue.c | 10 +++++++--- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index a520fd70a59f..7eec17ad7fa1 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -16,10 +16,10 @@ struct timerqueue_head { }; -extern void timerqueue_add(struct timerqueue_head *head, - struct timerqueue_node *node); -extern void timerqueue_del(struct timerqueue_head *head, - struct timerqueue_node *node); +extern bool timerqueue_add(struct timerqueue_head *head, + struct timerqueue_node *node); +extern bool timerqueue_del(struct timerqueue_head *head, + struct timerqueue_node *node); extern struct timerqueue_node *timerqueue_iterate_next( struct timerqueue_node *node); diff --git a/lib/timerqueue.c b/lib/timerqueue.c index a382e4a32609..782ae8ca2c06 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -36,7 +36,7 @@ * Adds the timer node to the timerqueue, sorted by the * node's expires value. */ -void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) +bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) { struct rb_node **p = &head->head.rb_node; struct rb_node *parent = NULL; @@ -56,8 +56,11 @@ void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) rb_link_node(&node->node, parent, p); rb_insert_color(&node->node, &head->head); - if (!head->next || node->expires.tv64 < head->next->expires.tv64) + if (!head->next || node->expires.tv64 < head->next->expires.tv64) { head->next = node; + return true; + } + return false; } EXPORT_SYMBOL_GPL(timerqueue_add); @@ -69,7 +72,7 @@ EXPORT_SYMBOL_GPL(timerqueue_add); * * Removes the timer node from the timerqueue. */ -void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) +bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) { WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); @@ -82,6 +85,7 @@ void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) } rb_erase(&node->node, &head->head); RB_CLEAR_NODE(&node->node); + return head->next != NULL; } EXPORT_SYMBOL_GPL(timerqueue_del); -- cgit v1.2.3 From 895bdfa793f6e912d1a58fc445b3dd4d686f7bd3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:49 +0000 Subject: hrtimer: Keep pointer to first timer and simplify __remove_hrtimer() __remove_hrtimer() needs to evaluate the expiry time to figure out whether the timer which is removed is eventually the first expiring timer on the cpu. Keep a pointer to it, which is lazily updated, so we can avoid the evaluation dance and retrieve the information from there. Generates slightly better code. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.752838019@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 6 ++++++ kernel/time/hrtimer.c | 46 ++++++++++++++++++++++++++++------------------ 2 files changed, 34 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index e5c22d611850..d194c1dacdaa 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -172,6 +172,7 @@ enum hrtimer_base_type { * @clock_was_set_seq: Sequence counter of clock was set events * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() + * @next_timer: Pointer to the first expiring timer * @in_hrtirq: hrtimer_interrupt() is currently executing * @hres_active: State of high resolution mode * @hang_detected: The last hrtimer interrupt detected a hang @@ -180,6 +181,10 @@ enum hrtimer_base_type { * @nr_hangs: Total number of hrtimer interrupt hangs * @max_hang_time: Maximum time spent in hrtimer_interrupt * @clock_base: array of clock bases for this cpu + * + * Note: next_timer is just an optimization for __remove_hrtimer(). + * Do not dereference the pointer because it is not reliable on + * cross cpu removals. */ struct hrtimer_cpu_base { raw_spinlock_t lock; @@ -191,6 +196,7 @@ struct hrtimer_cpu_base { hres_active : 1, hang_detected : 1; ktime_t expires_next; + struct hrtimer *next_timer; unsigned int nr_events; unsigned int nr_retries; unsigned int nr_hangs; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 0cd1e0b8099d..30178d0656cf 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -415,12 +415,21 @@ static inline void debug_deactivate(struct hrtimer *timer) } #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base, + struct hrtimer *timer) +{ +#ifdef CONFIG_HIGH_RES_TIMERS + cpu_base->next_timer = timer; +#endif +} + static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) { struct hrtimer_clock_base *base = cpu_base->clock_base; ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; unsigned int active = cpu_base->active_bases; + hrtimer_update_next_timer(cpu_base, NULL); for (; active; base++, active >>= 1) { struct timerqueue_node *next; struct hrtimer *timer; @@ -431,8 +440,10 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - if (expires.tv64 < expires_next.tv64) + if (expires.tv64 < expires_next.tv64) { expires_next = expires; + hrtimer_update_next_timer(cpu_base, timer); + } } /* * clock_was_set() might have changed base->offset of any of @@ -597,6 +608,8 @@ static int hrtimer_reprogram(struct hrtimer *timer, if (cpu_base->in_hrtirq) return 0; + cpu_base->next_timer = timer; + /* * If a hang was detected in the last timer interrupt then we * do not schedule a timer which is earlier than the expiry @@ -868,30 +881,27 @@ static void __remove_hrtimer(struct hrtimer *timer, unsigned long newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - struct timerqueue_node *next_timer; + unsigned int state = timer->state; - if (!(timer->state & HRTIMER_STATE_ENQUEUED)) - goto out; + timer->state = newstate; + if (!(state & HRTIMER_STATE_ENQUEUED)) + return; - next_timer = timerqueue_getnext(&base->active); if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); - if (&timer->node == next_timer) { #ifdef CONFIG_HIGH_RES_TIMERS - /* Reprogram the clock event device. if enabled */ - if (reprogram && cpu_base->hres_active) { - ktime_t expires; - - expires = ktime_sub(hrtimer_get_expires(timer), - base->offset); - if (cpu_base->expires_next.tv64 == expires.tv64) - hrtimer_force_reprogram(cpu_base, 1); - } + /* + * Note: If reprogram is false we do not update + * cpu_base->next_timer. This happens when we remove the first + * timer on a remote cpu. No harm as we never dereference + * cpu_base->next_timer. So the worst thing what can happen is + * an superflous call to hrtimer_force_reprogram() on the + * remote cpu later on if the same timer gets enqueued again. + */ + if (reprogram && timer == cpu_base->next_timer) + hrtimer_force_reprogram(cpu_base, 1); #endif - } -out: - timer->state = newstate; } /* -- cgit v1.2.3 From c6eb3f70d4482806dc2d3e1e3c7736f497b1d418 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:51 +0000 Subject: hrtimer: Get rid of hrtimer softirq hrtimer softirq is a leftover from the initial implementation and serves only the purpose to handle the enqueueing of already expired timers in the high resolution timer mode. We discussed whether we change the return value and force all start sites to handle that the timer is already expired, but that would be a Herculean task and I'm not sure whether its a good idea to enforce that handling on everyone. A simpler solution is to enforce a timer interrupt instead of raising and scheduling a softirq. Just use the existing infrastructure to do so and remove all the softirq leftovers. The HRTIMER softirq enum is now unused, but kept around because trace parsers rely on the existing numbering. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.840834708@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 1 - include/linux/interrupt.h | 3 +- kernel/time/hrtimer.c | 163 ++++++++++++---------------------------------- kernel/time/tick-common.c | 10 +++ kernel/time/timer.c | 2 - 5 files changed, 55 insertions(+), 124 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index d194c1dacdaa..048270a27bc5 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -459,7 +459,6 @@ extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ extern void hrtimer_run_queues(void); -extern void hrtimer_run_pending(void); /* Bootup initialization: */ extern void __init hrtimers_init(void); diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 950ae4501826..6bf15a66bce7 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -413,7 +413,8 @@ enum BLOCK_IOPOLL_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, - HRTIMER_SOFTIRQ, + HRTIMER_SOFTIRQ, /* Unused, but kept as tools rely on the + numbering. Sigh! */ RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 30178d0656cf..fc6b6d25f93d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -555,59 +555,48 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) } /* - * Shared reprogramming for clock_realtime and clock_monotonic - * * When a timer is enqueued and expires earlier than the already enqueued * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * - * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming - * and no expiry check happens. The timer gets enqueued into the rbtree. The - * reprogramming and expiry check is done in the hrtimer_interrupt or in the - * softirq. - * * Called with interrupts disabled and base->cpu_base.lock held */ -static int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) +static void hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - int res; WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); /* - * When the callback is running, we do not reprogram the clock event - * device. The timer callback is either running on a different CPU or - * the callback is executed in the hrtimer_interrupt context. The - * reprogramming is handled either by the softirq, which called the - * callback or at the end of the hrtimer_interrupt. + * If the timer is not on the current cpu, we cannot reprogram + * the other cpus clock event device. */ - if (hrtimer_callback_running(timer)) - return 0; + if (base->cpu_base != cpu_base) + return; + + /* + * If the hrtimer interrupt is running, then it will + * reevaluate the clock bases and reprogram the clock event + * device. The callbacks are always executed in hard interrupt + * context so we don't need an extra check for a running + * callback. + */ + if (cpu_base->in_hrtirq) + return; /* * CLOCK_REALTIME timer might be requested with an absolute - * expiry time which is less than base->offset. Nothing wrong - * about that, just avoid to call into the tick code, which - * has now objections against negative expiry values. + * expiry time which is less than base->offset. Set it to 0. */ if (expires.tv64 < 0) - return -ETIME; + expires.tv64 = 0; if (expires.tv64 >= cpu_base->expires_next.tv64) - return 0; - - /* - * When the target cpu of the timer is currently executing - * hrtimer_interrupt(), then we do not touch the clock event - * device. hrtimer_interrupt() will reevaluate all clock bases - * before reprogramming the device. - */ - if (cpu_base->in_hrtirq) - return 0; + return; + /* Update the pointer to the next expiring timer */ cpu_base->next_timer = timer; /* @@ -617,15 +606,14 @@ static int hrtimer_reprogram(struct hrtimer *timer, * to make progress. */ if (cpu_base->hang_detected) - return 0; + return; /* - * Clockevents returns -ETIME, when the event was in the past. + * Program the timer hardware. We enforce the expiry for + * events which are already in the past. */ - res = tick_program_event(expires, 0); - if (!IS_ERR_VALUE(res)) - cpu_base->expires_next = expires; - return res; + cpu_base->expires_next = expires; + tick_program_event(expires, 1); } /* @@ -660,19 +648,11 @@ static void retrigger_next_event(void *arg) */ static int hrtimer_switch_to_hres(void) { - int cpu = smp_processor_id(); - struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); - unsigned long flags; - - if (base->hres_active) - return 1; - - local_irq_save(flags); + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { - local_irq_restore(flags); printk(KERN_WARNING "Could not switch to high resolution " - "mode on CPU %d\n", cpu); + "mode on CPU %d\n", base->cpu); return 0; } base->hres_active = 1; @@ -681,7 +661,6 @@ static int hrtimer_switch_to_hres(void) tick_setup_sched_timer(); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); - local_irq_restore(flags); return 1; } @@ -984,26 +963,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * on dynticks target. */ wake_up_nohz_cpu(new_base->cpu_base->cpu); - } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) && - hrtimer_reprogram(timer, new_base)) { - /* - * Only allow reprogramming if the new base is on this CPU. - * (it might still be on another CPU if the timer was pending) - * - * XXX send_remote_softirq() ? - */ - if (wakeup) { - /* - * We need to drop cpu_base->lock to avoid a - * lock ordering issue vs. rq->lock. - */ - raw_spin_unlock(&new_base->cpu_base->lock); - raise_softirq_irqoff(HRTIMER_SOFTIRQ); - local_irq_restore(flags); - return ret; - } else { - __raise_softirq_irqoff(HRTIMER_SOFTIRQ); - } + } else { + hrtimer_reprogram(timer, new_base); } unlock_hrtimer_base(timer, &flags); @@ -1354,7 +1315,7 @@ retry: * local version of hrtimer_peek_ahead_timers() called with interrupts * disabled. */ -static void __hrtimer_peek_ahead_timers(void) +static inline void __hrtimer_peek_ahead_timers(void) { struct tick_device *td; @@ -1366,29 +1327,6 @@ static void __hrtimer_peek_ahead_timers(void) hrtimer_interrupt(td->evtdev); } -/** - * hrtimer_peek_ahead_timers -- run soft-expired timers now - * - * hrtimer_peek_ahead_timers will peek at the timer queue of - * the current cpu and check if there are any timers for which - * the soft expires time has passed. If any such timers exist, - * they are run immediately and then removed from the timer queue. - * - */ -void hrtimer_peek_ahead_timers(void) -{ - unsigned long flags; - - local_irq_save(flags); - __hrtimer_peek_ahead_timers(); - local_irq_restore(flags); -} - -static void run_hrtimer_softirq(struct softirq_action *h) -{ - hrtimer_peek_ahead_timers(); -} - #else /* CONFIG_HIGH_RES_TIMERS */ static inline void __hrtimer_peek_ahead_timers(void) { } @@ -1396,31 +1334,7 @@ static inline void __hrtimer_peek_ahead_timers(void) { } #endif /* !CONFIG_HIGH_RES_TIMERS */ /* - * Called from timer softirq every jiffy, expire hrtimers: - * - * For HRT its the fall back code to run the softirq in the timer - * softirq context in case the hrtimer initialization failed or has - * not been done yet. - */ -void hrtimer_run_pending(void) -{ - if (hrtimer_hres_active()) - return; - - /* - * This _is_ ugly: We have to check in the softirq context, - * whether we can switch to highres and / or nohz mode. The - * clocksource switch happens in the timer interrupt with - * xtime_lock held. Notification from there only sets the - * check bit in the tick_oneshot code, otherwise we might - * deadlock vs. xtime_lock. - */ - if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) - hrtimer_switch_to_hres(); -} - -/* - * Called from hardirq context every jiffy + * Called from run_local_timers in hardirq context every jiffy */ void hrtimer_run_queues(void) { @@ -1430,6 +1344,18 @@ void hrtimer_run_queues(void) if (__hrtimer_hres_active(cpu_base)) return; + /* + * This _is_ ugly: We have to check periodically, whether we + * can switch to highres and / or nohz mode. The clocksource + * switch happens with xtime_lock held. Notification from + * there only sets the check bit in the tick_oneshot code, + * otherwise we might deadlock vs. xtime_lock. + */ + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) { + hrtimer_switch_to_hres(); + return; + } + raw_spin_lock(&cpu_base->lock); now = hrtimer_update_base(cpu_base); __hrtimer_run_queues(cpu_base, now); @@ -1700,9 +1626,6 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); -#ifdef CONFIG_HIGH_RES_TIMERS - open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); -#endif } /** diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 3ae6afa1eb98..ea5f9eae8f74 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -102,6 +102,16 @@ void tick_handle_periodic(struct clock_event_device *dev) tick_periodic(cpu); +#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON) + /* + * The cpu might have transitioned to HIGHRES or NOHZ mode via + * update_process_times() -> run_local_timers() -> + * hrtimer_run_queues(). + */ + if (dev->event_handler != tick_handle_periodic) + return; +#endif + if (dev->state != CLOCK_EVT_STATE_ONESHOT) return; for (;;) { diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2ece3aa5069c..b31f13f4fe41 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1409,8 +1409,6 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __this_cpu_read(tvec_bases); - hrtimer_run_pending(); - if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); } -- cgit v1.2.3 From c1ad348b452aacd784fb97403d03d71723c72ee1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:58 +0000 Subject: tick: Nohz: Rework next timer evaluation The evaluation of the next timer in the nohz code is based on jiffies while all the tick internals are nano seconds based. We have also to convert hrtimer nanoseconds to jiffies in the !highres case. That's just wrong and introduces interesting corner cases. Turn it around and convert the next timer wheel timer expiry and the rcu event to clock monotonic and base all calculations on nanoseconds. That identifies the case where no timer is pending clearly with an absolute expiry value of KTIME_MAX. Makes the code more readable and gets rid of the jiffies magic in the nohz code. Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. McKenney Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: Josh Triplett Cc: Lai Jiangshan Cc: John Stultz Cc: Marcelo Tosatti Link: http://lkml.kernel.org/r/20150414203502.184198593@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 2 +- include/linux/rcupdate.h | 6 ++- include/linux/rcutree.h | 2 +- include/linux/timer.h | 7 --- kernel/rcu/tree_plugin.h | 14 +++--- kernel/time/hrtimer.c | 14 ++---- kernel/time/tick-internal.h | 2 + kernel/time/tick-sched.c | 109 ++++++++++++++++++++------------------------ kernel/time/tick-sched.h | 2 +- kernel/time/timer.c | 71 ++++++++++++++--------------- kernel/time/timer_list.c | 4 +- 11 files changed, 107 insertions(+), 126 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 048270a27bc5..2c68f71ffd24 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -386,7 +386,7 @@ static inline int hrtimer_restart(struct hrtimer *timer) /* Query timers: */ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); -extern ktime_t hrtimer_get_next_event(void); +extern u64 hrtimer_get_next_event(void); /* * A timer is active, when it is enqueued into the rbtree or the diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 573a5afd5ed8..0627a447c589 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -44,6 +44,8 @@ #include #include #include +#include + #include extern int rcu_expedited; /* for sysctl */ @@ -1154,9 +1156,9 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) -static inline int rcu_needs_cpu(unsigned long *delta_jiffies) +static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt) { - *delta_jiffies = ULONG_MAX; + *nextevt = KTIME_MAX; return 0; } #endif /* #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index d2e583a6aaca..db2e31beaae7 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -32,7 +32,7 @@ void rcu_note_context_switch(void); #ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(unsigned long *delta_jiffies); +int rcu_needs_cpu(u64 basem, u64 *nextevt); #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ void rcu_cpu_stall_reset(void); diff --git a/include/linux/timer.h b/include/linux/timer.h index 8c5a197e1587..fbb80e0030bf 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -187,13 +187,6 @@ extern void set_timer_slack(struct timer_list *time, int slack_hz); */ #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) -/* - * Return when the next timer-wheel timeout occurs (in absolute jiffies), - * locks the timer base and does the comparison against the given - * jiffie. - */ -extern unsigned long get_next_timer_interrupt(unsigned long now); - /* * Timer-statistics info: */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8c0ec0f5a027..0ef80a0bbabb 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1368,9 +1368,9 @@ static void rcu_prepare_kthreads(int cpu) * any flavor of RCU. */ #ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(unsigned long *delta_jiffies) +int rcu_needs_cpu(u64 basemono, u64 *nextevt) { - *delta_jiffies = ULONG_MAX; + *nextevt = KTIME_MAX; return rcu_cpu_has_callbacks(NULL); } #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ @@ -1481,16 +1481,17 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * The caller must have disabled interrupts. */ #ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(unsigned long *dj) +int rcu_needs_cpu(u64 basemono, u64 *nextevt) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + unsigned long dj; /* Snapshot to detect later posting of non-lazy callback. */ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; /* If no callbacks, RCU doesn't need the CPU. */ if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) { - *dj = ULONG_MAX; + *nextevt = KTIME_MAX; return 0; } @@ -1504,11 +1505,12 @@ int rcu_needs_cpu(unsigned long *dj) /* Request timer delay depending on laziness, and round. */ if (!rdtp->all_lazy) { - *dj = round_up(rcu_idle_gp_delay + jiffies, + dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; } else { - *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; + dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; } + *nextevt = basemono + dj * TICK_NSEC; return 0; } #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fc6b6d25f93d..179b991cfdcb 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1080,26 +1080,22 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); /** * hrtimer_get_next_event - get the time until next expiry event * - * Returns the delta to the next expiry event or KTIME_MAX if no timer - * is pending. + * Returns the next expiry time or KTIME_MAX if no timer is pending. */ -ktime_t hrtimer_get_next_event(void) +u64 hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - ktime_t mindelta = { .tv64 = KTIME_MAX }; + u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!__hrtimer_hres_active(cpu_base)) - mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), - ktime_get()); + expires = __hrtimer_get_next_event(cpu_base).tv64; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); - if (mindelta.tv64 < 0) - mindelta.tv64 = 0; - return mindelta; + return expires; } #endif diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index b64fdd8054c5..65273f0a11ed 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -137,3 +137,5 @@ extern void tick_nohz_init(void); # else static inline void tick_nohz_init(void) { } #endif + +extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 4c5f4a9dcc0a..753c211f6195 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -582,39 +582,46 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now, int cpu) { - unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; - ktime_t last_update, expires, ret = { .tv64 = 0 }; - unsigned long rcu_delta_jiffies; struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - u64 time_delta; - - time_delta = timekeeping_max_deferment(); + u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; + unsigned long seq, basejiff; + ktime_t tick; /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqbegin(&jiffies_lock); - last_update = last_jiffies_update; - last_jiffies = jiffies; + basemono = last_jiffies_update.tv64; + basejiff = jiffies; } while (read_seqretry(&jiffies_lock, seq)); + ts->last_jiffies = basejiff; - if (rcu_needs_cpu(&rcu_delta_jiffies) || + if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || irq_work_needs_cpu()) { - next_jiffies = last_jiffies + 1; - delta_jiffies = 1; + next_tick = basemono + TICK_NSEC; } else { - /* Get the next timer wheel timer */ - next_jiffies = get_next_timer_interrupt(last_jiffies); - delta_jiffies = next_jiffies - last_jiffies; - if (rcu_delta_jiffies < delta_jiffies) { - next_jiffies = last_jiffies + rcu_delta_jiffies; - delta_jiffies = rcu_delta_jiffies; - } + /* + * Get the next pending timer. If high resolution + * timers are enabled this only takes the timer wheel + * timers into account. If high resolution timers are + * disabled this also looks at the next expiring + * hrtimer. + */ + next_tmr = get_next_timer_interrupt(basejiff, basemono); + ts->next_timer = next_tmr; + /* Take the next rcu event into account */ + next_tick = next_rcu < next_tmr ? next_rcu : next_tmr; } - if ((long)delta_jiffies <= 1) { + /* + * If the tick is due in the next period, keep it ticking or + * restart it proper. + */ + delta = next_tick - basemono; + if (delta <= (u64)TICK_NSEC) { + tick.tv64 = 0; if (!ts->tick_stopped) goto out; - if (delta_jiffies == 0) { + if (delta == 0) { /* Tick is stopped, but required now. Enforce it */ tick_nohz_restart(ts, now); goto out; @@ -629,54 +636,39 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * do_timer() never invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. If this cpu * is the one which had the do_timer() duty last, we limit the - * sleep time to the timekeeping max_deferement value which we - * retrieved above. Otherwise we can sleep as long as we want. + * sleep time to the timekeeping max_deferement value. + * Otherwise we can sleep as long as we want. */ + delta = timekeeping_max_deferment(); if (cpu == tick_do_timer_cpu) { tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->do_timer_last = 1; } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { - time_delta = KTIME_MAX; + delta = KTIME_MAX; ts->do_timer_last = 0; } else if (!ts->do_timer_last) { - time_delta = KTIME_MAX; + delta = KTIME_MAX; } #ifdef CONFIG_NO_HZ_FULL + /* Limit the tick delta to the maximum scheduler deferment */ if (!ts->inidle) - time_delta = min(time_delta, scheduler_tick_max_deferment()); + delta = min(delta, scheduler_tick_max_deferment()); #endif - /* - * calculate the expiry time for the next timer wheel - * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that - * there is no timer pending or at least extremely far into - * the future (12 days for HZ=1000). In this case we set the - * expiry to the end of time. - */ - if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { - /* - * Calculate the time delta for the next timer event. - * If the time delta exceeds the maximum time delta - * permitted by the current clocksource then adjust - * the time delta accordingly to ensure the - * clocksource does not wrap. - */ - time_delta = min_t(u64, time_delta, - tick_period.tv64 * delta_jiffies); - } - - if (time_delta < KTIME_MAX) - expires = ktime_add_ns(last_update, time_delta); + /* Calculate the next expiry time */ + if (delta < (KTIME_MAX - basemono)) + expires = basemono + delta; else - expires.tv64 = KTIME_MAX; + expires = KTIME_MAX; + + expires = min_t(u64, expires, next_tick); + tick.tv64 = expires; /* Skip reprogram of event if its not changed */ - if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) + if (ts->tick_stopped && (expires == dev->next_event.tv64)) goto out; - ret = expires; - /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when @@ -694,26 +686,23 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, } /* - * If the expiration time == KTIME_MAX, then - * in this case we simply stop the tick timer. + * If the expiration time == KTIME_MAX, then we simply stop + * the tick timer. */ - if (unlikely(expires.tv64 == KTIME_MAX)) { + if (unlikely(expires == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); goto out; } if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_start(&ts->sched_timer, expires, - HRTIMER_MODE_ABS_PINNED); + hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); else - tick_program_event(expires, 1); + tick_program_event(tick, 1); out: - ts->next_jiffies = next_jiffies; - ts->last_jiffies = last_jiffies; + /* Update the estimated sleep length */ ts->sleep_length = ktime_sub(dev->next_event, now); - - return ret; + return tick; } static void tick_nohz_full_stop_tick(struct tick_sched *ts) diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 28b5da3e1a17..42fdf4958bcc 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -57,7 +57,7 @@ struct tick_sched { ktime_t iowait_sleeptime; ktime_t sleep_length; unsigned long last_jiffies; - unsigned long next_jiffies; + u64 next_timer; ktime_t idle_expires; int do_timer_last; }; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index b31f13f4fe41..172b83cd2f8e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -49,6 +49,8 @@ #include #include +#include "tick-internal.h" + #define CREATE_TRACE_POINTS #include @@ -1311,54 +1313,48 @@ cascade: * Check, if the next hrtimer event is before the next timer wheel * event: */ -static unsigned long cmp_next_hrtimer_event(unsigned long now, - unsigned long expires) +static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) { - ktime_t hr_delta = hrtimer_get_next_event(); - struct timespec tsdelta; - unsigned long delta; - - if (hr_delta.tv64 == KTIME_MAX) - return expires; + u64 nextevt = hrtimer_get_next_event(); /* - * Expired timer available, let it expire in the next tick + * If high resolution timers are enabled + * hrtimer_get_next_event() returns KTIME_MAX. */ - if (hr_delta.tv64 <= 0) - return now + 1; - - tsdelta = ktime_to_timespec(hr_delta); - delta = timespec_to_jiffies(&tsdelta); + if (expires <= nextevt) + return expires; /* - * Limit the delta to the max value, which is checked in - * tick_nohz_stop_sched_tick(): + * If the next timer is already expired, return the tick base + * time so the tick is fired immediately. */ - if (delta > NEXT_TIMER_MAX_DELTA) - delta = NEXT_TIMER_MAX_DELTA; + if (nextevt <= basem) + return basem; /* - * Take rounding errors in to account and make sure, that it - * expires in the next tick. Otherwise we go into an endless - * ping pong due to tick_nohz_stop_sched_tick() retriggering - * the timer softirq + * Round up to the next jiffie. High resolution timers are + * off, so the hrtimers are expired in the tick and we need to + * make sure that this tick really expires the timer to avoid + * a ping pong of the nohz stop code. + * + * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3 */ - if (delta < 1) - delta = 1; - now += delta; - if (time_before(now, expires)) - return now; - return expires; + return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC; } /** - * get_next_timer_interrupt - return the jiffy of the next pending timer - * @now: current time (in jiffies) + * get_next_timer_interrupt - return the time (clock mono) of the next timer + * @basej: base time jiffies + * @basem: base time clock monotonic + * + * Returns the tick aligned clock monotonic time of the next pending + * timer or KTIME_MAX if no timer is pending. */ -unsigned long get_next_timer_interrupt(unsigned long now) +u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { struct tvec_base *base = __this_cpu_read(tvec_bases); - unsigned long expires = now + NEXT_TIMER_MAX_DELTA; + u64 expires = KTIME_MAX; + unsigned long nextevt; /* * Pretend that there is no timer pending if the cpu is offline. @@ -1371,14 +1367,15 @@ unsigned long get_next_timer_interrupt(unsigned long now) if (base->active_timers) { if (time_before_eq(base->next_timer, base->timer_jiffies)) base->next_timer = __next_timer_interrupt(base); - expires = base->next_timer; + nextevt = base->next_timer; + if (time_before_eq(nextevt, basej)) + expires = basem; + else + expires = basem + (nextevt - basej) * TICK_NSEC; } spin_unlock(&base->lock); - if (time_before_eq(expires, now)) - return now; - - return cmp_next_hrtimer_event(now, expires); + return cmp_next_hrtimer_event(basem, expires); } #endif diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 6232fc536185..66f39bba5353 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -191,7 +191,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P_ns(idle_sleeptime); P_ns(iowait_sleeptime); P(last_jiffies); - P(next_jiffies); + P(next_timer); P_ns(idle_expires); SEQ_printf(m, "jiffies: %Lu\n", (unsigned long long)jiffies); @@ -289,7 +289,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m) static inline void timer_list_header(struct seq_file *m, u64 now) { - SEQ_printf(m, "Timer List Version: v0.7\n"); + SEQ_printf(m, "Timer List Version: v0.8\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); SEQ_printf(m, "\n"); -- cgit v1.2.3 From 58f1f803f1d6ef9ab280de13246d65970a09cb95 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:08 +0000 Subject: hrtimer: Get rid of __hrtimer_start_range_ns() No more callers. Remove the leftovers. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203502.707871492@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 4 ---- kernel/time/hrtimer.c | 38 +++++++++++++++----------------------- 2 files changed, 15 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 2c68f71ffd24..a80baa86bb24 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -359,10 +359,6 @@ extern int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode); extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long range_ns, const enum hrtimer_mode mode); -extern int -__hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, - const enum hrtimer_mode mode, int wakeup); extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 179b991cfdcb..88d6ea25dde4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -916,9 +916,20 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) return 0; } -int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode, - int wakeup) +/** + * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) + * + * Returns: + * 0 on success + * 1 when the timer was active + */ +int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; @@ -971,25 +982,6 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, return ret; } -EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns); - -/** - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) -{ - return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1); -} EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** @@ -1006,7 +998,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { - return __hrtimer_start_range_ns(timer, tim, 0, mode, 1); + return hrtimer_start_range_ns(timer, tim, 0, mode); } EXPORT_SYMBOL_GPL(hrtimer_start); -- cgit v1.2.3 From 02a171af1a46966dcdb5b38cdc33e4f43e92c778 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:10 +0000 Subject: hrtimer: Make hrtimer_start() a inline wrapper No point for an extra export just to set the extra argument of hrtimer_start_range_ns() to 0. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203502.808544539@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 19 +++++++++++++++++-- kernel/time/hrtimer.c | 19 ------------------- 2 files changed, 17 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index a80baa86bb24..42074ab3d5c3 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -355,11 +355,26 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } #endif /* Basic timer operations: */ -extern int hrtimer_start(struct hrtimer *timer, ktime_t tim, - const enum hrtimer_mode mode); extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long range_ns, const enum hrtimer_mode mode); +/** + * hrtimer_start - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) + * + * Returns: + * 0 on success + * 1 when the timer was active + */ +static inline int hrtimer_start(struct hrtimer *timer, ktime_t tim, + const enum hrtimer_mode mode) +{ + return hrtimer_start_range_ns(timer, tim, 0, mode); +} + extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 88d6ea25dde4..e5cf71aa6d77 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -984,25 +984,6 @@ int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); -/** - * hrtimer_start - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int -hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) -{ - return hrtimer_start_range_ns(timer, tim, 0, mode); -} -EXPORT_SYMBOL_GPL(hrtimer_start); - - /** * hrtimer_try_to_cancel - try to deactivate a timer * @timer: hrtimer to stop -- cgit v1.2.3 From b193217e6dc3f88b599b573b53e0e0f6671d969a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:18 +0000 Subject: alarmtimer: Get rid of unused return value We want to get rid of the hrtimer_start() return value and the alarm timer return value is nowhere used. Remove it. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: John Stultz Link: http://lkml.kernel.org/r/20150414203503.243910615@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/alarmtimer.h | 4 ++-- kernel/time/alarmtimer.c | 11 ++++------- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index a899402a5a0e..52f3b7da4f2d 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -43,8 +43,8 @@ struct alarm { void alarm_init(struct alarm *alarm, enum alarmtimer_type type, enum alarmtimer_restart (*function)(struct alarm *, ktime_t)); -int alarm_start(struct alarm *alarm, ktime_t start); -int alarm_start_relative(struct alarm *alarm, ktime_t start); +void alarm_start(struct alarm *alarm, ktime_t start); +void alarm_start_relative(struct alarm *alarm, ktime_t start); void alarm_restart(struct alarm *alarm); int alarm_try_to_cancel(struct alarm *alarm); int alarm_cancel(struct alarm *alarm); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 0b55a7570c90..7fbba635a549 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -317,19 +317,16 @@ EXPORT_SYMBOL_GPL(alarm_init); * @alarm: ptr to alarm to set * @start: time to run the alarm */ -int alarm_start(struct alarm *alarm, ktime_t start) +void alarm_start(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; - int ret; spin_lock_irqsave(&base->lock, flags); alarm->node.expires = start; alarmtimer_enqueue(base, alarm); - ret = hrtimer_start(&alarm->timer, alarm->node.expires, - HRTIMER_MODE_ABS); + hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); - return ret; } EXPORT_SYMBOL_GPL(alarm_start); @@ -338,12 +335,12 @@ EXPORT_SYMBOL_GPL(alarm_start); * @alarm: ptr to alarm to set * @start: time relative to now to run the alarm */ -int alarm_start_relative(struct alarm *alarm, ktime_t start) +void alarm_start_relative(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; start = ktime_add(start, base->gettime()); - return alarm_start(alarm, start); + alarm_start(alarm, start); } EXPORT_SYMBOL_GPL(alarm_start_relative); -- cgit v1.2.3 From 61699e13072a89880aa584dcc64c6da465fb2ccc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:23 +0000 Subject: hrtimer: Remove hrtimer_start() return value No user was ever interested whether the timer was active or not when it was started. All abusers of the return value are gone, so get rid of it. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203503.483556394@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 22 +++++++++------------- include/linux/interrupt.h | 6 +++--- kernel/time/hrtimer.c | 23 +++++++---------------- 3 files changed, 19 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 42074ab3d5c3..470d876c2eda 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -355,7 +355,7 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } #endif /* Basic timer operations: */ -extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, +extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long range_ns, const enum hrtimer_mode mode); /** @@ -364,34 +364,30 @@ extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * @tim: expiry time * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active */ -static inline int hrtimer_start(struct hrtimer *timer, ktime_t tim, - const enum hrtimer_mode mode) +static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim, + const enum hrtimer_mode mode) { - return hrtimer_start_range_ns(timer, tim, 0, mode); + hrtimer_start_range_ns(timer, tim, 0, mode); } extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); -static inline int hrtimer_start_expires(struct hrtimer *timer, - enum hrtimer_mode mode) +static inline void hrtimer_start_expires(struct hrtimer *timer, + enum hrtimer_mode mode) { unsigned long delta; ktime_t soft, hard; soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); delta = ktime_to_ns(ktime_sub(hard, soft)); - return hrtimer_start_range_ns(timer, soft, delta, mode); + hrtimer_start_range_ns(timer, soft, delta, mode); } -static inline int hrtimer_restart(struct hrtimer *timer) +static inline void hrtimer_restart(struct hrtimer *timer) { - return hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } /* Query timers: */ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 6bf15a66bce7..be7e75c945e9 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -593,10 +593,10 @@ tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, clockid_t which_clock, enum hrtimer_mode mode); static inline -int tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time, - const enum hrtimer_mode mode) +void tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time, + const enum hrtimer_mode mode) { - return hrtimer_start(&ttimer->timer, time, mode); + hrtimer_start(&ttimer->timer, time, mode); } static inline diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index c38f0b6024b4..beab02d3ff1e 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -923,22 +923,18 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) * @delta_ns: "slack" range for the timer * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active */ -int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; - int ret, leftmost; + int leftmost; base = lock_hrtimer_base(timer, &flags); /* Remove an active timer from the queue: */ - ret = remove_hrtimer(timer, base); + remove_hrtimer(timer, base); if (mode & HRTIMER_MODE_REL) { tim = ktime_add_safe(tim, base->get_time()); @@ -962,11 +958,8 @@ int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, timer_stats_hrtimer_set_start_info(timer); leftmost = enqueue_hrtimer(timer, new_base); - - if (!leftmost) { - unlock_hrtimer_base(timer, &flags); - return ret; - } + if (!leftmost) + goto unlock; if (!hrtimer_is_hres_active(timer)) { /* @@ -977,10 +970,8 @@ int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, } else { hrtimer_reprogram(timer, new_base); } - +unlock: unlock_hrtimer_base(timer, &flags); - - return ret; } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); -- cgit v1.2.3 From 59afdc7b32143528524455039e7557a46b60e4c8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 22 Apr 2015 11:28:46 +0800 Subject: crypto: api - Move module sig ifdef into accessor function Currently we're hiding mod->sig_ok under an ifdef in open code. This patch adds a module_sig_ok accessor function and removes that ifdef. Signed-off-by: Herbert Xu Acked-by: Rusty Russell --- crypto/algapi.c | 5 +---- include/linux/module.h | 12 ++++++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/crypto/algapi.c b/crypto/algapi.c index a60f62625b5f..f835f439bb23 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -43,12 +43,9 @@ static inline int crypto_set_driver_name(struct crypto_alg *alg) static inline void crypto_check_module_sig(struct module *mod) { -#ifdef CONFIG_CRYPTO_FIPS - if (fips_enabled && mod && !mod->sig_ok) + if (fips_enabled && mod && !module_sig_ok(mod)) panic("Module %s signature verification failed in FIPS mode\n", mod->name); -#endif - return; } static int crypto_check_alg(struct crypto_alg *alg) diff --git a/include/linux/module.h b/include/linux/module.h index c883b86ea964..1e5436042eb0 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -655,4 +655,16 @@ static inline void module_bug_finalize(const Elf_Ehdr *hdr, static inline void module_bug_cleanup(struct module *mod) {} #endif /* CONFIG_GENERIC_BUG */ +#ifdef CONFIG_MODULE_SIG +static inline bool module_sig_ok(struct module *module) +{ + return module->sig_ok; +} +#else /* !CONFIG_MODULE_SIG */ +static inline bool module_sig_ok(struct module *module) +{ + return true; +} +#endif /* CONFIG_MODULE_SIG */ + #endif /* _LINUX_MODULE_H */ -- cgit v1.2.3 From af87baedf2c23b1181f51323339210a26a64f7fc Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:28 +0800 Subject: x86/htirq: Use new irqdomain interfaces to allocate/free IRQ Use new irqdomain interfaces to allocate/free IRQ for HTIRQ, so we can remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ later. This patch changes the interfaces between arch independent PCI driver and arch specific code. Currently HT_IRQ is only enabled on x86, so it does not affect other architectures. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-7-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/htirq.c | 26 +++++++++++++------------- drivers/pci/htirq.c | 7 +++---- include/linux/htirq.h | 2 ++ 3 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index 816f36e979ad..b307ee7a7148 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -61,31 +62,30 @@ static struct irq_chip ht_irq_chip = { .flags = IRQCHIP_SKIP_SET_WAKE, }; +int arch_alloc_ht_irq(struct pci_dev *dev) +{ + return irq_domain_alloc_irqs(NULL, 1, dev_to_node(&dev->dev), NULL); +} + +void arch_free_ht_irq(int irq) +{ + irq_domain_free_irqs(irq, 1); +} + int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) { struct irq_cfg *cfg; struct ht_irq_msg msg; - unsigned dest; - int err; if (disable_apic) return -ENXIO; cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus(), &dest); - if (err) - return err; - - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); msg.address_lo = HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) | HT_IRQ_LOW_VECTOR(cfg->vector) | ((apic->irq_dest_mode == 0) ? HT_IRQ_LOW_DM_PHYSICAL : diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c index a94dd2c4183a..ceb0ebeb7b5f 100644 --- a/drivers/pci/htirq.c +++ b/drivers/pci/htirq.c @@ -117,8 +117,8 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) cfg->msg.address_lo = 0xffffffff; cfg->msg.address_hi = 0xffffffff; - irq = irq_alloc_hwirq(dev_to_node(&dev->dev)); - if (!irq) { + irq = arch_alloc_ht_irq(dev); + if (irq <= 0) { kfree(cfg); return -EBUSY; } @@ -163,8 +163,7 @@ void ht_destroy_irq(unsigned int irq) cfg = irq_get_handler_data(irq); irq_set_chip(irq, NULL); irq_set_handler_data(irq, NULL); - irq_free_hwirq(irq); - + arch_free_ht_irq(irq); kfree(cfg); } EXPORT_SYMBOL(ht_destroy_irq); diff --git a/include/linux/htirq.h b/include/linux/htirq.h index 70a1dbbf2093..5caa51b7b95c 100644 --- a/include/linux/htirq.h +++ b/include/linux/htirq.h @@ -15,6 +15,8 @@ void unmask_ht_irq(struct irq_data *data); /* The arch hook for getting things started */ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev); +int arch_alloc_ht_irq(struct pci_dev *dev); +void arch_free_ht_irq(int irq); /* For drivers of buggy hardware */ typedef void (ht_irq_update_t)(struct pci_dev *dev, int irq, -- cgit v1.2.3 From b106ee63abccbba5f5a52d6e43168a6a30c6d98a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:32 +0800 Subject: irq_remapping/vt-d: Enhance Intel IR driver to support hierarchical irqdomains Enhance Intel interrupt remapping driver to support hierarchical irqdomains. Implement intel_ir_chip to support stacked irq_chip. Signed-off-by: Jiang Liu Acked-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: David Woodhouse Link: http://lkml.kernel.org/r/1428905519-23704-11-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- drivers/iommu/intel_irq_remapping.c | 337 +++++++++++++++++++++++++++++++++++- include/linux/intel-iommu.h | 4 + 2 files changed, 333 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 0b89ef1e274a..97ea4209fc8d 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +32,14 @@ struct hpet_scope { unsigned int devfn; }; +struct intel_ir_data { + struct irq_2_iommu irq_2_iommu; + struct irte irte_entry; + union { + struct msi_msg msi_entry; + }; +}; + #define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0) #define IRTE_DEST(dest) ((eim_mode) ? dest : dest << 8) @@ -50,6 +59,7 @@ static struct hpet_scope ir_hpet[MAX_HPET_TBS]; * the dmar_global_lock. */ static DEFINE_RAW_SPINLOCK(irq_2_ir_lock); +static struct irq_domain_ops intel_ir_domain_ops; static int __init parse_ioapics_under_ir(void); @@ -263,7 +273,7 @@ static int free_irte(int irq) unsigned long flags; int rc; - if (!irq_iommu) + if (!irq_iommu || irq_iommu->iommu == NULL) return -1; raw_spin_lock_irqsave(&irq_2_ir_lock, flags); @@ -488,7 +498,6 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO, INTR_REMAP_PAGE_ORDER); - if (!pages) { pr_err("IR%d: failed to allocate pages of order %d\n", iommu->seq_id, INTR_REMAP_PAGE_ORDER); @@ -502,11 +511,23 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) goto out_free_pages; } + iommu->ir_domain = irq_domain_add_hierarchy(arch_get_ir_parent_domain(), + 0, INTR_REMAP_TABLE_ENTRIES, + NULL, &intel_ir_domain_ops, + iommu); + if (!iommu->ir_domain) { + pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id); + goto out_free_bitmap; + } + iommu->ir_msi_domain = arch_create_msi_irq_domain(iommu->ir_domain); + ir_table->base = page_address(pages); ir_table->bitmap = bitmap; iommu->ir_table = ir_table; return 0; +out_free_bitmap: + kfree(bitmap); out_free_pages: __free_pages(pages, INTR_REMAP_PAGE_ORDER); out_free_table: @@ -517,6 +538,14 @@ out_free_table: static void intel_teardown_irq_remapping(struct intel_iommu *iommu) { if (iommu && iommu->ir_table) { + if (iommu->ir_msi_domain) { + irq_domain_remove(iommu->ir_msi_domain); + iommu->ir_msi_domain = NULL; + } + if (iommu->ir_domain) { + irq_domain_remove(iommu->ir_domain); + iommu->ir_domain = NULL; + } free_pages((unsigned long)iommu->ir_table->base, INTR_REMAP_PAGE_ORDER); kfree(iommu->ir_table->bitmap); @@ -1062,12 +1091,6 @@ intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, struct irte irte; int err; - if (!config_enabled(CONFIG_SMP)) - return -EINVAL; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return -EINVAL; - if (get_irte(irq, &irte)) return -EBUSY; @@ -1100,6 +1123,7 @@ intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, send_cleanup_vector(cfg); cpumask_copy(data->affinity, mask); + return 0; } @@ -1205,6 +1229,53 @@ static int intel_alloc_hpet_msi(unsigned int irq, unsigned int id) return ret; } +static struct irq_domain *intel_get_ir_irq_domain(struct irq_alloc_info *info) +{ + struct intel_iommu *iommu = NULL; + + if (!info) + return NULL; + + switch (info->type) { + case X86_IRQ_ALLOC_TYPE_IOAPIC: + iommu = map_ioapic_to_ir(info->ioapic_id); + break; + case X86_IRQ_ALLOC_TYPE_HPET: + iommu = map_hpet_to_ir(info->hpet_id); + break; + case X86_IRQ_ALLOC_TYPE_MSI: + case X86_IRQ_ALLOC_TYPE_MSIX: + iommu = map_dev_to_ir(info->msi_dev); + break; + default: + BUG_ON(1); + break; + } + + return iommu ? iommu->ir_domain : NULL; +} + +static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info) +{ + struct intel_iommu *iommu; + + if (!info) + return NULL; + + switch (info->type) { + case X86_IRQ_ALLOC_TYPE_MSI: + case X86_IRQ_ALLOC_TYPE_MSIX: + iommu = map_dev_to_ir(info->msi_dev); + if (iommu) + return iommu->ir_msi_domain; + break; + default: + break; + } + + return NULL; +} + struct irq_remap_ops intel_irq_remap_ops = { .prepare = intel_prepare_irq_remapping, .enable = intel_enable_irq_remapping, @@ -1218,6 +1289,256 @@ struct irq_remap_ops intel_irq_remap_ops = { .msi_alloc_irq = intel_msi_alloc_irq, .msi_setup_irq = intel_msi_setup_irq, .alloc_hpet_msi = intel_alloc_hpet_msi, + .get_ir_irq_domain = intel_get_ir_irq_domain, + .get_irq_domain = intel_get_irq_domain, +}; + +/* + * Migrate the IO-APIC irq in the presence of intr-remapping. + * + * For both level and edge triggered, irq migration is a simple atomic + * update(of vector and cpu destination) of IRTE and flush the hardware cache. + * + * For level triggered, we eliminate the io-apic RTE modification (with the + * updated vector information), by using a virtual vector (io-apic pin number). + * Real vector that is used for interrupting cpu will be coming from + * the interrupt-remapping table entry. + * + * As the migration is a simple atomic update of IRTE, the same mechanism + * is used to migrate MSI irq's in the presence of interrupt-remapping. + */ +static int +intel_ir_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct intel_ir_data *ir_data = data->chip_data; + struct irte *irte = &ir_data->irte_entry; + struct irq_cfg *cfg = irqd_cfg(data); + struct irq_data *parent = data->parent_data; + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) + return ret; + + /* + * Atomically updates the IRTE with the new destination, vector + * and flushes the interrupt entry cache. + */ + irte->vector = cfg->vector; + irte->dest_id = IRTE_DEST(cfg->dest_apicid); + modify_irte(&ir_data->irq_2_iommu, irte); + + /* + * After this point, all the interrupts will start arriving + * at the new destination. So, time to cleanup the previous + * vector allocation. + */ + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + return IRQ_SET_MASK_OK_DONE; +} + +static void intel_ir_compose_msi_msg(struct irq_data *irq_data, + struct msi_msg *msg) +{ + struct intel_ir_data *ir_data = irq_data->chip_data; + + *msg = ir_data->msi_entry; +} + +static struct irq_chip intel_ir_chip = { + .irq_ack = ir_ack_apic_edge, + .irq_set_affinity = intel_ir_set_affinity, + .irq_compose_msi_msg = intel_ir_compose_msi_msg, +}; + +static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, + struct irq_cfg *irq_cfg, + struct irq_alloc_info *info, + int index, int sub_handle) +{ + struct IR_IO_APIC_route_entry *entry; + struct irte *irte = &data->irte_entry; + struct msi_msg *msg = &data->msi_entry; + + prepare_irte(irte, irq_cfg->vector, irq_cfg->dest_apicid); + switch (info->type) { + case X86_IRQ_ALLOC_TYPE_IOAPIC: + /* Set source-id of interrupt request */ + set_ioapic_sid(irte, info->ioapic_id); + apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n", + info->ioapic_id, irte->present, irte->fpd, + irte->dst_mode, irte->redir_hint, + irte->trigger_mode, irte->dlvry_mode, + irte->avail, irte->vector, irte->dest_id, + irte->sid, irte->sq, irte->svt); + + entry = (struct IR_IO_APIC_route_entry *)info->ioapic_entry; + info->ioapic_entry = NULL; + memset(entry, 0, sizeof(*entry)); + entry->index2 = (index >> 15) & 0x1; + entry->zero = 0; + entry->format = 1; + entry->index = (index & 0x7fff); + /* + * IO-APIC RTE will be configured with virtual vector. + * irq handler will do the explicit EOI to the io-apic. + */ + entry->vector = info->ioapic_pin; + entry->mask = 0; /* enable IRQ */ + entry->trigger = info->ioapic_trigger; + entry->polarity = info->ioapic_polarity; + if (info->ioapic_trigger) + entry->mask = 1; /* Mask level triggered irqs. */ + break; + + case X86_IRQ_ALLOC_TYPE_HPET: + case X86_IRQ_ALLOC_TYPE_MSI: + case X86_IRQ_ALLOC_TYPE_MSIX: + if (info->type == X86_IRQ_ALLOC_TYPE_HPET) + set_hpet_sid(irte, info->hpet_id); + else + set_msi_sid(irte, info->msi_dev); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->data = sub_handle; + msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | + MSI_ADDR_IR_SHV | + MSI_ADDR_IR_INDEX1(index) | + MSI_ADDR_IR_INDEX2(index); + break; + + default: + BUG_ON(1); + break; + } +} + +static void intel_free_irq_resources(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *irq_data; + struct intel_ir_data *data; + struct irq_2_iommu *irq_iommu; + unsigned long flags; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + if (irq_data && irq_data->chip_data) { + data = irq_data->chip_data; + irq_iommu = &data->irq_2_iommu; + raw_spin_lock_irqsave(&irq_2_ir_lock, flags); + clear_entries(irq_iommu); + raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); + irq_domain_reset_irq_data(irq_data); + kfree(data); + } + } +} + +static int intel_irq_remapping_alloc(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs, + void *arg) +{ + struct intel_iommu *iommu = domain->host_data; + struct irq_alloc_info *info = arg; + struct intel_ir_data *data; + struct irq_data *irq_data; + struct irq_cfg *irq_cfg; + int i, ret, index; + + if (!info || !iommu) + return -EINVAL; + if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_MSI && + info->type != X86_IRQ_ALLOC_TYPE_MSIX) + return -EINVAL; + + /* + * With IRQ remapping enabled, don't need contiguous CPU vectors + * to support multiple MSI interrupts. + */ + if (info->type == X86_IRQ_ALLOC_TYPE_MSI) + info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret < 0) + return ret; + + ret = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out_free_parent; + + down_read(&dmar_global_lock); + index = alloc_irte(iommu, virq, &data->irq_2_iommu, nr_irqs); + up_read(&dmar_global_lock); + if (index < 0) { + pr_warn("Failed to allocate IRTE\n"); + kfree(data); + goto out_free_parent; + } + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + irq_cfg = irqd_cfg(irq_data); + if (!irq_data || !irq_cfg) { + ret = -EINVAL; + goto out_free_data; + } + + if (i > 0) { + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out_free_data; + } + irq_data->hwirq = (index << 16) + i; + irq_data->chip_data = data; + irq_data->chip = &intel_ir_chip; + intel_irq_remapping_prepare_irte(data, irq_cfg, info, index, i); + irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT); + } + return 0; + +out_free_data: + intel_free_irq_resources(domain, virq, i); +out_free_parent: + irq_domain_free_irqs_common(domain, virq, nr_irqs); + return ret; +} + +static void intel_irq_remapping_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + intel_free_irq_resources(domain, virq, nr_irqs); + irq_domain_free_irqs_common(domain, virq, nr_irqs); +} + +static void intel_irq_remapping_activate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct intel_ir_data *data = irq_data->chip_data; + + modify_irte(&data->irq_2_iommu, &data->irte_entry); +} + +static void intel_irq_remapping_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct intel_ir_data *data = irq_data->chip_data; + struct irte entry; + + memset(&entry, 0, sizeof(entry)); + modify_irte(&data->irq_2_iommu, &entry); +} + +static struct irq_domain_ops intel_ir_domain_ops = { + .alloc = intel_irq_remapping_alloc, + .free = intel_irq_remapping_free, + .activate = intel_irq_remapping_activate, + .deactivate = intel_irq_remapping_deactivate, }; /* diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index a65208a8fe18..ecaf3a937845 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -286,6 +286,8 @@ struct q_inval { #define INTR_REMAP_TABLE_ENTRIES 65536 +struct irq_domain; + struct ir_table { struct irte *base; unsigned long *bitmap; @@ -335,6 +337,8 @@ struct intel_iommu { #ifdef CONFIG_IRQ_REMAP struct ir_table *ir_table; /* Interrupt remapping info */ + struct irq_domain *ir_domain; + struct irq_domain *ir_msi_domain; #endif struct device *iommu_dev; /* IOMMU-sysfs device */ int node; -- cgit v1.2.3 From 34742db8eaf9ff364034f214ee5827701e131d4b Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:41 +0800 Subject: iommu/vt-d: Refine the interfaces to create IRQ for DMAR unit Refine the interfaces to create IRQ for DMAR unit. It's a preparation for converting DMAR IRQ to hierarchical irqdomain on x86. It also moves dmar_alloc_hwirq()/dmar_free_hwirq() from irq_remapping.h to dmar.h. They are not irq_remapping specific. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Vinod Koul Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Tony Luck Cc: Fenghua Yu Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428905519-23704-20-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/ia64/include/asm/irq_remapping.h | 2 -- arch/ia64/kernel/msi_ia64.c | 30 +++++++++++++++++++----------- arch/x86/include/asm/irq_remapping.h | 4 ---- arch/x86/kernel/apic/msi.c | 24 +++++++++++++----------- drivers/iommu/dmar.c | 19 +++++-------------- include/linux/dmar.h | 3 ++- 6 files changed, 39 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/include/asm/irq_remapping.h b/arch/ia64/include/asm/irq_remapping.h index e3b3556e2e1b..a8687b1d8906 100644 --- a/arch/ia64/include/asm/irq_remapping.h +++ b/arch/ia64/include/asm/irq_remapping.h @@ -1,6 +1,4 @@ #ifndef __IA64_INTR_REMAPPING_H #define __IA64_INTR_REMAPPING_H #define irq_remapping_enabled 0 -#define dmar_alloc_hwirq create_irq -#define dmar_free_hwirq destroy_irq #endif diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c index 9dd7464f8c17..d70bf15c690a 100644 --- a/arch/ia64/kernel/msi_ia64.c +++ b/arch/ia64/kernel/msi_ia64.c @@ -165,7 +165,7 @@ static struct irq_chip dmar_msi_type = { .irq_retrigger = ia64_msi_retrigger_irq, }; -static int +static void msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) { struct irq_cfg *cfg = irq_cfg + irq; @@ -186,21 +186,29 @@ msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) MSI_DATA_LEVEL_ASSERT | MSI_DATA_DELIVERY_FIXED | MSI_DATA_VECTOR(cfg->vector); - return 0; } -int arch_setup_dmar_msi(unsigned int irq) +int dmar_alloc_hwirq(int id, int node, void *arg) { - int ret; + int irq; struct msi_msg msg; - ret = msi_compose_msg(NULL, irq, &msg); - if (ret < 0) - return ret; - dmar_msi_write(irq, &msg); - irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); - return 0; + irq = create_irq(); + if (irq > 0) { + irq_set_handler_data(irq, arg); + irq_set_chip_and_handler_name(irq, &dmar_msi_type, + handle_edge_irq, "edge"); + msi_compose_msg(NULL, irq, &msg); + dmar_msi_write(irq, &msg); + } + + return irq; +} + +void dmar_free_hwirq(int irq) +{ + irq_set_handler_data(irq, NULL); + destroy_irq(irq); } #endif /* CONFIG_INTEL_IOMMU */ diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index b978c68f5d29..bacac1052262 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -117,8 +117,4 @@ irq_remapping_get_irq_domain(struct irq_alloc_info *info) #define irq_remapping_print_chip NULL #endif /* CONFIG_IRQ_REMAP */ - -extern int dmar_alloc_hwirq(void); -extern void dmar_free_hwirq(int irq); - #endif /* __X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9fe7a08479fa..ca6250439acc 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -228,25 +228,27 @@ static struct irq_chip dmar_msi_type = { .flags = IRQCHIP_SKIP_SET_WAKE, }; -int arch_setup_dmar_msi(unsigned int irq) +int dmar_alloc_hwirq(int id, int node, void *arg) { + int irq; struct msi_msg msg; - struct irq_cfg *cfg = irq_cfg(irq); - native_compose_msi_msg(cfg, &msg); - dmar_msi_write(irq, &msg); - irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); - return 0; -} + irq = irq_domain_alloc_irqs(NULL, 1, node, NULL); + if (irq > 0) { + irq_set_handler_data(irq, arg); + irq_set_chip_and_handler_name(irq, &dmar_msi_type, + handle_edge_irq, "edge"); + native_compose_msi_msg(irq_cfg(irq), &msg); + dmar_msi_write(irq, &msg); + } -int dmar_alloc_hwirq(void) -{ - return irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); + return irq; } void dmar_free_hwirq(int irq) { + irq_set_handler_data(irq, NULL); + irq_set_handler(irq, NULL); irq_domain_free_irqs(irq, 1); } #endif diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 9847613085e1..536f2d8ea41a 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1087,8 +1087,8 @@ static void free_iommu(struct intel_iommu *iommu) if (iommu->irq) { free_irq(iommu->irq, iommu); - irq_set_handler_data(iommu->irq, NULL); dmar_free_hwirq(iommu->irq); + iommu->irq = 0; } if (iommu->qi) { @@ -1642,23 +1642,14 @@ int dmar_set_interrupt(struct intel_iommu *iommu) if (iommu->irq) return 0; - irq = dmar_alloc_hwirq(); - if (irq <= 0) { + irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu); + if (irq > 0) { + iommu->irq = irq; + } else { pr_err("IOMMU: no free vectors\n"); return -EINVAL; } - irq_set_handler_data(irq, iommu); - iommu->irq = irq; - - ret = arch_setup_dmar_msi(irq); - if (ret) { - irq_set_handler_data(irq, NULL); - iommu->irq = 0; - dmar_free_hwirq(irq); - return ret; - } - ret = request_irq(irq, dmar_fault, IRQF_NO_THREAD, iommu->name, iommu); if (ret) pr_err("IOMMU: can't request irq\n"); diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 30624954dec5..84737565c1fd 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -227,6 +227,7 @@ extern void dmar_msi_read(int irq, struct msi_msg *msg); extern void dmar_msi_write(int irq, struct msi_msg *msg); extern int dmar_set_interrupt(struct intel_iommu *iommu); extern irqreturn_t dmar_fault(int irq, void *dev_id); -extern int arch_setup_dmar_msi(unsigned int irq); +extern int dmar_alloc_hwirq(int id, int node, void *arg); +extern void dmar_free_hwirq(int irq); #endif /* __DMAR_H__ */ -- cgit v1.2.3 From 49e07d8f28c05347f237146a9ec66f6d958db83e Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:43 +0800 Subject: x86/htirq: Use hierarchical irqdomain to manage Hypertransport interrupts We have slightly changed the architecture interfaces to support htirq PCI driver. It's safe because currently Hypertransport interrupt is only enabled on x86 platforms. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-22-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 13 ++++ arch/x86/kernel/apic/htirq.c | 161 +++++++++++++++++++++++++++++++----------- arch/x86/kernel/apic/vector.c | 1 + drivers/pci/htirq.c | 47 ++---------- include/linux/htirq.h | 24 +++++-- 5 files changed, 158 insertions(+), 88 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index bf1725047c27..5e0b031d1a7d 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -160,6 +160,14 @@ struct irq_alloc_info { int dmar_id; void *dmar_data; }; +#endif +#ifdef CONFIG_HT_IRQ + struct { + int ht_pos; + int ht_idx; + struct pci_dev *ht_dev; + void *ht_update; + }; #endif }; }; @@ -227,6 +235,11 @@ extern void arch_init_msi_domain(struct irq_domain *domain); #else static inline void arch_init_msi_domain(struct irq_domain *domain) { } #endif +#ifdef CONFIG_HT_IRQ +extern void arch_init_htirq_domain(struct irq_domain *domain); +#else +static inline void arch_init_htirq_domain(struct irq_domain *domain) { } +#endif /* Statistics */ extern atomic_t irq_err_count; diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index b307ee7a7148..1cae104415ea 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. + * Jiang Liu + * Add support of hierarchical irqdomain * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,70 +21,104 @@ #include #include +static struct irq_domain *htirq_domain; + /* * Hypertransport interrupt support */ -static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - struct ht_irq_msg msg; - - fetch_ht_irq_msg(irq, &msg); - - msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - - msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - - write_ht_irq_msg(irq, &msg); -} - static int ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int dest; + struct irq_data *parent = data->parent_data; int ret; - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - target_ht_irq(data->irq, dest, cfg->vector); - return IRQ_SET_MASK_OK_NOCOPY; + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0) { + struct ht_irq_msg msg; + struct irq_cfg *cfg = irqd_cfg(data); + + fetch_ht_irq_msg(data->irq, &msg); + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | + HT_IRQ_LOW_DEST_ID_MASK); + msg.address_lo |= HT_IRQ_LOW_VECTOR(cfg->vector) | + HT_IRQ_LOW_DEST_ID(cfg->dest_apicid); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); + write_ht_irq_msg(data->irq, &msg); + } + + return ret; } static struct irq_chip ht_irq_chip = { .name = "PCI-HT", .irq_mask = mask_ht_irq, .irq_unmask = unmask_ht_irq, - .irq_ack = apic_ack_edge, + .irq_ack = irq_chip_ack_parent, .irq_set_affinity = ht_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_retrigger = irq_chip_retrigger_hierarchy, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int arch_alloc_ht_irq(struct pci_dev *dev) +static int htirq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { - return irq_domain_alloc_irqs(NULL, 1, dev_to_node(&dev->dev), NULL); + struct ht_irq_cfg *ht_cfg; + struct irq_alloc_info *info = arg; + struct pci_dev *dev; + irq_hw_number_t hwirq; + int ret; + + if (nr_irqs > 1 || !info) + return -EINVAL; + + dev = info->ht_dev; + hwirq = (info->ht_idx & 0xFF) | + PCI_DEVID(dev->bus->number, dev->devfn) << 8 | + (pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 24; + if (irq_find_mapping(domain, hwirq) > 0) + return -EEXIST; + + ht_cfg = kmalloc(sizeof(*ht_cfg), GFP_KERNEL); + if (!ht_cfg) + return -ENOMEM; + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); + if (ret < 0) { + kfree(ht_cfg); + return ret; + } + + /* Initialize msg to a value that will never match the first write. */ + ht_cfg->msg.address_lo = 0xffffffff; + ht_cfg->msg.address_hi = 0xffffffff; + ht_cfg->dev = info->ht_dev; + ht_cfg->update = info->ht_update; + ht_cfg->pos = info->ht_pos; + ht_cfg->idx = 0x10 + (info->ht_idx * 2); + irq_domain_set_info(domain, virq, hwirq, &ht_irq_chip, ht_cfg, + handle_edge_irq, ht_cfg, "edge"); + + return 0; } -void arch_free_ht_irq(int irq) +static void htirq_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) { - irq_domain_free_irqs(irq, 1); + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); + + BUG_ON(nr_irqs != 1); + kfree(irq_data->chip_data); + irq_domain_free_irqs_top(domain, virq, nr_irqs); } -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +static void htirq_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) { - struct irq_cfg *cfg; struct ht_irq_msg msg; + struct irq_cfg *cfg = irqd_cfg(irq_data); - if (disable_apic) - return -ENXIO; - - cfg = irq_cfg(irq); msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); - msg.address_lo = HT_IRQ_LOW_BASE | HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) | @@ -95,13 +131,56 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) HT_IRQ_LOW_MT_FIXED : HT_IRQ_LOW_MT_ARBITRATED) | HT_IRQ_LOW_IRQ_MASKED; + write_ht_irq_msg(irq_data->irq, &msg); +} - write_ht_irq_msg(irq, &msg); +static void htirq_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct ht_irq_msg msg; - irq_set_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); + memset(&msg, 0, sizeof(msg)); + write_ht_irq_msg(irq_data->irq, &msg); +} - dev_dbg(&dev->dev, "irq %d for HT\n", irq); +static struct irq_domain_ops htirq_domain_ops = { + .alloc = htirq_domain_alloc, + .free = htirq_domain_free, + .activate = htirq_domain_activate, + .deactivate = htirq_domain_deactivate, +}; - return 0; +void arch_init_htirq_domain(struct irq_domain *parent) +{ + if (disable_apic) + return; + + htirq_domain = irq_domain_add_tree(NULL, &htirq_domain_ops, NULL); + if (!htirq_domain) + pr_warn("failed to initialize irqdomain for HTIRQ.\n"); + else + htirq_domain->parent = parent; +} + +int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev, + ht_irq_update_t *update) +{ + struct irq_alloc_info info; + + if (!htirq_domain) + return -ENOSYS; + + init_irq_alloc_info(&info, NULL); + info.ht_idx = idx; + info.ht_pos = pos; + info.ht_dev = dev; + info.ht_update = update; + + return irq_domain_alloc_irqs(htirq_domain, 1, dev_to_node(&dev->dev), + &info); +} + +void arch_teardown_ht_irq(unsigned int irq) +{ + irq_domain_free_irqs(irq, 1); } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index a8d82896be75..b4b6b5a13440 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -365,6 +365,7 @@ int __init arch_early_irq_init(void) irq_set_default_host(x86_vector_domain); arch_init_msi_domain(x86_vector_domain); + arch_init_htirq_domain(x86_vector_domain); return arch_early_ioapic_init(); } diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c index ceb0ebeb7b5f..7eb4109a3df4 100644 --- a/drivers/pci/htirq.c +++ b/drivers/pci/htirq.c @@ -23,20 +23,11 @@ */ static DEFINE_SPINLOCK(ht_irq_lock); -struct ht_irq_cfg { - struct pci_dev *dev; - /* Update callback used to cope with buggy hardware */ - ht_irq_update_t *update; - unsigned pos; - unsigned idx; - struct ht_irq_msg msg; -}; - - void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg) { struct ht_irq_cfg *cfg = irq_get_handler_data(irq); unsigned long flags; + spin_lock_irqsave(&ht_irq_lock, flags); if (cfg->msg.address_lo != msg->address_lo) { pci_write_config_byte(cfg->dev, cfg->pos + 2, cfg->idx); @@ -55,6 +46,7 @@ void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg) void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg) { struct ht_irq_cfg *cfg = irq_get_handler_data(irq); + *msg = cfg->msg; } @@ -86,7 +78,6 @@ void unmask_ht_irq(struct irq_data *data) */ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) { - struct ht_irq_cfg *cfg; int max_irq, pos, irq; unsigned long flags; u32 data; @@ -105,29 +96,9 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) if (idx > max_irq) return -EINVAL; - cfg = kmalloc(sizeof(*cfg), GFP_KERNEL); - if (!cfg) - return -ENOMEM; - - cfg->dev = dev; - cfg->update = update; - cfg->pos = pos; - cfg->idx = 0x10 + (idx * 2); - /* Initialize msg to a value that will never match the first write. */ - cfg->msg.address_lo = 0xffffffff; - cfg->msg.address_hi = 0xffffffff; - - irq = arch_alloc_ht_irq(dev); - if (irq <= 0) { - kfree(cfg); - return -EBUSY; - } - irq_set_handler_data(irq, cfg); - - if (arch_setup_ht_irq(irq, dev) < 0) { - ht_destroy_irq(irq); - return -EBUSY; - } + irq = arch_setup_ht_irq(idx, pos, dev, update); + if (irq > 0) + dev_dbg(&dev->dev, "irq %d for HT\n", irq); return irq; } @@ -158,12 +129,6 @@ EXPORT_SYMBOL(ht_create_irq); */ void ht_destroy_irq(unsigned int irq) { - struct ht_irq_cfg *cfg; - - cfg = irq_get_handler_data(irq); - irq_set_chip(irq, NULL); - irq_set_handler_data(irq, NULL); - arch_free_ht_irq(irq); - kfree(cfg); + arch_teardown_ht_irq(irq); } EXPORT_SYMBOL(ht_destroy_irq); diff --git a/include/linux/htirq.h b/include/linux/htirq.h index 5caa51b7b95c..d4a527e58434 100644 --- a/include/linux/htirq.h +++ b/include/linux/htirq.h @@ -1,26 +1,38 @@ #ifndef LINUX_HTIRQ_H #define LINUX_HTIRQ_H +struct pci_dev; +struct irq_data; + struct ht_irq_msg { u32 address_lo; /* low 32 bits of the ht irq message */ u32 address_hi; /* high 32 bits of the it irq message */ }; +typedef void (ht_irq_update_t)(struct pci_dev *dev, int irq, + struct ht_irq_msg *msg); + +struct ht_irq_cfg { + struct pci_dev *dev; + /* Update callback used to cope with buggy hardware */ + ht_irq_update_t *update; + unsigned pos; + unsigned idx; + struct ht_irq_msg msg; +}; + /* Helper functions.. */ void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg); void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg); -struct irq_data; void mask_ht_irq(struct irq_data *data); void unmask_ht_irq(struct irq_data *data); /* The arch hook for getting things started */ -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev); -int arch_alloc_ht_irq(struct pci_dev *dev); -void arch_free_ht_irq(int irq); +int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev, + ht_irq_update_t *update); +void arch_teardown_ht_irq(unsigned int irq); /* For drivers of buggy hardware */ -typedef void (ht_irq_update_t)(struct pci_dev *dev, int irq, - struct ht_irq_msg *msg); int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update); #endif /* LINUX_HTIRQ_H */ -- cgit v1.2.3 From 591fc116f3302da915bb57d4474a61a5e8884cec Mon Sep 17 00:00:00 2001 From: "Ivan T. Ivanov" Date: Thu, 9 Apr 2015 11:34:22 +0300 Subject: usb: phy: msm: Use extcon framework for VBUS and ID detection On recent Qualcomm platforms VBUS and ID lines are not routed to USB PHY LINK controller. Use extcon framework to receive connect and disconnect ID and VBUS notification. Signed-off-by: Ivan T. Ivanov Signed-off-by: Felipe Balbi --- .../devicetree/bindings/usb/msm-hsusb.txt | 7 ++ drivers/usb/phy/Kconfig | 1 + drivers/usb/phy/phy-msm-usb.c | 84 ++++++++++++++++++++++ include/linux/usb/msm_hsusb.h | 17 +++++ 4 files changed, 109 insertions(+) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/usb/msm-hsusb.txt b/Documentation/devicetree/bindings/usb/msm-hsusb.txt index 2826f2af503a..f26bcfac3d8f 100644 --- a/Documentation/devicetree/bindings/usb/msm-hsusb.txt +++ b/Documentation/devicetree/bindings/usb/msm-hsusb.txt @@ -69,6 +69,13 @@ Optional properties: (no, min, max) where each value represents either a voltage in microvolts or a value corresponding to voltage corner. +- extcon: phandles to external connector devices. First phandle + should point to external connector, which provide "USB" + cable events, the second should point to external connector + device, which provide "USB-HOST" cable events. If one of + the external connector devices is not required empty <0> + phandle should be specified. + Example HSUSB OTG controller device node: usb@f9a55000 { diff --git a/drivers/usb/phy/Kconfig b/drivers/usb/phy/Kconfig index 2175678e674e..811f331892d7 100644 --- a/drivers/usb/phy/Kconfig +++ b/drivers/usb/phy/Kconfig @@ -141,6 +141,7 @@ config USB_MSM_OTG tristate "Qualcomm on-chip USB OTG controller support" depends on (USB || USB_GADGET) && (ARCH_QCOM || COMPILE_TEST) depends on RESET_CONTROLLER + depends on EXTCON select USB_PHY help Enable this to support the USB OTG transceiver on Qualcomm chips. It diff --git a/drivers/usb/phy/phy-msm-usb.c b/drivers/usb/phy/phy-msm-usb.c index c9156beeadef..ad66c67ce9a5 100644 --- a/drivers/usb/phy/phy-msm-usb.c +++ b/drivers/usb/phy/phy-msm-usb.c @@ -1436,9 +1436,42 @@ static const struct of_device_id msm_otg_dt_match[] = { }; MODULE_DEVICE_TABLE(of, msm_otg_dt_match); +static int msm_otg_vbus_notifier(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct msm_usb_cable *vbus = container_of(nb, struct msm_usb_cable, nb); + struct msm_otg *motg = container_of(vbus, struct msm_otg, vbus); + + if (event) + set_bit(B_SESS_VLD, &motg->inputs); + else + clear_bit(B_SESS_VLD, &motg->inputs); + + schedule_work(&motg->sm_work); + + return NOTIFY_DONE; +} + +static int msm_otg_id_notifier(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct msm_usb_cable *id = container_of(nb, struct msm_usb_cable, nb); + struct msm_otg *motg = container_of(id, struct msm_otg, id); + + if (event) + clear_bit(ID, &motg->inputs); + else + set_bit(ID, &motg->inputs); + + schedule_work(&motg->sm_work); + + return NOTIFY_DONE; +} + static int msm_otg_read_dt(struct platform_device *pdev, struct msm_otg *motg) { struct msm_otg_platform_data *pdata; + struct extcon_dev *ext_id, *ext_vbus; const struct of_device_id *id; struct device_node *node = pdev->dev.of_node; struct property *prop; @@ -1487,6 +1520,52 @@ static int msm_otg_read_dt(struct platform_device *pdev, struct msm_otg *motg) motg->vdd_levels[VDD_LEVEL_MAX] = tmp[VDD_LEVEL_MAX]; } + ext_id = ERR_PTR(-ENODEV); + ext_vbus = ERR_PTR(-ENODEV); + if (of_property_read_bool(node, "extcon")) { + + /* Each one of them is not mandatory */ + ext_vbus = extcon_get_edev_by_phandle(&pdev->dev, 0); + if (IS_ERR(ext_vbus) && PTR_ERR(ext_vbus) != -ENODEV) + return PTR_ERR(ext_vbus); + + ext_id = extcon_get_edev_by_phandle(&pdev->dev, 1); + if (IS_ERR(ext_id) && PTR_ERR(ext_id) != -ENODEV) + return PTR_ERR(ext_id); + } + + if (!IS_ERR(ext_vbus)) { + motg->vbus.nb.notifier_call = msm_otg_vbus_notifier; + ret = extcon_register_interest(&motg->vbus.conn, ext_vbus->name, + "USB", &motg->vbus.nb); + if (ret < 0) { + dev_err(&pdev->dev, "register VBUS notifier failed\n"); + return ret; + } + + ret = extcon_get_cable_state(ext_vbus, "USB"); + if (ret) + set_bit(B_SESS_VLD, &motg->inputs); + else + clear_bit(B_SESS_VLD, &motg->inputs); + } + + if (!IS_ERR(ext_id)) { + motg->id.nb.notifier_call = msm_otg_id_notifier; + ret = extcon_register_interest(&motg->id.conn, ext_id->name, + "USB-HOST", &motg->id.nb); + if (ret < 0) { + dev_err(&pdev->dev, "register ID notifier failed\n"); + return ret; + } + + ret = extcon_get_cable_state(ext_id, "USB-HOST"); + if (ret) + clear_bit(ID, &motg->inputs); + else + set_bit(ID, &motg->inputs); + } + prop = of_find_property(node, "qcom,phy-init-sequence", &len); if (!prop || !len) return 0; @@ -1700,6 +1779,11 @@ static int msm_otg_remove(struct platform_device *pdev) if (phy->otg->host || phy->otg->gadget) return -EBUSY; + if (motg->id.conn.edev) + extcon_unregister_interest(&motg->id.conn); + if (motg->vbus.conn.edev) + extcon_unregister_interest(&motg->vbus.conn); + msm_otg_debugfs_cleanup(); cancel_delayed_work_sync(&motg->chg_work); cancel_work_sync(&motg->sm_work); diff --git a/include/linux/usb/msm_hsusb.h b/include/linux/usb/msm_hsusb.h index 7dbecf9a4656..c4d956e50d09 100644 --- a/include/linux/usb/msm_hsusb.h +++ b/include/linux/usb/msm_hsusb.h @@ -18,6 +18,7 @@ #ifndef __ASM_ARCH_MSM_HSUSB_H #define __ASM_ARCH_MSM_HSUSB_H +#include #include #include #include @@ -119,6 +120,17 @@ struct msm_otg_platform_data { void (*setup_gpio)(enum usb_otg_state state); }; +/** + * struct msm_usb_cable - structure for exteternal connector cable + * state tracking + * @nb: hold event notification callback + * @conn: used for notification registration + */ +struct msm_usb_cable { + struct notifier_block nb; + struct extcon_specific_cable_nb conn; +}; + /** * struct msm_otg: OTG driver data. Shared by HCD and DCD. * @otg: USB OTG Transceiver structure. @@ -138,6 +150,8 @@ struct msm_otg_platform_data { * @chg_type: The type of charger attached. * @dcd_retires: The retry count used to track Data contact * detection process. + * @vbus: VBUS signal state trakining, using extcon framework + * @id: ID signal state trakining, using extcon framework */ struct msm_otg { struct usb_phy phy; @@ -166,6 +180,9 @@ struct msm_otg { struct reset_control *phy_rst; struct reset_control *link_rst; int vdd_levels[3]; + + struct msm_usb_cable vbus; + struct msm_usb_cable id; }; #endif -- cgit v1.2.3 From 44e42ae3a398b559c768b9b3c324d72b0b0b4479 Mon Sep 17 00:00:00 2001 From: "Ivan T. Ivanov" Date: Thu, 9 Apr 2015 11:34:33 +0300 Subject: usb: phy: msm: Manual PHY and LINK controller VBUS change notification VBUS is not routed to USB PHY on recent Qualcomm platforms. USB controller must see VBUS in order to pull-up DP when setting RS bit. Henc configure USB PHY and LINK registers sense VBUS and enable manual pullup on D+ line. Cc: Vamsi Krishna Cc: Mayank Rana Signed-off-by: Ivan T. Ivanov Signed-off-by: Felipe Balbi --- .../devicetree/bindings/usb/msm-hsusb.txt | 4 ++++ drivers/usb/phy/phy-msm-usb.c | 26 ++++++++++++++++++++++ include/linux/usb/msm_hsusb.h | 5 +++++ include/linux/usb/msm_hsusb_hw.h | 9 ++++++++ 4 files changed, 44 insertions(+) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/usb/msm-hsusb.txt b/Documentation/devicetree/bindings/usb/msm-hsusb.txt index f26bcfac3d8f..bd8d9e753029 100644 --- a/Documentation/devicetree/bindings/usb/msm-hsusb.txt +++ b/Documentation/devicetree/bindings/usb/msm-hsusb.txt @@ -69,6 +69,10 @@ Optional properties: (no, min, max) where each value represents either a voltage in microvolts or a value corresponding to voltage corner. +- qcom,manual-pullup: If present, vbus is not routed to USB controller/phy + and controller driver therefore enables pull-up explicitly + before starting controller using usbcmd run/stop bit. + - extcon: phandles to external connector devices. First phandle should point to external connector, which provide "USB" cable events, the second should point to external connector diff --git a/drivers/usb/phy/phy-msm-usb.c b/drivers/usb/phy/phy-msm-usb.c index ad66c67ce9a5..00c49bb1bd29 100644 --- a/drivers/usb/phy/phy-msm-usb.c +++ b/drivers/usb/phy/phy-msm-usb.c @@ -240,8 +240,14 @@ static void ulpi_init(struct msm_otg *motg) static int msm_phy_notify_disconnect(struct usb_phy *phy, enum usb_device_speed speed) { + struct msm_otg *motg = container_of(phy, struct msm_otg, phy); int val; + if (motg->manual_pullup) { + val = ULPI_MISC_A_VBUSVLDEXT | ULPI_MISC_A_VBUSVLDEXTSEL; + usb_phy_io_write(phy, val, ULPI_CLR(ULPI_MISC_A)); + } + /* * Put the transceiver in non-driving mode. Otherwise host * may not detect soft-disconnection. @@ -422,6 +428,24 @@ static int msm_phy_init(struct usb_phy *phy) ulpi_write(phy, ulpi_val, ULPI_USB_INT_EN_FALL); } + if (motg->manual_pullup) { + val = ULPI_MISC_A_VBUSVLDEXTSEL | ULPI_MISC_A_VBUSVLDEXT; + ulpi_write(phy, val, ULPI_SET(ULPI_MISC_A)); + + val = readl(USB_GENCONFIG_2); + val |= GENCONFIG_2_SESS_VLD_CTRL_EN; + writel(val, USB_GENCONFIG_2); + + val = readl(USB_USBCMD); + val |= USBCMD_SESS_VLD_CTRL; + writel(val, USB_USBCMD); + + val = ulpi_read(phy, ULPI_FUNC_CTRL); + val &= ~ULPI_FUNC_CTRL_OPMODE_MASK; + val |= ULPI_FUNC_CTRL_OPMODE_NORMAL; + ulpi_write(phy, val, ULPI_FUNC_CTRL); + } + if (motg->phy_number) writel(readl(USB_PHY_CTRL2) | BIT(16), USB_PHY_CTRL2); @@ -1520,6 +1544,8 @@ static int msm_otg_read_dt(struct platform_device *pdev, struct msm_otg *motg) motg->vdd_levels[VDD_LEVEL_MAX] = tmp[VDD_LEVEL_MAX]; } + motg->manual_pullup = of_property_read_bool(node, "qcom,manual-pullup"); + ext_id = ERR_PTR(-ENODEV); ext_vbus = ERR_PTR(-ENODEV); if (of_property_read_bool(node, "extcon")) { diff --git a/include/linux/usb/msm_hsusb.h b/include/linux/usb/msm_hsusb.h index c4d956e50d09..e55a1504266e 100644 --- a/include/linux/usb/msm_hsusb.h +++ b/include/linux/usb/msm_hsusb.h @@ -150,6 +150,9 @@ struct msm_usb_cable { * @chg_type: The type of charger attached. * @dcd_retires: The retry count used to track Data contact * detection process. + * @manual_pullup: true if VBUS is not routed to USB controller/phy + * and controller driver therefore enables pull-up explicitly before + * starting controller using usbcmd run/stop bit. * @vbus: VBUS signal state trakining, using extcon framework * @id: ID signal state trakining, using extcon framework */ @@ -181,6 +184,8 @@ struct msm_otg { struct reset_control *link_rst; int vdd_levels[3]; + bool manual_pullup; + struct msm_usb_cable vbus; struct msm_usb_cable id; }; diff --git a/include/linux/usb/msm_hsusb_hw.h b/include/linux/usb/msm_hsusb_hw.h index a29f6030afb1..e159b39f67a2 100644 --- a/include/linux/usb/msm_hsusb_hw.h +++ b/include/linux/usb/msm_hsusb_hw.h @@ -21,6 +21,8 @@ #define USB_AHBBURST (MSM_USB_BASE + 0x0090) #define USB_AHBMODE (MSM_USB_BASE + 0x0098) +#define USB_GENCONFIG_2 (MSM_USB_BASE + 0x00a0) + #define USB_CAPLENGTH (MSM_USB_BASE + 0x0100) /* 8 bit */ #define USB_USBCMD (MSM_USB_BASE + 0x0140) @@ -30,6 +32,9 @@ #define USB_PHY_CTRL (MSM_USB_BASE + 0x0240) #define USB_PHY_CTRL2 (MSM_USB_BASE + 0x0278) +#define GENCONFIG_2_SESS_VLD_CTRL_EN BIT(7) +#define USBCMD_SESS_VLD_CTRL BIT(25) + #define USBCMD_RESET 2 #define USB_USBINTR (MSM_USB_BASE + 0x0148) @@ -50,6 +55,10 @@ #define ULPI_PWR_CLK_MNG_REG 0x88 #define OTG_COMP_DISABLE BIT(0) +#define ULPI_MISC_A 0x96 +#define ULPI_MISC_A_VBUSVLDEXTSEL BIT(1) +#define ULPI_MISC_A_VBUSVLDEXT BIT(0) + #define ASYNC_INTR_CTRL (1 << 29) /* Enable async interrupt */ #define ULPI_STP_CTRL (1 << 30) /* Block communication with PHY */ #define PHY_RETEN (1 << 1) /* PHY retention enable/disable */ -- cgit v1.2.3 From b189a2117223edbe40e0a187ae5c606cbdd6447c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 28 Apr 2015 14:04:07 +0200 Subject: usb: phy: Remove the phy-rcar-gen2-usb driver The phy-rcar-gen2-usb driver, which supports legacy platform data only, is no longer used since commit a483dcbfa21f919c ("ARM: shmobile: lager: Remove legacy board support"). This driver was superseded by the DT-only phy-rcar-gen2 driver, which was introduced in commit 1233f59f745b237d ("phy: Renesas R-Car Gen2 PHY driver"). Signed-off-by: Geert Uytterhoeven Signed-off-by: Felipe Balbi --- drivers/usb/phy/Kconfig | 13 -- drivers/usb/phy/Makefile | 1 - drivers/usb/phy/phy-rcar-gen2-usb.c | 246 ------------------------ include/linux/platform_data/usb-rcar-gen2-phy.h | 22 --- 4 files changed, 282 deletions(-) delete mode 100644 drivers/usb/phy/phy-rcar-gen2-usb.c delete mode 100644 include/linux/platform_data/usb-rcar-gen2-phy.h (limited to 'include/linux') diff --git a/drivers/usb/phy/Kconfig b/drivers/usb/phy/Kconfig index 811f331892d7..173a5b5d8bc1 100644 --- a/drivers/usb/phy/Kconfig +++ b/drivers/usb/phy/Kconfig @@ -187,19 +187,6 @@ config USB_RCAR_PHY To compile this driver as a module, choose M here: the module will be called phy-rcar-usb. -config USB_RCAR_GEN2_PHY - tristate "Renesas R-Car Gen2 USB PHY support" - depends on ARCH_R8A7790 || ARCH_R8A7791 || COMPILE_TEST - select USB_PHY - help - Say Y here to add support for the Renesas R-Car Gen2 USB PHY driver. - It is typically used to control internal USB PHY for USBHS, - and to configure shared USB channels 0 and 2. - This driver supports R8A7790 and R8A7791. - - To compile this driver as a module, choose M here: the - module will be called phy-rcar-gen2-usb. - config USB_ULPI bool "Generic ULPI Transceiver Driver" depends on ARM || ARM64 diff --git a/drivers/usb/phy/Makefile b/drivers/usb/phy/Makefile index 75f2bba58c84..e36ab1d46d8b 100644 --- a/drivers/usb/phy/Makefile +++ b/drivers/usb/phy/Makefile @@ -23,7 +23,6 @@ obj-$(CONFIG_USB_MSM_OTG) += phy-msm-usb.o obj-$(CONFIG_USB_MV_OTG) += phy-mv-usb.o obj-$(CONFIG_USB_MXS_PHY) += phy-mxs-usb.o obj-$(CONFIG_USB_RCAR_PHY) += phy-rcar-usb.o -obj-$(CONFIG_USB_RCAR_GEN2_PHY) += phy-rcar-gen2-usb.o obj-$(CONFIG_USB_ULPI) += phy-ulpi.o obj-$(CONFIG_USB_ULPI_VIEWPORT) += phy-ulpi-viewport.o obj-$(CONFIG_KEYSTONE_USB_PHY) += phy-keystone.o diff --git a/drivers/usb/phy/phy-rcar-gen2-usb.c b/drivers/usb/phy/phy-rcar-gen2-usb.c deleted file mode 100644 index f81800b6562a..000000000000 --- a/drivers/usb/phy/phy-rcar-gen2-usb.c +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Renesas R-Car Gen2 USB phy driver - * - * Copyright (C) 2013 Renesas Solutions Corp. - * Copyright (C) 2013 Cogent Embedded, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -struct rcar_gen2_usb_phy_priv { - struct usb_phy phy; - void __iomem *base; - struct clk *clk; - spinlock_t lock; - int usecount; - u32 ugctrl2; -}; - -#define usb_phy_to_priv(p) container_of(p, struct rcar_gen2_usb_phy_priv, phy) - -/* Low Power Status register */ -#define USBHS_LPSTS_REG 0x02 -#define USBHS_LPSTS_SUSPM (1 << 14) - -/* USB General control register */ -#define USBHS_UGCTRL_REG 0x80 -#define USBHS_UGCTRL_CONNECT (1 << 2) -#define USBHS_UGCTRL_PLLRESET (1 << 0) - -/* USB General control register 2 */ -#define USBHS_UGCTRL2_REG 0x84 -#define USBHS_UGCTRL2_USB0_PCI (1 << 4) -#define USBHS_UGCTRL2_USB0_HS (3 << 4) -#define USBHS_UGCTRL2_USB2_PCI (0 << 31) -#define USBHS_UGCTRL2_USB2_SS (1 << 31) - -/* USB General status register */ -#define USBHS_UGSTS_REG 0x88 -#define USBHS_UGSTS_LOCK (1 << 8) - -/* Enable USBHS internal phy */ -static int __rcar_gen2_usbhs_phy_enable(void __iomem *base) -{ - u32 val; - int i; - - /* USBHS PHY power on */ - val = ioread32(base + USBHS_UGCTRL_REG); - val &= ~USBHS_UGCTRL_PLLRESET; - iowrite32(val, base + USBHS_UGCTRL_REG); - - val = ioread16(base + USBHS_LPSTS_REG); - val |= USBHS_LPSTS_SUSPM; - iowrite16(val, base + USBHS_LPSTS_REG); - - for (i = 0; i < 20; i++) { - val = ioread32(base + USBHS_UGSTS_REG); - if ((val & USBHS_UGSTS_LOCK) == USBHS_UGSTS_LOCK) { - val = ioread32(base + USBHS_UGCTRL_REG); - val |= USBHS_UGCTRL_CONNECT; - iowrite32(val, base + USBHS_UGCTRL_REG); - return 0; - } - udelay(1); - } - - /* Timed out waiting for the PLL lock */ - return -ETIMEDOUT; -} - -/* Disable USBHS internal phy */ -static int __rcar_gen2_usbhs_phy_disable(void __iomem *base) -{ - u32 val; - - /* USBHS PHY power off */ - val = ioread32(base + USBHS_UGCTRL_REG); - val &= ~USBHS_UGCTRL_CONNECT; - iowrite32(val, base + USBHS_UGCTRL_REG); - - val = ioread16(base + USBHS_LPSTS_REG); - val &= ~USBHS_LPSTS_SUSPM; - iowrite16(val, base + USBHS_LPSTS_REG); - - val = ioread32(base + USBHS_UGCTRL_REG); - val |= USBHS_UGCTRL_PLLRESET; - iowrite32(val, base + USBHS_UGCTRL_REG); - return 0; -} - -/* Setup USB channels */ -static void __rcar_gen2_usb_phy_init(struct rcar_gen2_usb_phy_priv *priv) -{ - u32 val; - - clk_prepare_enable(priv->clk); - - /* Set USB channels in the USBHS UGCTRL2 register */ - val = ioread32(priv->base + USBHS_UGCTRL2_REG); - val &= ~(USBHS_UGCTRL2_USB0_HS | USBHS_UGCTRL2_USB2_SS); - val |= priv->ugctrl2; - iowrite32(val, priv->base + USBHS_UGCTRL2_REG); -} - -/* Shutdown USB channels */ -static void __rcar_gen2_usb_phy_shutdown(struct rcar_gen2_usb_phy_priv *priv) -{ - __rcar_gen2_usbhs_phy_disable(priv->base); - clk_disable_unprepare(priv->clk); -} - -static int rcar_gen2_usb_phy_set_suspend(struct usb_phy *phy, int suspend) -{ - struct rcar_gen2_usb_phy_priv *priv = usb_phy_to_priv(phy); - unsigned long flags; - int retval; - - spin_lock_irqsave(&priv->lock, flags); - retval = suspend ? __rcar_gen2_usbhs_phy_disable(priv->base) : - __rcar_gen2_usbhs_phy_enable(priv->base); - spin_unlock_irqrestore(&priv->lock, flags); - return retval; -} - -static int rcar_gen2_usb_phy_init(struct usb_phy *phy) -{ - struct rcar_gen2_usb_phy_priv *priv = usb_phy_to_priv(phy); - unsigned long flags; - - spin_lock_irqsave(&priv->lock, flags); - /* - * Enable the clock and setup USB channels - * if it's the first user - */ - if (!priv->usecount++) - __rcar_gen2_usb_phy_init(priv); - spin_unlock_irqrestore(&priv->lock, flags); - return 0; -} - -static void rcar_gen2_usb_phy_shutdown(struct usb_phy *phy) -{ - struct rcar_gen2_usb_phy_priv *priv = usb_phy_to_priv(phy); - unsigned long flags; - - spin_lock_irqsave(&priv->lock, flags); - if (!priv->usecount) { - dev_warn(phy->dev, "Trying to disable phy with 0 usecount\n"); - goto out; - } - - /* Disable everything if it's the last user */ - if (!--priv->usecount) - __rcar_gen2_usb_phy_shutdown(priv); -out: - spin_unlock_irqrestore(&priv->lock, flags); -} - -static int rcar_gen2_usb_phy_probe(struct platform_device *pdev) -{ - struct device *dev = &pdev->dev; - struct rcar_gen2_phy_platform_data *pdata; - struct rcar_gen2_usb_phy_priv *priv; - struct resource *res; - void __iomem *base; - struct clk *clk; - int retval; - - pdata = dev_get_platdata(dev); - if (!pdata) { - dev_err(dev, "No platform data\n"); - return -EINVAL; - } - - clk = devm_clk_get(dev, "usbhs"); - if (IS_ERR(clk)) { - dev_err(dev, "Can't get the clock\n"); - return PTR_ERR(clk); - } - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - base = devm_ioremap_resource(dev, res); - if (IS_ERR(base)) - return PTR_ERR(base); - - priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); - if (!priv) - return -ENOMEM; - - spin_lock_init(&priv->lock); - priv->clk = clk; - priv->base = base; - priv->ugctrl2 = pdata->chan0_pci ? - USBHS_UGCTRL2_USB0_PCI : USBHS_UGCTRL2_USB0_HS; - priv->ugctrl2 |= pdata->chan2_pci ? - USBHS_UGCTRL2_USB2_PCI : USBHS_UGCTRL2_USB2_SS; - priv->phy.dev = dev; - priv->phy.label = dev_name(dev); - priv->phy.init = rcar_gen2_usb_phy_init; - priv->phy.shutdown = rcar_gen2_usb_phy_shutdown; - priv->phy.set_suspend = rcar_gen2_usb_phy_set_suspend; - - retval = usb_add_phy_dev(&priv->phy); - if (retval < 0) { - dev_err(dev, "Failed to add USB phy\n"); - return retval; - } - - platform_set_drvdata(pdev, priv); - - return retval; -} - -static int rcar_gen2_usb_phy_remove(struct platform_device *pdev) -{ - struct rcar_gen2_usb_phy_priv *priv = platform_get_drvdata(pdev); - - usb_remove_phy(&priv->phy); - - return 0; -} - -static struct platform_driver rcar_gen2_usb_phy_driver = { - .driver = { - .name = "usb_phy_rcar_gen2", - }, - .probe = rcar_gen2_usb_phy_probe, - .remove = rcar_gen2_usb_phy_remove, -}; - -module_platform_driver(rcar_gen2_usb_phy_driver); - -MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("Renesas R-Car Gen2 USB phy"); -MODULE_AUTHOR("Valentine Barshak "); diff --git a/include/linux/platform_data/usb-rcar-gen2-phy.h b/include/linux/platform_data/usb-rcar-gen2-phy.h deleted file mode 100644 index dd3ba46c0d90..000000000000 --- a/include/linux/platform_data/usb-rcar-gen2-phy.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (C) 2013 Renesas Solutions Corp. - * Copyright (C) 2013 Cogent Embedded, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef __USB_RCAR_GEN2_PHY_H -#define __USB_RCAR_GEN2_PHY_H - -#include - -struct rcar_gen2_phy_platform_data { - /* USB channel 0 configuration */ - bool chan0_pci:1; /* true: PCI USB host 0, false: USBHS */ - /* USB channel 2 configuration */ - bool chan2_pci:1; /* true: PCI USB host 2, false: USBSS */ -}; - -#endif -- cgit v1.2.3 From f1ec7187167ce225d2744b20a90afef5f10fd6cd Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 29 Apr 2015 16:02:30 -0500 Subject: libfdt: add fdt type definitions In preparation for libfdt/dtc update, add the new fdt specific types. Signed-off-by: Rob Herring Cc: Russell King Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: linux-arm-kernel@lists.infradead.org Cc: linuxppc-dev@lists.ozlabs.org --- arch/arm/boot/compressed/libfdt_env.h | 4 ++++ arch/powerpc/boot/libfdt_env.h | 4 ++++ arch/powerpc/boot/of.h | 2 ++ include/linux/libfdt_env.h | 4 ++++ 4 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/arch/arm/boot/compressed/libfdt_env.h b/arch/arm/boot/compressed/libfdt_env.h index 1f4e71876b00..17ae0f3efac8 100644 --- a/arch/arm/boot/compressed/libfdt_env.h +++ b/arch/arm/boot/compressed/libfdt_env.h @@ -5,6 +5,10 @@ #include #include +typedef __be16 fdt16_t; +typedef __be32 fdt32_t; +typedef __be64 fdt64_t; + #define fdt16_to_cpu(x) be16_to_cpu(x) #define cpu_to_fdt16(x) cpu_to_be16(x) #define fdt32_to_cpu(x) be32_to_cpu(x) diff --git a/arch/powerpc/boot/libfdt_env.h b/arch/powerpc/boot/libfdt_env.h index 8dcd744e5728..7e3789ea396b 100644 --- a/arch/powerpc/boot/libfdt_env.h +++ b/arch/powerpc/boot/libfdt_env.h @@ -10,6 +10,10 @@ typedef u32 uint32_t; typedef u64 uint64_t; typedef unsigned long uintptr_t; +typedef __be16 fdt16_t; +typedef __be32 fdt32_t; +typedef __be64 fdt64_t; + #define fdt16_to_cpu(x) be16_to_cpu(x) #define cpu_to_fdt16(x) cpu_to_be16(x) #define fdt32_to_cpu(x) be32_to_cpu(x) diff --git a/arch/powerpc/boot/of.h b/arch/powerpc/boot/of.h index 5603320dce07..53f8f27f94e4 100644 --- a/arch/powerpc/boot/of.h +++ b/arch/powerpc/boot/of.h @@ -21,7 +21,9 @@ int of_setprop(const void *phandle, const char *name, const void *buf, /* Console functions */ void of_console_init(void); +typedef u16 __be16; typedef u32 __be32; +typedef u64 __be64; #ifdef __LITTLE_ENDIAN__ #define cpu_to_be16(x) swab16(x) diff --git a/include/linux/libfdt_env.h b/include/linux/libfdt_env.h index 01508c7b8c81..2a663c6bb428 100644 --- a/include/linux/libfdt_env.h +++ b/include/linux/libfdt_env.h @@ -5,6 +5,10 @@ #include +typedef __be16 fdt16_t; +typedef __be32 fdt32_t; +typedef __be64 fdt64_t; + #define fdt32_to_cpu(x) be32_to_cpu(x) #define cpu_to_fdt32(x) cpu_to_be32(x) #define fdt64_to_cpu(x) be64_to_cpu(x) -- cgit v1.2.3 From 042f7df15a4fff8eec42873f755aea848dcdedd1 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 30 Apr 2015 17:16:12 +0800 Subject: workqueue: Allow modifying low level unbound workqueue cpumask Allow to modify the low-level unbound workqueues cpumask through sysfs. This is performed by traversing the entire workqueue list and calling apply_wqattrs_prepare() on the unbound workqueues with the new low level mask. Only after all the preparation are done, we commit them all together. Ordered workqueues are ignored from the low level unbound workqueue cpumask, it will be handled in near future. All the (default & per-node) pwqs are mandatorily controlled by the low level cpumask. If the user configured cpumask doesn't overlap with the low level cpumask, the low level cpumask will be used for the wq instead. The comment of wq_calc_node_cpumask() is updated and explicitly requires that its first argument should be the attrs of the default pwq. The default wq_unbound_cpumask is cpu_possible_mask. The workqueue subsystem doesn't know its best default value, let the system manager or the other subsystem set it when needed. Changed from V8: merge the calculating code for the attrs of the default pwq together. minor change the code&comments for saving the user configured attrs. remove unnecessary list_del(). minor update the comment of wq_calc_node_cpumask(). update the comment of workqueue_set_unbound_cpumask(); Cc: Christoph Lameter Cc: Kevin Hilman Cc: Lai Jiangshan Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Tejun Heo Cc: Viresh Kumar Cc: Frederic Weisbecker Original-patch-by: Frederic Weisbecker Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 1 + kernel/workqueue.c | 127 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 119 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index deee212af8e0..4618dd672d1b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -424,6 +424,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask); void free_workqueue_attrs(struct workqueue_attrs *attrs); int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs); +int workqueue_set_unbound_cpumask(cpumask_var_t cpumask); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9be75e2a4da6..a3915abc1983 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -299,7 +299,7 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ -static cpumask_var_t wq_unbound_cpumask; +static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */ /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], @@ -3429,7 +3429,7 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, /** * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node - * @attrs: the wq_attrs of interest + * @attrs: the wq_attrs of the default pwq of the target workqueue * @node: the target NUMA node * @cpu_going_down: if >= 0, the CPU to consider as offline * @cpumask: outarg, the resulting cpumask @@ -3493,6 +3493,7 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, struct apply_wqattrs_ctx { struct workqueue_struct *wq; /* target workqueue */ struct workqueue_attrs *attrs; /* attrs to apply */ + struct list_head list; /* queued for batching commit */ struct pool_workqueue *dfl_pwq; struct pool_workqueue *pwq_tbl[]; }; @@ -3532,9 +3533,15 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, if (!ctx || !new_attrs || !tmp_attrs) goto out_free; - /* make a copy of @attrs and sanitize it */ + /* + * Calculate the attrs of the default pwq. + * If the user configured cpumask doesn't overlap with the + * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask. + */ copy_workqueue_attrs(new_attrs, attrs); cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask); + if (unlikely(cpumask_empty(new_attrs->cpumask))) + cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask); /* * We may create multiple pwqs with differing cpumasks. Make a @@ -3553,7 +3560,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, goto out_free; for_each_node(node) { - if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) { + if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) { ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs); if (!ctx->pwq_tbl[node]) goto out_free; @@ -3563,7 +3570,11 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, } } + /* save the user configured attrs and sanitize it. */ + copy_workqueue_attrs(new_attrs, attrs); + cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); ctx->attrs = new_attrs; + ctx->wq = wq; free_workqueue_attrs(tmp_attrs); return ctx; @@ -3704,11 +3715,11 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, /* * Let's determine what needs to be done. If the target cpumask is - * different from wq's, we need to compare it to @pwq's and create - * a new one if they don't match. If the target cpumask equals - * wq's, the default pwq should be used. + * different from the default pwq's, we need to compare it to @pwq's + * and create a new one if they don't match. If the target cpumask + * equals the default pwq's, the default pwq should be used. */ - if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { + if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) { if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) goto out_unlock; } else { @@ -4731,6 +4742,84 @@ out_unlock: } #endif /* CONFIG_FREEZER */ +static int workqueue_apply_unbound_cpumask(void) +{ + LIST_HEAD(ctxs); + int ret = 0; + struct workqueue_struct *wq; + struct apply_wqattrs_ctx *ctx, *n; + + lockdep_assert_held(&wq_pool_mutex); + + list_for_each_entry(wq, &workqueues, list) { + if (!(wq->flags & WQ_UNBOUND)) + continue; + /* creating multiple pwqs breaks ordering guarantee */ + if (wq->flags & __WQ_ORDERED) + continue; + + ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs); + if (!ctx) { + ret = -ENOMEM; + break; + } + + list_add_tail(&ctx->list, &ctxs); + } + + list_for_each_entry_safe(ctx, n, &ctxs, list) { + if (!ret) + apply_wqattrs_commit(ctx); + apply_wqattrs_cleanup(ctx); + } + + return ret; +} + +/** + * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask + * @cpumask: the cpumask to set + * + * The low-level workqueues cpumask is a global cpumask that limits + * the affinity of all unbound workqueues. This function check the @cpumask + * and apply it to all unbound workqueues and updates all pwqs of them. + * + * Retun: 0 - Success + * -EINVAL - Invalid @cpumask + * -ENOMEM - Failed to allocate memory for attrs or pwqs. + */ +int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) +{ + int ret = -EINVAL; + cpumask_var_t saved_cpumask; + + if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) + return -ENOMEM; + + get_online_cpus(); + cpumask_and(cpumask, cpumask, cpu_possible_mask); + if (!cpumask_empty(cpumask)) { + mutex_lock(&wq_pool_mutex); + + /* save the old wq_unbound_cpumask. */ + cpumask_copy(saved_cpumask, wq_unbound_cpumask); + + /* update wq_unbound_cpumask at first and apply it to wqs. */ + cpumask_copy(wq_unbound_cpumask, cpumask); + ret = workqueue_apply_unbound_cpumask(); + + /* restore the wq_unbound_cpumask when failed. */ + if (ret < 0) + cpumask_copy(wq_unbound_cpumask, saved_cpumask); + + mutex_unlock(&wq_pool_mutex); + } + put_online_cpus(); + + free_cpumask_var(saved_cpumask); + return ret; +} + #ifdef CONFIG_SYSFS /* * Workqueues with WQ_SYSFS flag set is visible to userland via @@ -4952,14 +5041,34 @@ static ssize_t wq_unbound_cpumask_show(struct device *dev, { int written; + mutex_lock(&wq_pool_mutex); written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(wq_unbound_cpumask)); + mutex_unlock(&wq_pool_mutex); return written; } +static ssize_t wq_unbound_cpumask_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + cpumask_var_t cpumask; + int ret; + + if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + ret = cpumask_parse(buf, cpumask); + if (!ret) + ret = workqueue_set_unbound_cpumask(cpumask); + + free_cpumask_var(cpumask); + return ret ? ret : count; +} + static struct device_attribute wq_sysfs_cpumask_attr = - __ATTR(cpumask, 0444, wq_unbound_cpumask_show, NULL); + __ATTR(cpumask, 0644, wq_unbound_cpumask_show, + wq_unbound_cpumask_store); static int __init wq_sysfs_init(void) { -- cgit v1.2.3 From 0bb549052d33f8992544764a6cf1299d06ba7e2f Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Tue, 28 Apr 2015 18:44:31 -0400 Subject: efi: Add esrt support Add sysfs files for the EFI System Resource Table (ESRT) under /sys/firmware/efi/esrt and for each EFI System Resource Entry under entries/ as a subdir. The EFI System Resource Table (ESRT) provides a read-only catalog of system components for which the system accepts firmware upgrades via UEFI's "Capsule Update" feature. This module allows userland utilities to evaluate what firmware updates can be applied to this system, and potentially arrange for those updates to occur. The ESRT is described as part of the UEFI specification, in version 2.5 which should be available from http://uefi.org/specifications in early 2015. If you're a member of the UEFI Forum, information about its addition to the standard is available as UEFI Mantis 1090. For some hardware platforms, additional restrictions may be found at http://msdn.microsoft.com/en-us/library/windows/hardware/jj128256.aspx , and additional documentation may be found at http://download.microsoft.com/download/5/F/5/5F5D16CD-2530-4289-8019-94C6A20BED3C/windows-uefi-firmware-update-platform.docx . Signed-off-by: Peter Jones Signed-off-by: Matt Fleming --- Documentation/ABI/testing/sysfs-firmware-efi-esrt | 81 ++++ arch/x86/platform/efi/efi.c | 2 + drivers/firmware/efi/Makefile | 2 +- drivers/firmware/efi/efi.c | 82 +++- drivers/firmware/efi/esrt.c | 464 ++++++++++++++++++++++ include/linux/efi.h | 8 + 6 files changed, 637 insertions(+), 2 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-firmware-efi-esrt create mode 100644 drivers/firmware/efi/esrt.c (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-firmware-efi-esrt b/Documentation/ABI/testing/sysfs-firmware-efi-esrt new file mode 100644 index 000000000000..6e431d1a4e79 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-efi-esrt @@ -0,0 +1,81 @@ +What: /sys/firmware/efi/esrt/ +Date: February 2015 +Contact: Peter Jones +Description: Provides userland access to read the EFI System Resource Table + (ESRT), a catalog of firmware for which can be updated with + the UEFI UpdateCapsule mechanism described in section 7.5 of + the UEFI Standard. +Users: fwupdate - https://github.com/rhinstaller/fwupdate + +What: /sys/firmware/efi/esrt/fw_resource_count +Date: February 2015 +Contact: Peter Jones +Description: The number of entries in the ESRT + +What: /sys/firmware/efi/esrt/fw_resource_count_max +Date: February 2015 +Contact: Peter Jones +Description: The maximum number of entries that /could/ be registered + in the allocation the table is currently in. This is + really only useful to the system firmware itself. + +What: /sys/firmware/efi/esrt/fw_resource_version +Date: February 2015 +Contact: Peter Jones +Description: The version of the ESRT structure provided by the firmware. + +What: /sys/firmware/efi/esrt/entries/entry$N/ +Date: February 2015 +Contact: Peter Jones +Description: Each ESRT entry is identified by a GUID, and each gets a + subdirectory under entries/ . + example: /sys/firmware/efi/esrt/entries/entry0/ + +What: /sys/firmware/efi/esrt/entries/entry$N/fw_type +Date: February 2015 +Contact: Peter Jones +Description: What kind of firmware entry this is: + 0 - Unknown + 1 - System Firmware + 2 - Device Firmware + 3 - UEFI Driver + +What: /sys/firmware/efi/esrt/entries/entry$N/fw_class +Date: February 2015 +Contact: Peter Jones +Description: This is the entry's guid, and will match the directory name. + +What: /sys/firmware/efi/esrt/entries/entry$N/fw_version +Date: February 2015 +Contact: Peter Jones +Description: The version of the firmware currently installed. This is a + 32-bit unsigned integer. + +What: /sys/firmware/efi/esrt/entries/entry$N/lowest_supported_fw_version +Date: February 2015 +Contact: Peter Jones +Description: The lowest version of the firmware that can be installed. + +What: /sys/firmware/efi/esrt/entries/entry$N/capsule_flags +Date: February 2015 +Contact: Peter Jones +Description: Flags that must be passed to UpdateCapsule() + +What: /sys/firmware/efi/esrt/entries/entry$N/last_attempt_version +Date: February 2015 +Contact: Peter Jones +Description: The last firmware version for which an update was attempted. + +What: /sys/firmware/efi/esrt/entries/entry$N/last_attempt_status +Date: February 2015 +Contact: Peter Jones +Description: The result of the last firmware update attempt for the + firmware resource entry. + 0 - Success + 1 - Insufficient resources + 2 - Incorrect version + 3 - Invalid format + 4 - Authentication error + 5 - AC power event + 6 - Battery power event + diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 02744df576d5..3b984c3aa1b0 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -501,6 +501,8 @@ void __init efi_init(void) if (efi_enabled(EFI_DBG)) print_efi_memmap(); + + efi_esrt_init(); } void __init efi_late_init(void) diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index d8be608a9f3b..26eabbc55341 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -1,7 +1,7 @@ # # Makefile for linux kernel # -obj-$(CONFIG_EFI) += efi.o vars.o reboot.o +obj-$(CONFIG_EFI) += efi.o esrt.o vars.o reboot.o obj-$(CONFIG_EFI_VARS) += efivars.o obj-$(CONFIG_EFI_VARS_PSTORE) += efi-pstore.o obj-$(CONFIG_UEFI_CPER) += cper.o diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 3061bb8629dc..48b4c356740f 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -39,6 +39,7 @@ struct efi __read_mostly efi = { .fw_vendor = EFI_INVALID_TABLE_ADDR, .runtime = EFI_INVALID_TABLE_ADDR, .config_table = EFI_INVALID_TABLE_ADDR, + .esrt = EFI_INVALID_TABLE_ADDR, }; EXPORT_SYMBOL(efi); @@ -64,7 +65,7 @@ static int __init parse_efi_cmdline(char *str) } early_param("efi", parse_efi_cmdline); -static struct kobject *efi_kobj; +struct kobject *efi_kobj; static struct kobject *efivars_kobj; /* @@ -232,6 +233,84 @@ err_put: subsys_initcall(efisubsys_init); +/* + * Find the efi memory descriptor for a given physical address. Given a + * physicall address, determine if it exists within an EFI Memory Map entry, + * and if so, populate the supplied memory descriptor with the appropriate + * data. + */ +int __init efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md) +{ + struct efi_memory_map *map = efi.memmap; + void *p, *e; + + if (!efi_enabled(EFI_MEMMAP)) { + pr_err_once("EFI_MEMMAP is not enabled.\n"); + return -EINVAL; + } + + if (!map) { + pr_err_once("efi.memmap is not set.\n"); + return -EINVAL; + } + if (!out_md) { + pr_err_once("out_md is null.\n"); + return -EINVAL; + } + if (WARN_ON_ONCE(!map->phys_map)) + return -EINVAL; + if (WARN_ON_ONCE(map->nr_map == 0) || WARN_ON_ONCE(map->desc_size == 0)) + return -EINVAL; + + e = map->phys_map + map->nr_map * map->desc_size; + for (p = map->phys_map; p < e; p += map->desc_size) { + efi_memory_desc_t *md; + u64 size; + u64 end; + + /* + * If a driver calls this after efi_free_boot_services, + * ->map will be NULL, and the target may also not be mapped. + * So just always get our own virtual map on the CPU. + * + */ + md = early_memremap((phys_addr_t)p, sizeof (*md)); + if (!md) { + pr_err_once("early_memremap(%p, %zu) failed.\n", + p, sizeof (*md)); + return -ENOMEM; + } + + if (!(md->attribute & EFI_MEMORY_RUNTIME) && + md->type != EFI_BOOT_SERVICES_DATA && + md->type != EFI_RUNTIME_SERVICES_DATA) { + early_memunmap(md, sizeof (*md)); + continue; + } + + size = md->num_pages << EFI_PAGE_SHIFT; + end = md->phys_addr + size; + if (phys_addr >= md->phys_addr && phys_addr < end) { + memcpy(out_md, md, sizeof(*out_md)); + early_memunmap(md, sizeof (*md)); + return 0; + } + + early_memunmap(md, sizeof (*md)); + } + pr_err_once("requested map not found.\n"); + return -ENOENT; +} + +/* + * Calculate the highest address of an efi memory descriptor. + */ +u64 __init efi_mem_desc_end(efi_memory_desc_t *md) +{ + u64 size = md->num_pages << EFI_PAGE_SHIFT; + u64 end = md->phys_addr + size; + return end; +} /* * We can't ioremap data in EFI boot services RAM, because we've already mapped @@ -274,6 +353,7 @@ static __initdata efi_config_table_type_t common_tables[] = { {SMBIOS_TABLE_GUID, "SMBIOS", &efi.smbios}, {SMBIOS3_TABLE_GUID, "SMBIOS 3.0", &efi.smbios3}, {UGA_IO_PROTOCOL_GUID, "UGA", &efi.uga}, + {EFI_SYSTEM_RESOURCE_TABLE_GUID, "ESRT", &efi.esrt}, {NULL_GUID, NULL, NULL}, }; diff --git a/drivers/firmware/efi/esrt.c b/drivers/firmware/efi/esrt.c new file mode 100644 index 000000000000..20c0cbdb1c60 --- /dev/null +++ b/drivers/firmware/efi/esrt.c @@ -0,0 +1,464 @@ +/* + * esrt.c + * + * This module exports EFI System Resource Table (ESRT) entries into userspace + * through the sysfs file system. The ESRT provides a read-only catalog of + * system components for which the system accepts firmware upgrades via UEFI's + * "Capsule Update" feature. This module allows userland utilities to evaluate + * what firmware updates can be applied to this system, and potentially arrange + * for those updates to occur. + * + * Data is currently found below /sys/firmware/efi/esrt/... + */ +#define pr_fmt(fmt) "esrt: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct efi_system_resource_entry_v1 { + efi_guid_t fw_class; + u32 fw_type; + u32 fw_version; + u32 lowest_supported_fw_version; + u32 capsule_flags; + u32 last_attempt_version; + u32 last_attempt_status; +}; + +/* + * _count and _version are what they seem like. _max is actually just + * accounting info for the firmware when creating the table; it should never + * have been exposed to us. To wit, the spec says: + * The maximum number of resource array entries that can be within the + * table without reallocating the table, must not be zero. + * Since there's no guidance about what that means in terms of memory layout, + * it means nothing to us. + */ +struct efi_system_resource_table { + u32 fw_resource_count; + u32 fw_resource_count_max; + u64 fw_resource_version; + u8 entries[]; +}; + +static phys_addr_t esrt_data; +static size_t esrt_data_size; + +static struct efi_system_resource_table *esrt; + +struct esre_entry { + union { + struct efi_system_resource_entry_v1 *esre1; + } esre; + + struct kobject kobj; + struct list_head list; +}; + +/* global list of esre_entry. */ +static LIST_HEAD(entry_list); + +/* entry attribute */ +struct esre_attribute { + struct attribute attr; + ssize_t (*show)(struct esre_entry *entry, char *buf); + ssize_t (*store)(struct esre_entry *entry, + const char *buf, size_t count); +}; + +static struct esre_entry *to_entry(struct kobject *kobj) +{ + return container_of(kobj, struct esre_entry, kobj); +} + +static struct esre_attribute *to_attr(struct attribute *attr) +{ + return container_of(attr, struct esre_attribute, attr); +} + +static ssize_t esre_attr_show(struct kobject *kobj, + struct attribute *_attr, char *buf) +{ + struct esre_entry *entry = to_entry(kobj); + struct esre_attribute *attr = to_attr(_attr); + + /* Don't tell normal users what firmware versions we've got... */ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + return attr->show(entry, buf); +} + +static const struct sysfs_ops esre_attr_ops = { + .show = esre_attr_show, +}; + +/* Generic ESRT Entry ("ESRE") support. */ +static ssize_t esre_fw_class_show(struct esre_entry *entry, char *buf) +{ + char *str = buf; + + efi_guid_to_str(&entry->esre.esre1->fw_class, str); + str += strlen(str); + str += sprintf(str, "\n"); + + return str - buf; +} + +static struct esre_attribute esre_fw_class = __ATTR(fw_class, 0400, + esre_fw_class_show, NULL); + +#define esre_attr_decl(name, size, fmt) \ +static ssize_t esre_##name##_show(struct esre_entry *entry, char *buf) \ +{ \ + return sprintf(buf, fmt "\n", \ + le##size##_to_cpu(entry->esre.esre1->name)); \ +} \ +\ +static struct esre_attribute esre_##name = __ATTR(name, 0400, \ + esre_##name##_show, NULL) + +esre_attr_decl(fw_type, 32, "%u"); +esre_attr_decl(fw_version, 32, "%u"); +esre_attr_decl(lowest_supported_fw_version, 32, "%u"); +esre_attr_decl(capsule_flags, 32, "0x%x"); +esre_attr_decl(last_attempt_version, 32, "%u"); +esre_attr_decl(last_attempt_status, 32, "%u"); + +static struct attribute *esre1_attrs[] = { + &esre_fw_class.attr, + &esre_fw_type.attr, + &esre_fw_version.attr, + &esre_lowest_supported_fw_version.attr, + &esre_capsule_flags.attr, + &esre_last_attempt_version.attr, + &esre_last_attempt_status.attr, + NULL +}; +static void esre_release(struct kobject *kobj) +{ + struct esre_entry *entry = to_entry(kobj); + + list_del(&entry->list); + kfree(entry); +} + +static struct kobj_type esre1_ktype = { + .release = esre_release, + .sysfs_ops = &esre_attr_ops, + .default_attrs = esre1_attrs, +}; + + +static struct kobject *esrt_kobj; +static struct kset *esrt_kset; + +static int esre_create_sysfs_entry(void *esre, int entry_num) +{ + int rc = 0; + struct esre_entry *entry; + char name[20]; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + sprintf(name, "entry%d", entry_num); + + entry->kobj.kset = esrt_kset; + + if (esrt->fw_resource_version == 1) { + entry->esre.esre1 = esre; + rc = kobject_init_and_add(&entry->kobj, &esre1_ktype, NULL, + "%s", name); + } + if (rc) { + kfree(entry); + return rc; + } + + list_add_tail(&entry->list, &entry_list); + return 0; +} + +/* support for displaying ESRT fields at the top level */ +#define esrt_attr_decl(name, size, fmt) \ +static ssize_t esrt_##name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf)\ +{ \ + return sprintf(buf, fmt "\n", le##size##_to_cpu(esrt->name)); \ +} \ +\ +static struct kobj_attribute esrt_##name = __ATTR(name, 0400, \ + esrt_##name##_show, NULL) + +esrt_attr_decl(fw_resource_count, 32, "%u"); +esrt_attr_decl(fw_resource_count_max, 32, "%u"); +esrt_attr_decl(fw_resource_version, 64, "%llu"); + +static struct attribute *esrt_attrs[] = { + &esrt_fw_resource_count.attr, + &esrt_fw_resource_count_max.attr, + &esrt_fw_resource_version.attr, + NULL, +}; + +static inline int esrt_table_exists(void) +{ + if (!efi_enabled(EFI_CONFIG_TABLES)) + return 0; + if (efi.esrt == EFI_INVALID_TABLE_ADDR) + return 0; + return 1; +} + +static umode_t esrt_attr_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + if (!esrt_table_exists()) + return 0; + return attr->mode; +} + +static struct attribute_group esrt_attr_group = { + .attrs = esrt_attrs, + .is_visible = esrt_attr_is_visible, +}; + +/* + * remap the table, copy it to kmalloced pages, and unmap it. + */ +void __init efi_esrt_init(void) +{ + void *va; + struct efi_system_resource_table tmpesrt; + struct efi_system_resource_entry_v1 *v1_entries; + size_t size, max, entry_size, entries_size; + efi_memory_desc_t md; + int rc; + + pr_debug("esrt-init: loading.\n"); + if (!esrt_table_exists()) + return; + + rc = efi_mem_desc_lookup(efi.esrt, &md); + if (rc < 0) { + pr_err("ESRT header is not in the memory map.\n"); + return; + } + + max = efi_mem_desc_end(&md); + if (max < efi.esrt) { + pr_err("EFI memory descriptor is invalid. (esrt: %p max: %p)\n", + (void *)efi.esrt, (void *)max); + return; + } + + size = sizeof(*esrt); + max -= efi.esrt; + + if (max < size) { + pr_err("ESRT header doen't fit on single memory map entry. (size: %zu max: %zu)\n", + size, max); + return; + } + + va = early_memremap(efi.esrt, size); + if (!va) { + pr_err("early_memremap(%p, %zu) failed.\n", (void *)efi.esrt, + size); + return; + } + + memcpy(&tmpesrt, va, sizeof(tmpesrt)); + + if (tmpesrt.fw_resource_version == 1) { + entry_size = sizeof (*v1_entries); + } else { + pr_err("Unsupported ESRT version %lld.\n", + tmpesrt.fw_resource_version); + return; + } + + if (tmpesrt.fw_resource_count > 0 && max - size < entry_size) { + pr_err("ESRT memory map entry can only hold the header. (max: %zu size: %zu)\n", + max - size, entry_size); + goto err_memunmap; + } + + /* + * The format doesn't really give us any boundary to test here, + * so I'm making up 128 as the max number of individually updatable + * components we support. + * 128 should be pretty excessive, but there's still some chance + * somebody will do that someday and we'll need to raise this. + */ + if (tmpesrt.fw_resource_count > 128) { + pr_err("ESRT says fw_resource_count has very large value %d.\n", + tmpesrt.fw_resource_count); + goto err_memunmap; + } + + /* + * We know it can't be larger than N * sizeof() here, and N is limited + * by the previous test to a small number, so there's no overflow. + */ + entries_size = tmpesrt.fw_resource_count * entry_size; + if (max < size + entries_size) { + pr_err("ESRT does not fit on single memory map entry (size: %zu max: %zu)\n", + size, max); + goto err_memunmap; + } + + /* remap it with our (plausible) new pages */ + early_memunmap(va, size); + size += entries_size; + va = early_memremap(efi.esrt, size); + if (!va) { + pr_err("early_memremap(%p, %zu) failed.\n", (void *)efi.esrt, + size); + return; + } + + esrt_data = (phys_addr_t)efi.esrt; + esrt_data_size = size; + + pr_info("Reserving ESRT space from %p to %p.\n", (void *)esrt_data, + (char *)esrt_data + size); + memblock_reserve(esrt_data, esrt_data_size); + + pr_debug("esrt-init: loaded.\n"); +err_memunmap: + early_memunmap(va, size); +} + +static int __init register_entries(void) +{ + struct efi_system_resource_entry_v1 *v1_entries = (void *)esrt->entries; + int i, rc; + + if (!esrt_table_exists()) + return 0; + + for (i = 0; i < le32_to_cpu(esrt->fw_resource_count); i++) { + void *entry; + if (esrt->fw_resource_version == 1) { + entry = &v1_entries[i]; + } + rc = esre_create_sysfs_entry(entry, i); + if (rc < 0) { + pr_err("ESRT entry creation failed with error %d.\n", + rc); + return rc; + } + } + return 0; +} + +static void cleanup_entry_list(void) +{ + struct esre_entry *entry, *next; + + list_for_each_entry_safe(entry, next, &entry_list, list) { + kobject_put(&entry->kobj); + } +} + +static int __init esrt_sysfs_init(void) +{ + int error; + struct efi_system_resource_table __iomem *ioesrt; + + pr_debug("esrt-sysfs: loading.\n"); + if (!esrt_data || !esrt_data_size) + return -ENOSYS; + + ioesrt = ioremap(esrt_data, esrt_data_size); + if (!ioesrt) { + pr_err("ioremap(%p, %zu) failed.\n", (void *)esrt_data, + esrt_data_size); + return -ENOMEM; + } + + esrt = kmalloc(esrt_data_size, GFP_KERNEL); + if (!esrt) { + pr_err("kmalloc failed. (wanted %zu bytes)\n", esrt_data_size); + iounmap(ioesrt); + return -ENOMEM; + } + + memcpy_fromio(esrt, ioesrt, esrt_data_size); + + esrt_kobj = kobject_create_and_add("esrt", efi_kobj); + if (!esrt_kobj) { + pr_err("Firmware table registration failed.\n"); + error = -ENOMEM; + goto err; + } + + error = sysfs_create_group(esrt_kobj, &esrt_attr_group); + if (error) { + pr_err("Sysfs attribute export failed with error %d.\n", + error); + goto err_remove_esrt; + } + + esrt_kset = kset_create_and_add("entries", NULL, esrt_kobj); + if (!esrt_kset) { + pr_err("kset creation failed.\n"); + error = -ENOMEM; + goto err_remove_group; + } + + error = register_entries(); + if (error) + goto err_cleanup_list; + + memblock_remove(esrt_data, esrt_data_size); + + pr_debug("esrt-sysfs: loaded.\n"); + + return 0; +err_cleanup_list: + cleanup_entry_list(); + kset_unregister(esrt_kset); +err_remove_group: + sysfs_remove_group(esrt_kobj, &esrt_attr_group); +err_remove_esrt: + kobject_put(esrt_kobj); +err: + kfree(esrt); + esrt = NULL; + return error; +} + +static void __exit esrt_sysfs_exit(void) +{ + pr_debug("esrt-sysfs: unloading.\n"); + cleanup_entry_list(); + kset_unregister(esrt_kset); + sysfs_remove_group(esrt_kobj, &esrt_attr_group); + kfree(esrt); + esrt = NULL; + kobject_del(esrt_kobj); + kobject_put(esrt_kobj); +} + +module_init(esrt_sysfs_init); +module_exit(esrt_sysfs_exit); + +MODULE_AUTHOR("Peter Jones "); +MODULE_DESCRIPTION("EFI System Resource Table support"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/efi.h b/include/linux/efi.h index af5be0368dec..024c27e7c0fa 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -583,6 +583,9 @@ void efi_native_runtime_setup(void); #define EFI_FILE_INFO_ID \ EFI_GUID( 0x9576e92, 0x6d3f, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b ) +#define EFI_SYSTEM_RESOURCE_TABLE_GUID \ + EFI_GUID( 0xb122a263, 0x3661, 0x4f68, 0x99, 0x29, 0x78, 0xf8, 0xb0, 0xd6, 0x21, 0x80 ) + #define EFI_FILE_SYSTEM_GUID \ EFI_GUID( 0x964e5b22, 0x6459, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b ) @@ -823,6 +826,7 @@ extern struct efi { unsigned long fw_vendor; /* fw_vendor */ unsigned long runtime; /* runtime table */ unsigned long config_table; /* config tables */ + unsigned long esrt; /* ESRT table */ efi_get_time_t *get_time; efi_set_time_t *set_time; efi_get_wakeup_time_t *get_wakeup_time; @@ -875,6 +879,7 @@ static inline efi_status_t efi_query_variable_store(u32 attributes, unsigned lon #endif extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr); extern int efi_config_init(efi_config_table_type_t *arch_tables); +extern void __init efi_esrt_init(void); extern int efi_config_parse_tables(void *config_tables, int count, int sz, efi_config_table_type_t *arch_tables); extern u64 efi_get_iobase (void); @@ -882,12 +887,15 @@ extern u32 efi_mem_type (unsigned long phys_addr); extern u64 efi_mem_attributes (unsigned long phys_addr); extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size); extern int __init efi_uart_console_only (void); +extern u64 efi_mem_desc_end(efi_memory_desc_t *md); +extern int efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md); extern void efi_initialize_iomem_resources(struct resource *code_resource, struct resource *data_resource, struct resource *bss_resource); extern void efi_get_time(struct timespec *now); extern void efi_reserve_boot_services(void); extern int efi_get_fdt_params(struct efi_fdt_params *params, int verbose); extern struct efi_memory_map memmap; +extern struct kobject *efi_kobj; extern int efi_reboot_quirk_mode; extern bool efi_poweroff_required(void); -- cgit v1.2.3 From fb7737e949e31d8a71acee6bbb670f32dbd2a2c0 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Wed, 4 Mar 2015 20:01:14 -0600 Subject: hwspinlock/core: add device tree support This patch adds a new OF-friendly API of_hwspin_lock_get_id() for hwspinlock clients to use/request locks from a hwspinlock device instantiated through a device-tree blob. This new API can be used by hwspinlock clients to get the id for a specific lock using the phandle + args specifier, so that it can be requested using the available hwspin_lock_request_specific() API. Signed-off-by: Suman Anna Reviewed-by: Bjorn Andersson [small comment clarification] Signed-off-by: Ohad Ben-Cohen --- Documentation/hwspinlock.txt | 10 +++++ drivers/hwspinlock/hwspinlock_core.c | 79 ++++++++++++++++++++++++++++++++++++ include/linux/hwspinlock.h | 7 ++++ 3 files changed, 96 insertions(+) (limited to 'include/linux') diff --git a/Documentation/hwspinlock.txt b/Documentation/hwspinlock.txt index 62f7d4ea6e26..61c1ee98e59f 100644 --- a/Documentation/hwspinlock.txt +++ b/Documentation/hwspinlock.txt @@ -48,6 +48,16 @@ independent, drivers. ids for predefined purposes. Should be called from a process context (might sleep). + int of_hwspin_lock_get_id(struct device_node *np, int index); + - retrieve the global lock id for an OF phandle-based specific lock. + This function provides a means for DT users of a hwspinlock module + to get the global lock id of a specific hwspinlock, so that it can + be requested using the normal hwspin_lock_request_specific() API. + The function returns a lock id number on success, -EPROBE_DEFER if + the hwspinlock device is not yet registered with the core, or other + error values. + Should be called from a process context (might sleep). + int hwspin_lock_free(struct hwspinlock *hwlock); - free a previously-assigned hwspinlock; returns 0 on success, or an appropriate error code on failure (e.g. -EINVAL if the hwspinlock diff --git a/drivers/hwspinlock/hwspinlock_core.c b/drivers/hwspinlock/hwspinlock_core.c index 461a0d739d75..52f708bcf77f 100644 --- a/drivers/hwspinlock/hwspinlock_core.c +++ b/drivers/hwspinlock/hwspinlock_core.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "hwspinlock_internal.h" @@ -257,6 +258,84 @@ void __hwspin_unlock(struct hwspinlock *hwlock, int mode, unsigned long *flags) } EXPORT_SYMBOL_GPL(__hwspin_unlock); +/** + * of_hwspin_lock_simple_xlate - translate hwlock_spec to return a lock id + * @bank: the hwspinlock device bank + * @hwlock_spec: hwlock specifier as found in the device tree + * + * This is a simple translation function, suitable for hwspinlock platform + * drivers that only has a lock specifier length of 1. + * + * Returns a relative index of the lock within a specified bank on success, + * or -EINVAL on invalid specifier cell count. + */ +static inline int +of_hwspin_lock_simple_xlate(const struct of_phandle_args *hwlock_spec) +{ + if (WARN_ON(hwlock_spec->args_count != 1)) + return -EINVAL; + + return hwlock_spec->args[0]; +} + +/** + * of_hwspin_lock_get_id() - get lock id for an OF phandle-based specific lock + * @np: device node from which to request the specific hwlock + * @index: index of the hwlock in the list of values + * + * This function provides a means for DT users of the hwspinlock module to + * get the global lock id of a specific hwspinlock using the phandle of the + * hwspinlock device, so that it can be requested using the normal + * hwspin_lock_request_specific() API. + * + * Returns the global lock id number on success, -EPROBE_DEFER if the hwspinlock + * device is not yet registered, -EINVAL on invalid args specifier value or an + * appropriate error as returned from the OF parsing of the DT client node. + */ +int of_hwspin_lock_get_id(struct device_node *np, int index) +{ + struct of_phandle_args args; + struct hwspinlock *hwlock; + struct radix_tree_iter iter; + void **slot; + int id; + int ret; + + ret = of_parse_phandle_with_args(np, "hwlocks", "#hwlock-cells", index, + &args); + if (ret) + return ret; + + /* Find the hwspinlock device: we need its base_id */ + ret = -EPROBE_DEFER; + rcu_read_lock(); + radix_tree_for_each_slot(slot, &hwspinlock_tree, &iter, 0) { + hwlock = radix_tree_deref_slot(slot); + if (unlikely(!hwlock)) + continue; + + if (hwlock->bank->dev->of_node == args.np) { + ret = 0; + break; + } + } + rcu_read_unlock(); + if (ret < 0) + goto out; + + id = of_hwspin_lock_simple_xlate(&args); + if (id < 0 || id >= hwlock->bank->num_locks) { + ret = -EINVAL; + goto out; + } + id += hwlock->bank->base_id; + +out: + of_node_put(args.np); + return ret ? ret : id; +} +EXPORT_SYMBOL_GPL(of_hwspin_lock_get_id); + static int hwspin_lock_register_single(struct hwspinlock *hwlock, int id) { struct hwspinlock *tmp; diff --git a/include/linux/hwspinlock.h b/include/linux/hwspinlock.h index 3343298e40e8..859d673d98c8 100644 --- a/include/linux/hwspinlock.h +++ b/include/linux/hwspinlock.h @@ -26,6 +26,7 @@ #define HWLOCK_IRQ 0x02 /* Disable interrupts, don't save state */ struct device; +struct device_node; struct hwspinlock; struct hwspinlock_device; struct hwspinlock_ops; @@ -66,6 +67,7 @@ int hwspin_lock_unregister(struct hwspinlock_device *bank); struct hwspinlock *hwspin_lock_request(void); struct hwspinlock *hwspin_lock_request_specific(unsigned int id); int hwspin_lock_free(struct hwspinlock *hwlock); +int of_hwspin_lock_get_id(struct device_node *np, int index); int hwspin_lock_get_id(struct hwspinlock *hwlock); int __hwspin_lock_timeout(struct hwspinlock *, unsigned int, int, unsigned long *); @@ -120,6 +122,11 @@ void __hwspin_unlock(struct hwspinlock *hwlock, int mode, unsigned long *flags) { } +static inline int of_hwspin_lock_get_id(struct device_node *np, int index) +{ + return 0; +} + static inline int hwspin_lock_get_id(struct hwspinlock *hwlock) { return 0; -- cgit v1.2.3 From d54385ce68cd18ab002b46f61246ad197cec92de Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 30 Apr 2015 14:53:54 -0700 Subject: etherdev: Process is_multicast_ether_addr at same size as other operations This change makes it so that we process the address in is_multicast_ether_addr at the same size as the other calls. This allows us to avoid duplicate reads when used with other calls such as is_zero_ether_addr or eth_addr_copy. In addition I have added a 64 bit version of the function so in eth_type_trans we can process the destination address as a 64 bit value throughout. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 24 +++++++++++++++++++++++- net/ethernet/eth.c | 2 +- 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 606563ef8a72..c4a10f991fe0 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -110,7 +110,29 @@ static inline bool is_zero_ether_addr(const u8 *addr) */ static inline bool is_multicast_ether_addr(const u8 *addr) { - return 0x01 & addr[0]; +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) + u32 a = *(const u32 *)addr; +#else + u16 a = *(const u16 *)addr; +#endif +#ifdef __BIG_ENDIAN + return 0x01 & (a >> ((sizeof(a) * 8) - 8)); +#else + return 0x01 & a; +#endif +} + +static inline bool is_multicast_ether_addr_64bits(const u8 addr[6+2]) +{ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 +#ifdef __BIG_ENDIAN + return 0x01 & ((*(const u64 *)addr) >> 56); +#else + return 0x01 & (*(const u64 *)addr); +#endif +#else + return is_multicast_ether_addr(addr); +#endif } /** diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 60069318d5d1..21c211e9fd5a 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -159,7 +159,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) skb_pull_inline(skb, ETH_HLEN); eth = eth_hdr(skb); - if (unlikely(is_multicast_ether_addr(eth->h_dest))) { + if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else -- cgit v1.2.3 From 50fb799289501c2eab9f43fc9af513027e1e994f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 1 May 2015 11:30:12 -0700 Subject: net: Add skb_get_hash_perturb This calls flow_disect and __skb_get_hash to procure a hash for a packet. Input includes a key to initialize jhash. This function does not set skb->hash. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ net/core/flow_dissector.c | 38 ++++++++++++++++++++++++++++++-------- 2 files changed, 32 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 66e374d62f64..acb83e249e3f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -927,6 +927,8 @@ static inline __u32 skb_get_hash(struct sk_buff *skb) return skb->hash; } +__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb); + static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) { return skb->hash; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 2c35c02a931e..b1df995ef06c 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -267,13 +267,12 @@ static __always_inline void __flow_hash_secret_init(void) net_get_random_once(&hashrnd, sizeof(hashrnd)); } -static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c) +static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c, u32 keyval) { - __flow_hash_secret_init(); - return jhash_3words(a, b, c, hashrnd); + return jhash_3words(a, b, c, keyval); } -static inline u32 __flow_hash_from_keys(struct flow_keys *keys) +static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) { u32 hash; @@ -287,7 +286,8 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys) hash = __flow_hash_3words((__force u32)keys->dst, (__force u32)keys->src, - (__force u32)keys->ports); + (__force u32)keys->ports, + keyval); if (!hash) hash = 1; @@ -296,10 +296,20 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys) u32 flow_hash_from_keys(struct flow_keys *keys) { - return __flow_hash_from_keys(keys); + __flow_hash_secret_init(); + return __flow_hash_from_keys(keys, hashrnd); } EXPORT_SYMBOL(flow_hash_from_keys); +static inline u32 ___skb_get_hash(const struct sk_buff *skb, + struct flow_keys *keys, u32 keyval) +{ + if (!skb_flow_dissect(skb, keys)) + return 0; + + return __flow_hash_from_keys(keys, keyval); +} + /* * __skb_get_hash: calculate a flow hash based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value @@ -309,8 +319,12 @@ EXPORT_SYMBOL(flow_hash_from_keys); void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; + u32 hash; - if (!skb_flow_dissect(skb, &keys)) + __flow_hash_secret_init(); + + hash = ___skb_get_hash(skb, &keys, hashrnd); + if (!hash) return; if (keys.ports) @@ -318,10 +332,18 @@ void __skb_get_hash(struct sk_buff *skb) skb->sw_hash = 1; - skb->hash = __flow_hash_from_keys(&keys); + skb->hash = hash; } EXPORT_SYMBOL(__skb_get_hash); +__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) +{ + struct flow_keys keys; + + return ___skb_get_hash(skb, &keys, perturb); +} +EXPORT_SYMBOL(skb_get_hash_perturb); + /* * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. -- cgit v1.2.3 From 20f56758b04364c3c139edbffde8cfaf0307edee Mon Sep 17 00:00:00 2001 From: Jacek Anaszewski Date: Tue, 28 Apr 2015 00:18:41 -0700 Subject: leds: unify the location of led-trigger API Part of led-trigger API was in the private drivers/leds/leds.h header. Move it to the include/linux/leds.h header to unify the API location and announce it as public. It has been already exported from led-triggers.c with EXPORT_SYMBOL_GPL macro. The no-op definitions are changed from macros to inline to match the style of the surrounding code. Signed-off-by: Jacek Anaszewski Cc: Richard Purdie Acked-by: Sakari Ailus Signed-off-by: Bryan Wu --- drivers/leds/leds.h | 24 ------------------------ include/linux/leds.h | 25 +++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/leds/leds.h b/drivers/leds/leds.h index 79efe57c7405..bc89d7ace2c4 100644 --- a/drivers/leds/leds.h +++ b/drivers/leds/leds.h @@ -13,7 +13,6 @@ #ifndef __LEDS_H_INCLUDED #define __LEDS_H_INCLUDED -#include #include #include @@ -50,27 +49,4 @@ void led_stop_software_blink(struct led_classdev *led_cdev); extern struct rw_semaphore leds_list_lock; extern struct list_head leds_list; -#ifdef CONFIG_LEDS_TRIGGERS -void led_trigger_set_default(struct led_classdev *led_cdev); -void led_trigger_set(struct led_classdev *led_cdev, - struct led_trigger *trigger); -void led_trigger_remove(struct led_classdev *led_cdev); - -static inline void *led_get_trigger_data(struct led_classdev *led_cdev) -{ - return led_cdev->trigger_data; -} - -#else -#define led_trigger_set_default(x) do {} while (0) -#define led_trigger_set(x, y) do {} while (0) -#define led_trigger_remove(x) do {} while (0) -#define led_get_trigger_data(x) (NULL) -#endif - -ssize_t led_trigger_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count); -ssize_t led_trigger_show(struct device *dev, struct device_attribute *attr, - char *buf); - #endif /* __LEDS_H_INCLUDED */ diff --git a/include/linux/leds.h b/include/linux/leds.h index 9a2b000094cf..b122eeafb5dc 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -12,6 +12,7 @@ #ifndef __LINUX_LEDS_H_INCLUDED #define __LINUX_LEDS_H_INCLUDED +#include #include #include #include @@ -222,6 +223,11 @@ struct led_trigger { struct list_head next_trig; }; +ssize_t led_trigger_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count); +ssize_t led_trigger_show(struct device *dev, struct device_attribute *attr, + char *buf); + /* Registration functions for complex triggers */ extern int led_trigger_register(struct led_trigger *trigger); extern void led_trigger_unregister(struct led_trigger *trigger); @@ -238,6 +244,16 @@ extern void led_trigger_blink_oneshot(struct led_trigger *trigger, unsigned long *delay_on, unsigned long *delay_off, int invert); +extern void led_trigger_set_default(struct led_classdev *led_cdev); +extern void led_trigger_set(struct led_classdev *led_cdev, + struct led_trigger *trigger); +extern void led_trigger_remove(struct led_classdev *led_cdev); + +static inline void *led_get_trigger_data(struct led_classdev *led_cdev) +{ + return led_cdev->trigger_data; +} + /** * led_trigger_rename_static - rename a trigger * @name: the new trigger name @@ -267,6 +283,15 @@ static inline void led_trigger_register_simple(const char *name, static inline void led_trigger_unregister_simple(struct led_trigger *trigger) {} static inline void led_trigger_event(struct led_trigger *trigger, enum led_brightness event) {} +static inline void led_trigger_set_default(struct led_classdev *led_cdev) {} +static inline void led_trigger_set(struct led_classdev *led_cdev, + struct led_trigger *trigger) {} +static inline void led_trigger_remove(struct led_classdev *led_cdev) {} +static inline void *led_get_trigger_data(struct led_classdev *led_cdev) +{ + return NULL; +} + #endif /* CONFIG_LEDS_TRIGGERS */ /* Trigger specific functions */ -- cgit v1.2.3 From 9afd85c9e4552b276e2f4cfefd622bdeeffbbf26 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Sat, 2 May 2015 14:01:07 +0200 Subject: net: Export IGMP/MLD message validation code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this patch, the IGMP and MLD message validation functions are moved from the bridge code to IPv4/IPv6 multicast files. Some small refactoring was done to enhance readibility and to iron out some differences in behaviour between the IGMP and MLD parsing code (e.g. the skb-cloning of MLD messages is now only done if necessary, just like the IGMP part always did). Finally, these IGMP and MLD message validation functions are exported so that not only the bridge can use it but batman-adv later, too. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- include/linux/igmp.h | 1 + include/linux/skbuff.h | 3 + include/net/addrconf.h | 1 + net/bridge/br_multicast.c | 218 +++++++--------------------------------------- net/core/skbuff.c | 87 ++++++++++++++++++ net/ipv4/igmp.c | 162 ++++++++++++++++++++++++++++++++++ net/ipv6/Makefile | 1 + net/ipv6/mcast_snoop.c | 213 ++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 498 insertions(+), 188 deletions(-) create mode 100644 net/ipv6/mcast_snoop.c (limited to 'include/linux') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 2c677afeea47..193ad488d3e2 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -130,5 +130,6 @@ extern void ip_mc_unmap(struct in_device *); extern void ip_mc_remap(struct in_device *); extern void ip_mc_dec_group(struct in_device *in_dev, __be32 addr); extern void ip_mc_inc_group(struct in_device *in_dev, __be32 addr); +int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed); #endif diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index acb83e249e3f..9c2f793573fa 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3419,6 +3419,9 @@ static inline void skb_checksum_none_assert(const struct sk_buff *skb) bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); int skb_checksum_setup(struct sk_buff *skb, bool recalculate); +struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, + unsigned int transport_len, + __sum16(*skb_chkf)(struct sk_buff *skb)); u32 skb_get_poff(const struct sk_buff *skb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 80456f72d70a..def59d3a34d5 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -142,6 +142,7 @@ void ipv6_mc_unmap(struct inet6_dev *idev); void ipv6_mc_remap(struct inet6_dev *idev); void ipv6_mc_init_dev(struct inet6_dev *idev); void ipv6_mc_destroy_dev(struct inet6_dev *idev); +int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed); void addrconf_dad_failure(struct inet6_ifaddr *ifp); bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index b52f4cb8aee9..2d69d5cab52f 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -975,9 +975,6 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, int err = 0; __be32 group; - if (!pskb_may_pull(skb, sizeof(*ih))) - return -EINVAL; - ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); len = sizeof(*ih); @@ -1248,25 +1245,14 @@ static int br_ip4_multicast_query(struct net_bridge *br, max_delay = 10 * HZ; group = 0; } - } else { - if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) { - err = -EINVAL; - goto out; - } - + } else if (skb->len >= sizeof(*ih3)) { ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs) goto out; max_delay = ih3->code ? IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1; - } - - /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer - * all-systems destination addresses (224.0.0.1) for general queries - */ - if (!group && iph->daddr != htonl(INADDR_ALLHOSTS_GROUP)) { - err = -EINVAL; + } else { goto out; } @@ -1329,12 +1315,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, (port && port->state == BR_STATE_DISABLED)) goto out; - /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */ - if (!(ipv6_addr_type(&ip6h->saddr) & IPV6_ADDR_LINKLOCAL)) { - err = -EINVAL; - goto out; - } - if (skb->len == sizeof(*mld)) { if (!pskb_may_pull(skb, sizeof(*mld))) { err = -EINVAL; @@ -1358,14 +1338,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, is_general_query = group && ipv6_addr_any(group); - /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer - * all-nodes destination address (ff02::1) for general queries - */ - if (is_general_query && !ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) { - err = -EINVAL; - goto out; - } - if (is_general_query) { saddr.proto = htons(ETH_P_IPV6); saddr.u.ip6 = ip6h->saddr; @@ -1557,66 +1529,22 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, struct sk_buff *skb, u16 vid) { - struct sk_buff *skb2 = skb; - const struct iphdr *iph; + struct sk_buff *skb_trimmed = NULL; struct igmphdr *ih; - unsigned int len; - unsigned int offset; int err; - /* We treat OOM as packet loss for now. */ - if (!pskb_may_pull(skb, sizeof(*iph))) - return -EINVAL; - - iph = ip_hdr(skb); - - if (iph->ihl < 5 || iph->version != 4) - return -EINVAL; - - if (!pskb_may_pull(skb, ip_hdrlen(skb))) - return -EINVAL; - - iph = ip_hdr(skb); + err = ip_mc_check_igmp(skb, &skb_trimmed); - if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) - return -EINVAL; - - if (iph->protocol != IPPROTO_IGMP) { - if (!ipv4_is_local_multicast(iph->daddr)) + if (err == -ENOMSG) { + if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr)) BR_INPUT_SKB_CB(skb)->mrouters_only = 1; return 0; + } else if (err < 0) { + return err; } - len = ntohs(iph->tot_len); - if (skb->len < len || len < ip_hdrlen(skb)) - return -EINVAL; - - if (skb->len > len) { - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - return -ENOMEM; - - err = pskb_trim_rcsum(skb2, len); - if (err) - goto err_out; - } - - len -= ip_hdrlen(skb2); - offset = skb_network_offset(skb2) + ip_hdrlen(skb2); - __skb_pull(skb2, offset); - skb_reset_transport_header(skb2); - - err = -EINVAL; - if (!pskb_may_pull(skb2, sizeof(*ih))) - goto out; - - if (skb_checksum_simple_validate(skb2)) - goto out; - - err = 0; - BR_INPUT_SKB_CB(skb)->igmp = 1; - ih = igmp_hdr(skb2); + ih = igmp_hdr(skb); switch (ih->type) { case IGMP_HOST_MEMBERSHIP_REPORT: @@ -1625,21 +1553,19 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, err = br_ip4_multicast_add_group(br, port, ih->group, vid); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: - err = br_ip4_multicast_igmp3_report(br, port, skb2, vid); + err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid); break; case IGMP_HOST_MEMBERSHIP_QUERY: - err = br_ip4_multicast_query(br, port, skb2, vid); + err = br_ip4_multicast_query(br, port, skb_trimmed, vid); break; case IGMP_HOST_LEAVE_MESSAGE: br_ip4_multicast_leave_group(br, port, ih->group, vid); break; } -out: - __skb_push(skb2, offset); -err_out: - if (skb2 != skb) - kfree_skb(skb2); + if (skb_trimmed) + kfree_skb(skb_trimmed); + return err; } @@ -1649,126 +1575,42 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, struct sk_buff *skb, u16 vid) { - struct sk_buff *skb2; - const struct ipv6hdr *ip6h; - u8 icmp6_type; - u8 nexthdr; - __be16 frag_off; - unsigned int len; - int offset; + struct sk_buff *skb_trimmed = NULL; + struct mld_msg *mld; int err; - if (!pskb_may_pull(skb, sizeof(*ip6h))) - return -EINVAL; + err = ipv6_mc_check_mld(skb, &skb_trimmed); - ip6h = ipv6_hdr(skb); - - /* - * We're interested in MLD messages only. - * - Version is 6 - * - MLD has always Router Alert hop-by-hop option - * - But we do not support jumbrograms. - */ - if (ip6h->version != 6) - return 0; - - /* Prevent flooding this packet if there is no listener present */ - if (!ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) - BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - - if (ip6h->nexthdr != IPPROTO_HOPOPTS || - ip6h->payload_len == 0) - return 0; - - len = ntohs(ip6h->payload_len) + sizeof(*ip6h); - if (skb->len < len) - return -EINVAL; - - nexthdr = ip6h->nexthdr; - offset = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr, &frag_off); - - if (offset < 0 || nexthdr != IPPROTO_ICMPV6) + if (err == -ENOMSG) { + if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) + BR_INPUT_SKB_CB(skb)->mrouters_only = 1; return 0; - - /* Okay, we found ICMPv6 header */ - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - return -ENOMEM; - - err = -EINVAL; - if (!pskb_may_pull(skb2, offset + sizeof(struct icmp6hdr))) - goto out; - - len -= offset - skb_network_offset(skb2); - - __skb_pull(skb2, offset); - skb_reset_transport_header(skb2); - skb_postpull_rcsum(skb2, skb_network_header(skb2), - skb_network_header_len(skb2)); - - icmp6_type = icmp6_hdr(skb2)->icmp6_type; - - switch (icmp6_type) { - case ICMPV6_MGM_QUERY: - case ICMPV6_MGM_REPORT: - case ICMPV6_MGM_REDUCTION: - case ICMPV6_MLD2_REPORT: - break; - default: - err = 0; - goto out; - } - - /* Okay, we found MLD message. Check further. */ - if (skb2->len > len) { - err = pskb_trim_rcsum(skb2, len); - if (err) - goto out; - err = -EINVAL; + } else if (err < 0) { + return err; } - ip6h = ipv6_hdr(skb2); - - if (skb_checksum_validate(skb2, IPPROTO_ICMPV6, ip6_compute_pseudo)) - goto out; - - err = 0; - BR_INPUT_SKB_CB(skb)->igmp = 1; + mld = (struct mld_msg *)skb_transport_header(skb); - switch (icmp6_type) { + switch (mld->mld_type) { case ICMPV6_MGM_REPORT: - { - struct mld_msg *mld; - if (!pskb_may_pull(skb2, sizeof(*mld))) { - err = -EINVAL; - goto out; - } - mld = (struct mld_msg *)skb_transport_header(skb2); BR_INPUT_SKB_CB(skb)->mrouters_only = 1; err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid); break; - } case ICMPV6_MLD2_REPORT: - err = br_ip6_multicast_mld2_report(br, port, skb2, vid); + err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid); break; case ICMPV6_MGM_QUERY: - err = br_ip6_multicast_query(br, port, skb2, vid); + err = br_ip6_multicast_query(br, port, skb_trimmed, vid); break; case ICMPV6_MGM_REDUCTION: - { - struct mld_msg *mld; - if (!pskb_may_pull(skb2, sizeof(*mld))) { - err = -EINVAL; - goto out; - } - mld = (struct mld_msg *)skb_transport_header(skb2); br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid); - } + break; } -out: - kfree_skb(skb2); + if (skb_trimmed) + kfree_skb(skb_trimmed); + return err; } #endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3cfff2a3d651..1e4278a4dd7e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4030,6 +4030,93 @@ int skb_checksum_setup(struct sk_buff *skb, bool recalculate) } EXPORT_SYMBOL(skb_checksum_setup); +/** + * skb_checksum_maybe_trim - maybe trims the given skb + * @skb: the skb to check + * @transport_len: the data length beyond the network header + * + * Checks whether the given skb has data beyond the given transport length. + * If so, returns a cloned skb trimmed to this transport length. + * Otherwise returns the provided skb. Returns NULL in error cases + * (e.g. transport_len exceeds skb length or out-of-memory). + * + * Caller needs to set the skb transport header and release the returned skb. + * Provided skb is consumed. + */ +static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, + unsigned int transport_len) +{ + struct sk_buff *skb_chk; + unsigned int len = skb_transport_offset(skb) + transport_len; + int ret; + + if (skb->len < len) { + kfree_skb(skb); + return NULL; + } else if (skb->len == len) { + return skb; + } + + skb_chk = skb_clone(skb, GFP_ATOMIC); + kfree_skb(skb); + + if (!skb_chk) + return NULL; + + ret = pskb_trim_rcsum(skb_chk, len); + if (ret) { + kfree_skb(skb_chk); + return NULL; + } + + return skb_chk; +} + +/** + * skb_checksum_trimmed - validate checksum of an skb + * @skb: the skb to check + * @transport_len: the data length beyond the network header + * @skb_chkf: checksum function to use + * + * Applies the given checksum function skb_chkf to the provided skb. + * Returns a checked and maybe trimmed skb. Returns NULL on error. + * + * If the skb has data beyond the given transport length, then a + * trimmed & cloned skb is checked and returned. + * + * Caller needs to set the skb transport header and release the returned skb. + * Provided skb is consumed. + */ +struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, + unsigned int transport_len, + __sum16(*skb_chkf)(struct sk_buff *skb)) +{ + struct sk_buff *skb_chk; + unsigned int offset = skb_transport_offset(skb); + int ret; + + skb_chk = skb_checksum_maybe_trim(skb, transport_len); + if (!skb_chk) + return NULL; + + if (!pskb_may_pull(skb_chk, offset)) { + kfree_skb(skb_chk); + return NULL; + } + + __skb_pull(skb_chk, offset); + ret = skb_chkf(skb_chk); + __skb_push(skb_chk, offset); + + if (ret) { + kfree_skb(skb_chk); + return NULL; + } + + return skb_chk; +} +EXPORT_SYMBOL(skb_checksum_trimmed); + void __skb_warn_lro_forwarding(const struct sk_buff *skb) { net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index a3a697f5ffba..651cdf648ec4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1339,6 +1339,168 @@ out: } EXPORT_SYMBOL(ip_mc_inc_group); +static int ip_mc_check_iphdr(struct sk_buff *skb) +{ + const struct iphdr *iph; + unsigned int len; + unsigned int offset = skb_network_offset(skb) + sizeof(*iph); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + iph = ip_hdr(skb); + + if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph)) + return -EINVAL; + + offset += ip_hdrlen(skb) - sizeof(*iph); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + iph = ip_hdr(skb); + + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + return -EINVAL; + + len = skb_network_offset(skb) + ntohs(iph->tot_len); + if (skb->len < len || len < offset) + return -EINVAL; + + skb_set_transport_header(skb, offset); + + return 0; +} + +static int ip_mc_check_igmp_reportv3(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct igmpv3_report); + + return pskb_may_pull(skb, len) ? 0 : -EINVAL; +} + +static int ip_mc_check_igmp_query(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct igmphdr); + if (skb->len < len) + return -EINVAL; + + /* IGMPv{1,2}? */ + if (skb->len != len) { + /* or IGMPv3? */ + len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr); + if (skb->len < len || !pskb_may_pull(skb, len)) + return -EINVAL; + } + + /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer + * all-systems destination addresses (224.0.0.1) for general queries + */ + if (!igmp_hdr(skb)->group && + ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP)) + return -EINVAL; + + return 0; +} + +static int ip_mc_check_igmp_msg(struct sk_buff *skb) +{ + switch (igmp_hdr(skb)->type) { + case IGMP_HOST_LEAVE_MESSAGE: + case IGMP_HOST_MEMBERSHIP_REPORT: + case IGMPV2_HOST_MEMBERSHIP_REPORT: + /* fall through */ + return 0; + case IGMPV3_HOST_MEMBERSHIP_REPORT: + return ip_mc_check_igmp_reportv3(skb); + case IGMP_HOST_MEMBERSHIP_QUERY: + return ip_mc_check_igmp_query(skb); + default: + return -ENOMSG; + } +} + +static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb) +{ + return skb_checksum_simple_validate(skb); +} + +static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) + +{ + struct sk_buff *skb_chk; + unsigned int transport_len; + unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr); + int ret; + + transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb); + + skb_get(skb); + skb_chk = skb_checksum_trimmed(skb, transport_len, + ip_mc_validate_checksum); + if (!skb_chk) + return -EINVAL; + + if (!pskb_may_pull(skb_chk, len)) { + kfree_skb(skb_chk); + return -EINVAL; + } + + ret = ip_mc_check_igmp_msg(skb_chk); + if (ret) { + kfree_skb(skb_chk); + return ret; + } + + if (skb_trimmed) + *skb_trimmed = skb_chk; + else + kfree_skb(skb_chk); + + return 0; +} + +/** + * ip_mc_check_igmp - checks whether this is a sane IGMP packet + * @skb: the skb to validate + * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional) + * + * Checks whether an IPv4 packet is a valid IGMP packet. If so sets + * skb network and transport headers accordingly and returns zero. + * + * -EINVAL: A broken packet was detected, i.e. it violates some internet + * standard + * -ENOMSG: IP header validation succeeded but it is not an IGMP packet. + * -ENOMEM: A memory allocation failure happened. + * + * Optionally, an skb pointer might be provided via skb_trimmed (or set it + * to NULL): After parsing an IGMP packet successfully it will point to + * an skb which has its tail aligned to the IP packet end. This might + * either be the originally provided skb or a trimmed, cloned version if + * the skb frame had data beyond the IP packet. A cloned skb allows us + * to leave the original skb and its full frame unchanged (which might be + * desirable for layer 2 frame jugglers). + * + * The caller needs to release a reference count from any returned skb_trimmed. + */ +int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) +{ + int ret = ip_mc_check_iphdr(skb); + + if (ret < 0) + return ret; + + if (ip_hdr(skb)->protocol != IPPROTO_IGMP) + return -ENOMSG; + + return __ip_mc_check_igmp(skb, skb_trimmed); +} +EXPORT_SYMBOL(ip_mc_check_igmp); + /* * Resend IGMP JOIN report; used by netdev notifier. */ diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 2e8c06108ab9..0f3f1999719a 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -48,4 +48,5 @@ obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o ifneq ($(CONFIG_IPV6),) obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o +obj-y += mcast_snoop.o endif diff --git a/net/ipv6/mcast_snoop.c b/net/ipv6/mcast_snoop.c new file mode 100644 index 000000000000..1a2cbc13a7d3 --- /dev/null +++ b/net/ipv6/mcast_snoop.c @@ -0,0 +1,213 @@ +/* Copyright (C) 2010: YOSHIFUJI Hideaki + * Copyright (C) 2015: Linus Lüssing + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + * + * Based on the MLD support added to br_multicast.c by YOSHIFUJI Hideaki. + */ + +#include +#include +#include +#include +#include + +static int ipv6_mc_check_ip6hdr(struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h; + unsigned int len; + unsigned int offset = skb_network_offset(skb) + sizeof(*ip6h); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + ip6h = ipv6_hdr(skb); + + if (ip6h->version != 6) + return -EINVAL; + + len = offset + ntohs(ip6h->payload_len); + if (skb->len < len || len <= offset) + return -EINVAL; + + return 0; +} + +static int ipv6_mc_check_exthdrs(struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h; + unsigned int offset; + u8 nexthdr; + __be16 frag_off; + + ip6h = ipv6_hdr(skb); + + if (ip6h->nexthdr != IPPROTO_HOPOPTS) + return -ENOMSG; + + nexthdr = ip6h->nexthdr; + offset = skb_network_offset(skb) + sizeof(*ip6h); + offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); + + if (offset < 0) + return -EINVAL; + + if (nexthdr != IPPROTO_ICMPV6) + return -ENOMSG; + + skb_set_transport_header(skb, offset); + + return 0; +} + +static int ipv6_mc_check_mld_reportv2(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct mld2_report); + + return pskb_may_pull(skb, len) ? 0 : -EINVAL; +} + +static int ipv6_mc_check_mld_query(struct sk_buff *skb) +{ + struct mld_msg *mld; + unsigned int len = skb_transport_offset(skb); + + /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */ + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) + return -EINVAL; + + len += sizeof(struct mld_msg); + if (skb->len < len) + return -EINVAL; + + /* MLDv1? */ + if (skb->len != len) { + /* or MLDv2? */ + len += sizeof(struct mld2_query) - sizeof(struct mld_msg); + if (skb->len < len || !pskb_may_pull(skb, len)) + return -EINVAL; + } + + mld = (struct mld_msg *)skb_transport_header(skb); + + /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer + * all-nodes destination address (ff02::1) for general queries + */ + if (ipv6_addr_any(&mld->mld_mca) && + !ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) + return -EINVAL; + + return 0; +} + +static int ipv6_mc_check_mld_msg(struct sk_buff *skb) +{ + struct mld_msg *mld = (struct mld_msg *)skb_transport_header(skb); + + switch (mld->mld_type) { + case ICMPV6_MGM_REDUCTION: + case ICMPV6_MGM_REPORT: + /* fall through */ + return 0; + case ICMPV6_MLD2_REPORT: + return ipv6_mc_check_mld_reportv2(skb); + case ICMPV6_MGM_QUERY: + return ipv6_mc_check_mld_query(skb); + default: + return -ENOMSG; + } +} + +static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb) +{ + return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo); +} + +static int __ipv6_mc_check_mld(struct sk_buff *skb, + struct sk_buff **skb_trimmed) + +{ + struct sk_buff *skb_chk = NULL; + unsigned int transport_len; + unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg); + int ret; + + transport_len = ntohs(ipv6_hdr(skb)->payload_len); + transport_len -= skb_transport_offset(skb) - sizeof(struct ipv6hdr); + + skb_get(skb); + skb_chk = skb_checksum_trimmed(skb, transport_len, + ipv6_mc_validate_checksum); + if (!skb_chk) + return -EINVAL; + + if (!pskb_may_pull(skb_chk, len)) { + kfree_skb(skb_chk); + return -EINVAL; + } + + ret = ipv6_mc_check_mld_msg(skb_chk); + if (ret) { + kfree_skb(skb_chk); + return ret; + } + + if (skb_trimmed) + *skb_trimmed = skb_chk; + else + kfree_skb(skb_chk); + + return 0; +} + +/** + * ipv6_mc_check_mld - checks whether this is a sane MLD packet + * @skb: the skb to validate + * @skb_trimmed: to store an skb pointer trimmed to IPv6 packet tail (optional) + * + * Checks whether an IPv6 packet is a valid MLD packet. If so sets + * skb network and transport headers accordingly and returns zero. + * + * -EINVAL: A broken packet was detected, i.e. it violates some internet + * standard + * -ENOMSG: IP header validation succeeded but it is not an MLD packet. + * -ENOMEM: A memory allocation failure happened. + * + * Optionally, an skb pointer might be provided via skb_trimmed (or set it + * to NULL): After parsing an MLD packet successfully it will point to + * an skb which has its tail aligned to the IP packet end. This might + * either be the originally provided skb or a trimmed, cloned version if + * the skb frame had data beyond the IP packet. A cloned skb allows us + * to leave the original skb and its full frame unchanged (which might be + * desirable for layer 2 frame jugglers). + * + * The caller needs to release a reference count from any returned skb_trimmed. + */ +int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed) +{ + int ret; + + ret = ipv6_mc_check_ip6hdr(skb); + if (ret < 0) + return ret; + + ret = ipv6_mc_check_exthdrs(skb); + if (ret < 0) + return ret; + + return __ipv6_mc_check_mld(skb, skb_trimmed); +} +EXPORT_SYMBOL(ipv6_mc_check_mld); -- cgit v1.2.3 From 6cd9e9f629f11b9412d4e9aa294c029dbb36b3cf Mon Sep 17 00:00:00 2001 From: Kapileshwar Singh Date: Wed, 18 Feb 2015 16:04:21 +0000 Subject: thermal: of: fix cooling device weights in device tree Currently you can specify the weight of the cooling device in the device tree but that information is not populated to the thermal_bind_params where the fair share governor expects it to be. The of thermal zone device doesn't have a thermal_bind_params structure and arguably it's better to pass the weight inside the thermal_instance as it is specific to the bind of a cooling device to a thermal zone parameter. Core thermal code is fixed to populate the weight in the instance from the thermal_bind_params, so platform code that was passing the weight inside the thermal_bind_params continue to work seamlessly. While we are at it, create a default value for the weight parameter for those thermal zones that currently don't define it and remove the hardcoded default in of-thermal. Cc: Zhang Rui Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Peter Feuerer Cc: Darren Hart Cc: Eduardo Valentin Cc: Kukjin Kim Cc: Durgadoss R Signed-off-by: Kapileshwar Singh Signed-off-by: Eduardo Valentin --- Documentation/thermal/sysfs-api.txt | 4 +++- drivers/acpi/thermal.c | 9 ++++++--- drivers/platform/x86/acerhdf.c | 3 ++- drivers/thermal/db8500_thermal.c | 2 +- drivers/thermal/fair_share.c | 2 +- drivers/thermal/imx_thermal.c | 3 ++- drivers/thermal/of-thermal.c | 5 +++-- drivers/thermal/thermal_core.c | 22 ++++++++++++++++------ drivers/thermal/thermal_core.h | 1 + drivers/thermal/ti-soc-thermal/ti-thermal-common.c | 3 ++- include/linux/thermal.h | 6 +++++- 11 files changed, 42 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt index 87519cb379ee..7ec632ed9769 100644 --- a/Documentation/thermal/sysfs-api.txt +++ b/Documentation/thermal/sysfs-api.txt @@ -95,7 +95,7 @@ temperature) and throttle appropriate devices. 1.3 interface for binding a thermal zone device with a thermal cooling device 1.3.1 int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz, int trip, struct thermal_cooling_device *cdev, - unsigned long upper, unsigned long lower); + unsigned long upper, unsigned long lower, unsigned int weight); This interface function bind a thermal cooling device to the certain trip point of a thermal zone device. @@ -110,6 +110,8 @@ temperature) and throttle appropriate devices. lower:the Minimum cooling state can be used for this trip point. THERMAL_NO_LIMIT means no lower limit, and the cooling device can be in cooling state 0. + weight: the influence of this cooling device in this thermal + zone. See 1.4.1 below for more information. 1.3.2 int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz, int trip, struct thermal_cooling_device *cdev); diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index d24fa1964eb8..6d4e44ea74ac 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -800,7 +800,8 @@ static int acpi_thermal_cooling_device_cb(struct thermal_zone_device *thermal, result = thermal_zone_bind_cooling_device (thermal, trip, cdev, - THERMAL_NO_LIMIT, THERMAL_NO_LIMIT); + THERMAL_NO_LIMIT, THERMAL_NO_LIMIT, + THERMAL_WEIGHT_DEFAULT); else result = thermal_zone_unbind_cooling_device @@ -824,7 +825,8 @@ static int acpi_thermal_cooling_device_cb(struct thermal_zone_device *thermal, if (bind) result = thermal_zone_bind_cooling_device (thermal, trip, cdev, - THERMAL_NO_LIMIT, THERMAL_NO_LIMIT); + THERMAL_NO_LIMIT, THERMAL_NO_LIMIT, + THERMAL_WEIGHT_DEFAULT); else result = thermal_zone_unbind_cooling_device (thermal, trip, cdev); @@ -841,7 +843,8 @@ static int acpi_thermal_cooling_device_cb(struct thermal_zone_device *thermal, result = thermal_zone_bind_cooling_device (thermal, THERMAL_TRIPS_NONE, cdev, THERMAL_NO_LIMIT, - THERMAL_NO_LIMIT); + THERMAL_NO_LIMIT, + THERMAL_WEIGHT_DEFAULT); else result = thermal_zone_unbind_cooling_device (thermal, THERMAL_TRIPS_NONE, diff --git a/drivers/platform/x86/acerhdf.c b/drivers/platform/x86/acerhdf.c index 594c918b553d..1ef02daddb60 100644 --- a/drivers/platform/x86/acerhdf.c +++ b/drivers/platform/x86/acerhdf.c @@ -372,7 +372,8 @@ static int acerhdf_bind(struct thermal_zone_device *thermal, return 0; if (thermal_zone_bind_cooling_device(thermal, 0, cdev, - THERMAL_NO_LIMIT, THERMAL_NO_LIMIT)) { + THERMAL_NO_LIMIT, THERMAL_NO_LIMIT, + THERMAL_WEIGHT_DEFAULT)) { pr_err("error binding cooling dev\n"); return -EINVAL; } diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c index 20adfbe27df1..2fb273c4baa9 100644 --- a/drivers/thermal/db8500_thermal.c +++ b/drivers/thermal/db8500_thermal.c @@ -76,7 +76,7 @@ static int db8500_cdev_bind(struct thermal_zone_device *thermal, upper = lower = i > max_state ? max_state : i; ret = thermal_zone_bind_cooling_device(thermal, i, cdev, - upper, lower); + upper, lower, THERMAL_WEIGHT_DEFAULT); dev_info(&cdev->device, "%s bind to %d: %d-%s\n", cdev->type, i, ret, ret ? "fail" : "succeed"); diff --git a/drivers/thermal/fair_share.c b/drivers/thermal/fair_share.c index 6e0a3fbfae86..c3b25187b467 100644 --- a/drivers/thermal/fair_share.c +++ b/drivers/thermal/fair_share.c @@ -109,7 +109,7 @@ static int fair_share_throttle(struct thermal_zone_device *tz, int trip) continue; instance->target = get_target_state(tz, cdev, - tzp->tbp[i].weight, cur_trip_level); + instance->weight, cur_trip_level); instance->cdev->updated = false; thermal_cdev_update(cdev); diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c index 2ccbc0788353..fde4c2876d14 100644 --- a/drivers/thermal/imx_thermal.c +++ b/drivers/thermal/imx_thermal.c @@ -306,7 +306,8 @@ static int imx_bind(struct thermal_zone_device *tz, ret = thermal_zone_bind_cooling_device(tz, IMX_TRIP_PASSIVE, cdev, THERMAL_NO_LIMIT, - THERMAL_NO_LIMIT); + THERMAL_NO_LIMIT, + THERMAL_WEIGHT_DEFAULT); if (ret) { dev_err(&tz->device, "binding zone %s with cdev %s failed:%d\n", diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c index 668fb1bdea9e..c606b85ea9f4 100644 --- a/drivers/thermal/of-thermal.c +++ b/drivers/thermal/of-thermal.c @@ -227,7 +227,8 @@ static int of_thermal_bind(struct thermal_zone_device *thermal, ret = thermal_zone_bind_cooling_device(thermal, tbp->trip_id, cdev, tbp->max, - tbp->min); + tbp->min, + tbp->usage); if (ret) return ret; } @@ -581,7 +582,7 @@ static int thermal_of_populate_bind_params(struct device_node *np, u32 prop; /* Default weight. Usage is optional */ - __tbp->usage = 0; + __tbp->usage = THERMAL_WEIGHT_DEFAULT; ret = of_property_read_u32(np, "contribution", &prop); if (ret == 0) __tbp->usage = prop; diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 4108db7e10c1..a6cb9b78b629 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -218,7 +218,8 @@ static void print_bind_err_msg(struct thermal_zone_device *tz, static void __bind(struct thermal_zone_device *tz, int mask, struct thermal_cooling_device *cdev, - unsigned long *limits) + unsigned long *limits, + unsigned int weight) { int i, ret; @@ -233,7 +234,8 @@ static void __bind(struct thermal_zone_device *tz, int mask, upper = limits[i * 2 + 1]; } ret = thermal_zone_bind_cooling_device(tz, i, cdev, - upper, lower); + upper, lower, + weight); if (ret) print_bind_err_msg(tz, cdev, ret); } @@ -280,7 +282,8 @@ static void bind_cdev(struct thermal_cooling_device *cdev) continue; tzp->tbp[i].cdev = cdev; __bind(pos, tzp->tbp[i].trip_mask, cdev, - tzp->tbp[i].binding_limits); + tzp->tbp[i].binding_limits, + tzp->tbp[i].weight); } } @@ -319,7 +322,8 @@ static void bind_tz(struct thermal_zone_device *tz) continue; tzp->tbp[i].cdev = pos; __bind(tz, tzp->tbp[i].trip_mask, pos, - tzp->tbp[i].binding_limits); + tzp->tbp[i].binding_limits, + tzp->tbp[i].weight); } } exit: @@ -713,7 +717,8 @@ passive_store(struct device *dev, struct device_attribute *attr, thermal_zone_bind_cooling_device(tz, THERMAL_TRIPS_NONE, cdev, THERMAL_NO_LIMIT, - THERMAL_NO_LIMIT); + THERMAL_NO_LIMIT, + THERMAL_WEIGHT_DEFAULT); } mutex_unlock(&thermal_list_lock); if (!tz->passive_delay) @@ -931,6 +936,9 @@ static const struct attribute_group *cooling_device_attr_groups[] = { * @lower: the Minimum cooling state can be used for this trip point. * THERMAL_NO_LIMIT means no lower limit, * and the cooling device can be in cooling state 0. + * @weight: The weight of the cooling device to be bound to the + * thermal zone. Use THERMAL_WEIGHT_DEFAULT for the + * default value * * This interface function bind a thermal cooling device to the certain trip * point of a thermal zone device. @@ -941,7 +949,8 @@ static const struct attribute_group *cooling_device_attr_groups[] = { int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz, int trip, struct thermal_cooling_device *cdev, - unsigned long upper, unsigned long lower) + unsigned long upper, unsigned long lower, + unsigned int weight) { struct thermal_instance *dev; struct thermal_instance *pos; @@ -986,6 +995,7 @@ int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz, dev->upper = upper; dev->lower = lower; dev->target = THERMAL_NO_TARGET; + dev->weight = weight; result = get_idr(&tz->idr, &tz->lock, &dev->id); if (result) diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 0531c752fbbb..7a465e9d456c 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -48,6 +48,7 @@ struct thermal_instance { struct device_attribute attr; struct list_head tz_node; /* node in tz->thermal_instances */ struct list_head cdev_node; /* node in cdev->thermal_instances */ + unsigned int weight; /* The weight of the cooling device */ }; int thermal_register_governor(struct thermal_governor *); diff --git a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c index a38c1756442a..cb45e729adb5 100644 --- a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c +++ b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c @@ -146,7 +146,8 @@ static int ti_thermal_bind(struct thermal_zone_device *thermal, return thermal_zone_bind_cooling_device(thermal, 0, cdev, /* bind with min and max states defined by cpu_cooling */ THERMAL_NO_LIMIT, - THERMAL_NO_LIMIT); + THERMAL_NO_LIMIT, + THERMAL_WEIGHT_DEFAULT); } /* Unbind callback functions for thermal zone */ diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 5eac316490ea..00dacd4dfdce 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -40,6 +40,9 @@ /* No upper/lower limit requirement */ #define THERMAL_NO_LIMIT ((u32)~0) +/* Default weight of a bound cooling device */ +#define THERMAL_WEIGHT_DEFAULT 0 + /* Unit conversion macros */ #define KELVIN_TO_CELSIUS(t) (long)(((long)t-2732 >= 0) ? \ ((long)t-2732+5)/10 : ((long)t-2732-5)/10) @@ -323,7 +326,8 @@ void thermal_zone_device_unregister(struct thermal_zone_device *); int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *, - unsigned long, unsigned long); + unsigned long, unsigned long, + unsigned int); int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *); void thermal_zone_device_update(struct thermal_zone_device *); -- cgit v1.2.3 From bcdcbbc71125c37195f97314f453ca9a3a4eb758 Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Wed, 18 Feb 2015 16:04:25 +0000 Subject: thermal: fair_share: generalize the weight concept The fair share governor has the concept of weights, which is the influence of each cooling device in a thermal zone. The current implementation forces the weights of all cooling devices in a thermal zone to add up to a 100. This complicates setups, as you need to know in advance how many cooling devices you are going to have. If you bind a new cooling device, you have to modify all the other cooling devices weights, which is error prone. Furthermore, you can't specify a "default" weight for platforms since that default value depends on the number of cooling devices in the platform. This patch generalizes the concept of weight by allowing any number to be a "weight". Weights are now relative to each other. Platforms that don't specify weights get the same default value for all their cooling devices, so all their cdevs are considered to be equally influential. It's important to note that previous users of the weights don't need to alter the code: percentages continue to work as they used to. This patch just removes the constraint of all the weights in a thermal zone having to add up to a 100. If they do, you get the same behavior as before. If they don't, fair share now works for that platform. Cc: Zhang Rui Cc: Eduardo Valentin Cc: Durgadoss R Acked-by: Durgadoss R Signed-off-by: Javi Merino Signed-off-by: Eduardo Valentin --- Documentation/thermal/sysfs-api.txt | 12 +++++++++--- drivers/thermal/fair_share.c | 26 +++++++++++++++++++++----- include/linux/thermal.h | 9 ++++++--- 3 files changed, 36 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt index 3625453ceef6..fc7dfe10778b 100644 --- a/Documentation/thermal/sysfs-api.txt +++ b/Documentation/thermal/sysfs-api.txt @@ -129,9 +129,15 @@ temperature) and throttle appropriate devices. This structure defines the following parameters that are used to bind a zone with a cooling device for a particular trip point. .cdev: The cooling device pointer - .weight: The 'influence' of a particular cooling device on this zone. - This is on a percentage scale. The sum of all these weights - (for a particular zone) cannot exceed 100. + .weight: The 'influence' of a particular cooling device on this + zone. This is relative to the rest of the cooling + devices. For example, if all cooling devices have a + weight of 1, then they all contribute the same. You can + use percentages if you want, but it's not mandatory. A + weight of 0 means that this cooling device doesn't + contribute to the cooling of this zone unless all cooling + devices have a weight of 0. If all weights are 0, then + they all contribute the same. .trip_mask:This is a bit mask that gives the binding relation between this thermal zone and cdev, for a particular trip point. If nth bit is set, then the cdev and thermal zone are bound diff --git a/drivers/thermal/fair_share.c b/drivers/thermal/fair_share.c index 692f4053f08b..c2c10bbe24d6 100644 --- a/drivers/thermal/fair_share.c +++ b/drivers/thermal/fair_share.c @@ -59,13 +59,13 @@ static int get_trip_level(struct thermal_zone_device *tz) } static long get_target_state(struct thermal_zone_device *tz, - struct thermal_cooling_device *cdev, int weight, int level) + struct thermal_cooling_device *cdev, int percentage, int level) { unsigned long max_state; cdev->ops->get_max_state(cdev, &max_state); - return (long)(weight * level * max_state) / (100 * tz->trips); + return (long)(percentage * level * max_state) / (100 * tz->trips); } /** @@ -77,7 +77,7 @@ static long get_target_state(struct thermal_zone_device *tz, * * Parameters used for Throttling: * P1. max_state: Maximum throttle state exposed by the cooling device. - * P2. weight[i]/100: + * P2. percentage[i]/100: * How 'effective' the 'i'th device is, in cooling the given zone. * P3. cur_trip_level/max_no_of_trips: * This describes the extent to which the devices should be throttled. @@ -89,16 +89,32 @@ static long get_target_state(struct thermal_zone_device *tz, static int fair_share_throttle(struct thermal_zone_device *tz, int trip) { struct thermal_instance *instance; + int total_weight = 0; + int total_instance = 0; int cur_trip_level = get_trip_level(tz); list_for_each_entry(instance, &tz->thermal_instances, tz_node) { + if (instance->trip != trip) + continue; + + total_weight += instance->weight; + total_instance++; + } + + list_for_each_entry(instance, &tz->thermal_instances, tz_node) { + int percentage; struct thermal_cooling_device *cdev = instance->cdev; if (instance->trip != trip) continue; - instance->target = get_target_state(tz, cdev, - instance->weight, cur_trip_level); + if (!total_weight) + percentage = 100 / total_instance; + else + percentage = (instance->weight * 100) / total_weight; + + instance->target = get_target_state(tz, cdev, percentage, + cur_trip_level); instance->cdev->updated = false; thermal_cdev_update(cdev); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 00dacd4dfdce..bac0f52c7a1e 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -217,9 +217,12 @@ struct thermal_bind_params { /* * This is a measure of 'how effectively these devices can - * cool 'this' thermal zone. The shall be determined by platform - * characterization. This is on a 'percentage' scale. - * See Documentation/thermal/sysfs-api.txt for more information. + * cool 'this' thermal zone. It shall be determined by + * platform characterization. This value is relative to the + * rest of the weights so a cooling device whose weight is + * double that of another cooling device is twice as + * effective. See Documentation/thermal/sysfs-api.txt for more + * information. */ int weight; -- cgit v1.2.3 From e33df1d2f3a0141cd79e770f31999ba0dd7ebfa8 Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Thu, 26 Feb 2015 19:00:27 +0000 Subject: thermal: let governors have private data for each thermal zone A governor may need to store its current state between calls to throttle(). That state depends on the thermal zone, so store it as private data in struct thermal_zone_device. The governors may have two new ops: bind_to_tz() and unbind_from_tz(). When provided, these functions let governors do some initialization and teardown when they are bound/unbound to a tz and possibly store that information in the governor_data field of the struct thermal_zone_device. Cc: Zhang Rui Cc: Eduardo Valentin Signed-off-by: Javi Merino Signed-off-by: Eduardo Valentin --- drivers/thermal/thermal_core.c | 83 ++++++++++++++++++++++++++++++++++++++---- include/linux/thermal.h | 9 +++++ 2 files changed, 84 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 605d6919c1b6..be62b1622ed3 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -75,6 +75,58 @@ static struct thermal_governor *__find_governor(const char *name) return NULL; } +/** + * bind_previous_governor() - bind the previous governor of the thermal zone + * @tz: a valid pointer to a struct thermal_zone_device + * @failed_gov_name: the name of the governor that failed to register + * + * Register the previous governor of the thermal zone after a new + * governor has failed to be bound. + */ +static void bind_previous_governor(struct thermal_zone_device *tz, + const char *failed_gov_name) +{ + if (tz->governor && tz->governor->bind_to_tz) { + if (tz->governor->bind_to_tz(tz)) { + dev_err(&tz->device, + "governor %s failed to bind and the previous one (%s) failed to bind again, thermal zone %s has no governor\n", + failed_gov_name, tz->governor->name, tz->type); + tz->governor = NULL; + } + } +} + +/** + * thermal_set_governor() - Switch to another governor + * @tz: a valid pointer to a struct thermal_zone_device + * @new_gov: pointer to the new governor + * + * Change the governor of thermal zone @tz. + * + * Return: 0 on success, an error if the new governor's bind_to_tz() failed. + */ +static int thermal_set_governor(struct thermal_zone_device *tz, + struct thermal_governor *new_gov) +{ + int ret = 0; + + if (tz->governor && tz->governor->unbind_from_tz) + tz->governor->unbind_from_tz(tz); + + if (new_gov && new_gov->bind_to_tz) { + ret = new_gov->bind_to_tz(tz); + if (ret) { + bind_previous_governor(tz, new_gov->name); + + return ret; + } + } + + tz->governor = new_gov; + + return ret; +} + int thermal_register_governor(struct thermal_governor *governor) { int err; @@ -107,8 +159,15 @@ int thermal_register_governor(struct thermal_governor *governor) name = pos->tzp->governor_name; - if (!strncasecmp(name, governor->name, THERMAL_NAME_LENGTH)) - pos->governor = governor; + if (!strncasecmp(name, governor->name, THERMAL_NAME_LENGTH)) { + int ret; + + ret = thermal_set_governor(pos, governor); + if (ret) + dev_err(&pos->device, + "Failed to set governor %s for thermal zone %s: %d\n", + governor->name, pos->type, ret); + } } mutex_unlock(&thermal_list_lock); @@ -134,7 +193,7 @@ void thermal_unregister_governor(struct thermal_governor *governor) list_for_each_entry(pos, &thermal_tz_list, node) { if (!strncasecmp(pos->governor->name, governor->name, THERMAL_NAME_LENGTH)) - pos->governor = NULL; + thermal_set_governor(pos, NULL); } mutex_unlock(&thermal_list_lock); @@ -770,8 +829,9 @@ policy_store(struct device *dev, struct device_attribute *attr, if (!gov) goto exit; - tz->governor = gov; - ret = count; + ret = thermal_set_governor(tz, gov); + if (!ret) + ret = count; exit: mutex_unlock(&tz->lock); @@ -1512,6 +1572,7 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type, int result; int count; int passive = 0; + struct thermal_governor *governor; if (type && strlen(type) >= THERMAL_NAME_LENGTH) return ERR_PTR(-EINVAL); @@ -1602,9 +1663,15 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type, mutex_lock(&thermal_governor_lock); if (tz->tzp) - tz->governor = __find_governor(tz->tzp->governor_name); + governor = __find_governor(tz->tzp->governor_name); else - tz->governor = def_governor; + governor = def_governor; + + result = thermal_set_governor(tz, governor); + if (result) { + mutex_unlock(&thermal_governor_lock); + goto unregister; + } mutex_unlock(&thermal_governor_lock); @@ -1693,7 +1760,7 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz) device_remove_file(&tz->device, &dev_attr_mode); device_remove_file(&tz->device, &dev_attr_policy); remove_trip_attrs(tz); - tz->governor = NULL; + thermal_set_governor(tz, NULL); thermal_remove_hwmon_sysfs(tz); release_idr(&thermal_tz_idr, &thermal_idr_lock, tz->id); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index bac0f52c7a1e..edf9d53c67e6 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -165,6 +165,7 @@ struct thermal_attr { * @ops: operations this &thermal_zone_device supports * @tzp: thermal zone parameters * @governor: pointer to the governor for this thermal zone + * @governor_data: private pointer for governor data * @thermal_instances: list of &struct thermal_instance of this thermal zone * @idr: &struct idr to generate unique id for this zone's cooling * devices @@ -191,6 +192,7 @@ struct thermal_zone_device { struct thermal_zone_device_ops *ops; const struct thermal_zone_params *tzp; struct thermal_governor *governor; + void *governor_data; struct list_head thermal_instances; struct idr idr; struct mutex lock; @@ -201,12 +203,19 @@ struct thermal_zone_device { /** * struct thermal_governor - structure that holds thermal governor information * @name: name of the governor + * @bind_to_tz: callback called when binding to a thermal zone. If it + * returns 0, the governor is bound to the thermal zone, + * otherwise it fails. + * @unbind_from_tz: callback called when a governor is unbound from a + * thermal zone. * @throttle: callback called for every trip point even if temperature is * below the trip point temperature * @governor_list: node in thermal_governor_list (in thermal_core.c) */ struct thermal_governor { char name[THERMAL_NAME_LENGTH]; + int (*bind_to_tz)(struct thermal_zone_device *tz); + void (*unbind_from_tz)(struct thermal_zone_device *tz); int (*throttle)(struct thermal_zone_device *tz, int trip); struct list_head governor_list; }; -- cgit v1.2.3 From 35b11d2e3a66279a477e36cefb2603806295b8ce Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Thu, 26 Feb 2015 19:00:28 +0000 Subject: thermal: extend the cooling device API to include power information Add three optional callbacks to the cooling device interface to allow them to express power. In addition to the callbacks, add helpers to identify cooling devices that implement the power cooling device API. Cc: Zhang Rui Cc: Eduardo Valentin Signed-off-by: Javi Merino Signed-off-by: Eduardo Valentin --- drivers/thermal/thermal_core.c | 52 ++++++++++++++++++++++++++++++++++++++++++ include/linux/thermal.h | 25 ++++++++++++++++++++ 2 files changed, 77 insertions(+) (limited to 'include/linux') diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index be62b1622ed3..263628b0e862 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -875,6 +875,58 @@ emul_temp_store(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR(emul_temp, S_IWUSR, NULL, emul_temp_store); #endif/*CONFIG_THERMAL_EMULATION*/ +/** + * power_actor_get_max_power() - get the maximum power that a cdev can consume + * @cdev: pointer to &thermal_cooling_device + * @tz: a valid thermal zone device pointer + * @max_power: pointer in which to store the maximum power + * + * Calculate the maximum power consumption in milliwats that the + * cooling device can currently consume and store it in @max_power. + * + * Return: 0 on success, -EINVAL if @cdev doesn't support the + * power_actor API or -E* on other error. + */ +int power_actor_get_max_power(struct thermal_cooling_device *cdev, + struct thermal_zone_device *tz, u32 *max_power) +{ + if (!cdev_is_power_actor(cdev)) + return -EINVAL; + + return cdev->ops->state2power(cdev, tz, 0, max_power); +} + +/** + * power_actor_set_power() - limit the maximum power that a cooling device can consume + * @cdev: pointer to &thermal_cooling_device + * @instance: thermal instance to update + * @power: the power in milliwatts + * + * Set the cooling device to consume at most @power milliwatts. + * + * Return: 0 on success, -EINVAL if the cooling device does not + * implement the power actor API or -E* for other failures. + */ +int power_actor_set_power(struct thermal_cooling_device *cdev, + struct thermal_instance *instance, u32 power) +{ + unsigned long state; + int ret; + + if (!cdev_is_power_actor(cdev)) + return -EINVAL; + + ret = cdev->ops->power2state(cdev, instance->tz, power, &state); + if (ret) + return ret; + + instance->target = state; + cdev->updated = false; + thermal_cdev_update(cdev); + + return 0; +} + static DEVICE_ATTR(type, 0444, type_show, NULL); static DEVICE_ATTR(temp, 0444, temp_show, NULL); static DEVICE_ATTR(mode, 0644, mode_show, mode_store); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index edf9d53c67e6..bf3c55f405c2 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -63,6 +63,7 @@ struct thermal_zone_device; struct thermal_cooling_device; +struct thermal_instance; enum thermal_device_mode { THERMAL_DEVICE_DISABLED = 0, @@ -116,6 +117,12 @@ struct thermal_cooling_device_ops { int (*get_max_state) (struct thermal_cooling_device *, unsigned long *); int (*get_cur_state) (struct thermal_cooling_device *, unsigned long *); int (*set_cur_state) (struct thermal_cooling_device *, unsigned long); + int (*get_requested_power)(struct thermal_cooling_device *, + struct thermal_zone_device *, u32 *); + int (*state2power)(struct thermal_cooling_device *, + struct thermal_zone_device *, unsigned long, u32 *); + int (*power2state)(struct thermal_cooling_device *, + struct thermal_zone_device *, u32, unsigned long *); }; struct thermal_cooling_device { @@ -331,6 +338,16 @@ void thermal_zone_of_sensor_unregister(struct device *dev, #endif #if IS_ENABLED(CONFIG_THERMAL) +static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev) +{ + return cdev->ops->get_requested_power && cdev->ops->state2power && + cdev->ops->power2state; +} + +int power_actor_get_max_power(struct thermal_cooling_device *, + struct thermal_zone_device *tz, u32 *max_power); +int power_actor_set_power(struct thermal_cooling_device *, + struct thermal_instance *, u32); struct thermal_zone_device *thermal_zone_device_register(const char *, int, int, void *, struct thermal_zone_device_ops *, const struct thermal_zone_params *, int, int); @@ -359,6 +376,14 @@ struct thermal_instance *get_thermal_instance(struct thermal_zone_device *, void thermal_cdev_update(struct thermal_cooling_device *); void thermal_notify_framework(struct thermal_zone_device *, int); #else +static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev) +{ return false; } +static inline int power_actor_get_max_power(struct thermal_cooling_device *cdev, + struct thermal_zone_device *tz, u32 *max_power) +{ return 0; } +static inline int power_actor_set_power(struct thermal_cooling_device *cdev, + struct thermal_instance *tz, u32 power) +{ return 0; } static inline struct thermal_zone_device *thermal_zone_device_register( const char *type, int trips, int mask, void *devdata, struct thermal_zone_device_ops *ops, -- cgit v1.2.3 From c36cf07176316fbe6a4bdbc23afcb0cbf7822bf2 Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Thu, 26 Feb 2015 19:00:29 +0000 Subject: thermal: cpu_cooling: implement the power cooling device API Add a basic power model to the cpu cooling device to implement the power cooling device API. The power model uses the current frequency, current load and OPPs for the power calculations. The cpus must have registered their OPPs using the OPP library. Cc: Zhang Rui Cc: Eduardo Valentin Signed-off-by: Kapileshwar Singh Signed-off-by: Punit Agrawal Signed-off-by: Javi Merino Signed-off-by: Eduardo Valentin --- Documentation/thermal/cpu-cooling-api.txt | 156 +++++++- drivers/thermal/cpu_cooling.c | 583 +++++++++++++++++++++++++++++- include/linux/cpu_cooling.h | 39 ++ 3 files changed, 760 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/Documentation/thermal/cpu-cooling-api.txt b/Documentation/thermal/cpu-cooling-api.txt index 753e47cc2e20..71653584cd03 100644 --- a/Documentation/thermal/cpu-cooling-api.txt +++ b/Documentation/thermal/cpu-cooling-api.txt @@ -36,8 +36,162 @@ the user. The registration APIs returns the cooling device pointer. np: pointer to the cooling device device tree node clip_cpus: cpumask of cpus where the frequency constraints will happen. -1.1.3 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) +1.1.3 struct thermal_cooling_device *cpufreq_power_cooling_register( + const struct cpumask *clip_cpus, u32 capacitance, + get_static_t plat_static_func) + +Similar to cpufreq_cooling_register, this function registers a cpufreq +cooling device. Using this function, the cooling device will +implement the power extensions by using a simple cpu power model. The +cpus must have registered their OPPs using the OPP library. + +The additional parameters are needed for the power model (See 2. Power +models). "capacitance" is the dynamic power coefficient (See 2.1 +Dynamic power). "plat_static_func" is a function to calculate the +static power consumed by these cpus (See 2.2 Static power). + +1.1.4 struct thermal_cooling_device *of_cpufreq_power_cooling_register( + struct device_node *np, const struct cpumask *clip_cpus, u32 capacitance, + get_static_t plat_static_func) + +Similar to cpufreq_power_cooling_register, this function register a +cpufreq cooling device with power extensions using the device tree +information supplied by the np parameter. + +1.1.5 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) This interface function unregisters the "thermal-cpufreq-%x" cooling device. cdev: Cooling device pointer which has to be unregistered. + +2. Power models + +The power API registration functions provide a simple power model for +CPUs. The current power is calculated as dynamic + (optionally) +static power. This power model requires that the operating-points of +the CPUs are registered using the kernel's opp library and the +`cpufreq_frequency_table` is assigned to the `struct device` of the +cpu. If you are using CONFIG_CPUFREQ_DT then the +`cpufreq_frequency_table` should already be assigned to the cpu +device. + +The `plat_static_func` parameter of `cpufreq_power_cooling_register()` +and `of_cpufreq_power_cooling_register()` is optional. If you don't +provide it, only dynamic power will be considered. + +2.1 Dynamic power + +The dynamic power consumption of a processor depends on many factors. +For a given processor implementation the primary factors are: + +- The time the processor spends running, consuming dynamic power, as + compared to the time in idle states where dynamic consumption is + negligible. Herein we refer to this as 'utilisation'. +- The voltage and frequency levels as a result of DVFS. The DVFS + level is a dominant factor governing power consumption. +- In running time the 'execution' behaviour (instruction types, memory + access patterns and so forth) causes, in most cases, a second order + variation. In pathological cases this variation can be significant, + but typically it is of a much lesser impact than the factors above. + +A high level dynamic power consumption model may then be represented as: + +Pdyn = f(run) * Voltage^2 * Frequency * Utilisation + +f(run) here represents the described execution behaviour and its +result has a units of Watts/Hz/Volt^2 (this often expressed in +mW/MHz/uVolt^2) + +The detailed behaviour for f(run) could be modelled on-line. However, +in practice, such an on-line model has dependencies on a number of +implementation specific processor support and characterisation +factors. Therefore, in initial implementation that contribution is +represented as a constant coefficient. This is a simplification +consistent with the relative contribution to overall power variation. + +In this simplified representation our model becomes: + +Pdyn = Capacitance * Voltage^2 * Frequency * Utilisation + +Where `capacitance` is a constant that represents an indicative +running time dynamic power coefficient in fundamental units of +mW/MHz/uVolt^2. Typical values for mobile CPUs might lie in range +from 100 to 500. For reference, the approximate values for the SoC in +ARM's Juno Development Platform are 530 for the Cortex-A57 cluster and +140 for the Cortex-A53 cluster. + + +2.2 Static power + +Static leakage power consumption depends on a number of factors. For a +given circuit implementation the primary factors are: + +- Time the circuit spends in each 'power state' +- Temperature +- Operating voltage +- Process grade + +The time the circuit spends in each 'power state' for a given +evaluation period at first order means OFF or ON. However, +'retention' states can also be supported that reduce power during +inactive periods without loss of context. + +Note: The visibility of state entries to the OS can vary, according to +platform specifics, and this can then impact the accuracy of a model +based on OS state information alone. It might be possible in some +cases to extract more accurate information from system resources. + +The temperature, operating voltage and process 'grade' (slow to fast) +of the circuit are all significant factors in static leakage power +consumption. All of these have complex relationships to static power. + +Circuit implementation specific factors include the chosen silicon +process as well as the type, number and size of transistors in both +the logic gates and any RAM elements included. + +The static power consumption modelling must take into account the +power managed regions that are implemented. Taking the example of an +ARM processor cluster, the modelling would take into account whether +each CPU can be powered OFF separately or if only a single power +region is implemented for the complete cluster. + +In one view, there are others, a static power consumption model can +then start from a set of reference values for each power managed +region (e.g. CPU, Cluster/L2) in each state (e.g. ON, OFF) at an +arbitrary process grade, voltage and temperature point. These values +are then scaled for all of the following: the time in each state, the +process grade, the current temperature and the operating voltage. +However, since both implementation specific and complex relationships +dominate the estimate, the appropriate interface to the model from the +cpu cooling device is to provide a function callback that calculates +the static power in this platform. When registering the cpu cooling +device pass a function pointer that follows the `get_static_t` +prototype: + + int plat_get_static(cpumask_t *cpumask, int interval, + unsigned long voltage, u32 &power); + +`cpumask` is the cpumask of the cpus involved in the calculation. +`voltage` is the voltage at which they are operating. The function +should calculate the average static power for the last `interval` +milliseconds. It returns 0 on success, -E* on error. If it +succeeds, it should store the static power in `power`. Reading the +temperature of the cpus described by `cpumask` is left for +plat_get_static() to do as the platform knows best which thermal +sensor is closest to the cpu. + +If `plat_static_func` is NULL, static power is considered to be +negligible for this platform and only dynamic power is considered. + +The platform specific callback can then use any combination of tables +and/or equations to permute the estimated value. Process grade +information is not passed to the model since access to such data, from +on-chip measurement capability or manufacture time data, is platform +specific. + +Note: the significance of static power for CPUs in comparison to +dynamic power is highly dependent on implementation. Given the +potential complexity in implementation, the importance and accuracy of +its inclusion when using cpu cooling devices should be assessed on a +case by case basis. + diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c index f65f0d109fc8..ba23150c7bde 100644 --- a/drivers/thermal/cpu_cooling.c +++ b/drivers/thermal/cpu_cooling.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +45,19 @@ * ... */ +/** + * struct power_table - frequency to power conversion + * @frequency: frequency in KHz + * @power: power in mW + * + * This structure is built when the cooling device registers and helps + * in translating frequency to power and viceversa. + */ +struct power_table { + u32 frequency; + u32 power; +}; + /** * struct cpufreq_cooling_device - data for cooling device with cpufreq * @id: unique integer value corresponding to each cpufreq_cooling_device @@ -58,6 +72,15 @@ * cpufreq frequencies. * @allowed_cpus: all the cpus involved for this cpufreq_cooling_device. * @node: list_head to link all cpufreq_cooling_device together. + * @last_load: load measured by the latest call to cpufreq_get_actual_power() + * @time_in_idle: previous reading of the absolute time that this cpu was idle + * @time_in_idle_timestamp: wall time of the last invocation of + * get_cpu_idle_time_us() + * @dyn_power_table: array of struct power_table for frequency to power + * conversion, sorted in ascending order. + * @dyn_power_table_entries: number of entries in the @dyn_power_table array + * @cpu_dev: the first cpu_device from @allowed_cpus that has OPPs registered + * @plat_get_static_power: callback to calculate the static power * * This structure is required for keeping information of each registered * cpufreq_cooling_device. @@ -71,6 +94,13 @@ struct cpufreq_cooling_device { unsigned int *freq_table; /* In descending order */ struct cpumask allowed_cpus; struct list_head node; + u32 last_load; + u64 *time_in_idle; + u64 *time_in_idle_timestamp; + struct power_table *dyn_power_table; + int dyn_power_table_entries; + struct device *cpu_dev; + get_static_t plat_get_static_power; }; static DEFINE_IDR(cpufreq_idr); static DEFINE_MUTEX(cooling_cpufreq_lock); @@ -167,6 +197,39 @@ unsigned long cpufreq_cooling_get_level(unsigned int cpu, unsigned int freq) } EXPORT_SYMBOL_GPL(cpufreq_cooling_get_level); +static void update_cpu_device(int cpu) +{ + struct cpufreq_cooling_device *cpufreq_dev; + + mutex_lock(&cooling_cpufreq_lock); + list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) { + if (cpumask_test_cpu(cpu, &cpufreq_dev->allowed_cpus)) { + cpufreq_dev->cpu_dev = get_cpu_device(cpu); + if (!cpufreq_dev->cpu_dev) { + dev_warn(&cpufreq_dev->cool_dev->device, + "No cpu device for new policy cpu %d\n", + cpu); + } + break; + } + } + mutex_unlock(&cooling_cpufreq_lock); +} + +static void remove_cpu_device(int cpu) +{ + struct cpufreq_cooling_device *cpufreq_dev; + + mutex_lock(&cooling_cpufreq_lock); + list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) { + if (cpumask_test_cpu(cpu, &cpufreq_dev->allowed_cpus)) { + cpufreq_dev->cpu_dev = NULL; + break; + } + } + mutex_unlock(&cooling_cpufreq_lock); +} + /** * cpufreq_thermal_notifier - notifier callback for cpufreq policy change. * @nb: struct notifier_block * with callback info. @@ -186,23 +249,240 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb, unsigned long max_freq = 0; struct cpufreq_cooling_device *cpufreq_dev; - if (event != CPUFREQ_ADJUST) - return 0; + switch (event) { - mutex_lock(&cooling_cpufreq_lock); - list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) { - if (!cpumask_test_cpu(policy->cpu, - &cpufreq_dev->allowed_cpus)) + case CPUFREQ_ADJUST: + mutex_lock(&cooling_cpufreq_lock); + list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) { + if (!cpumask_test_cpu(policy->cpu, + &cpufreq_dev->allowed_cpus)) + continue; + + max_freq = cpufreq_dev->cpufreq_val; + + if (policy->max != max_freq) + cpufreq_verify_within_limits(policy, 0, + max_freq); + } + mutex_unlock(&cooling_cpufreq_lock); + break; + + case CPUFREQ_CREATE_POLICY: + update_cpu_device(policy->cpu); + break; + case CPUFREQ_REMOVE_POLICY: + remove_cpu_device(policy->cpu); + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +/** + * build_dyn_power_table() - create a dynamic power to frequency table + * @cpufreq_device: the cpufreq cooling device in which to store the table + * @capacitance: dynamic power coefficient for these cpus + * + * Build a dynamic power to frequency table for this cpu and store it + * in @cpufreq_device. This table will be used in cpu_power_to_freq() and + * cpu_freq_to_power() to convert between power and frequency + * efficiently. Power is stored in mW, frequency in KHz. The + * resulting table is in ascending order. + * + * Return: 0 on success, -E* on error. + */ +static int build_dyn_power_table(struct cpufreq_cooling_device *cpufreq_device, + u32 capacitance) +{ + struct power_table *power_table; + struct dev_pm_opp *opp; + struct device *dev = NULL; + int num_opps = 0, cpu, i, ret = 0; + unsigned long freq; + + rcu_read_lock(); + + for_each_cpu(cpu, &cpufreq_device->allowed_cpus) { + dev = get_cpu_device(cpu); + if (!dev) { + dev_warn(&cpufreq_device->cool_dev->device, + "No cpu device for cpu %d\n", cpu); continue; + } - max_freq = cpufreq_dev->cpufreq_val; + num_opps = dev_pm_opp_get_opp_count(dev); + if (num_opps > 0) { + break; + } else if (num_opps < 0) { + ret = num_opps; + goto unlock; + } + } - if (policy->max != max_freq) - cpufreq_verify_within_limits(policy, 0, max_freq); + if (num_opps == 0) { + ret = -EINVAL; + goto unlock; } - mutex_unlock(&cooling_cpufreq_lock); - return 0; + power_table = kcalloc(num_opps, sizeof(*power_table), GFP_KERNEL); + + for (freq = 0, i = 0; + opp = dev_pm_opp_find_freq_ceil(dev, &freq), !IS_ERR(opp); + freq++, i++) { + u32 freq_mhz, voltage_mv; + u64 power; + + freq_mhz = freq / 1000000; + voltage_mv = dev_pm_opp_get_voltage(opp) / 1000; + + /* + * Do the multiplication with MHz and millivolt so as + * to not overflow. + */ + power = (u64)capacitance * freq_mhz * voltage_mv * voltage_mv; + do_div(power, 1000000000); + + /* frequency is stored in power_table in KHz */ + power_table[i].frequency = freq / 1000; + + /* power is stored in mW */ + power_table[i].power = power; + } + + if (i == 0) { + ret = PTR_ERR(opp); + goto unlock; + } + + cpufreq_device->cpu_dev = dev; + cpufreq_device->dyn_power_table = power_table; + cpufreq_device->dyn_power_table_entries = i; + +unlock: + rcu_read_unlock(); + return ret; +} + +static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_device, + u32 freq) +{ + int i; + struct power_table *pt = cpufreq_device->dyn_power_table; + + for (i = 1; i < cpufreq_device->dyn_power_table_entries; i++) + if (freq < pt[i].frequency) + break; + + return pt[i - 1].power; +} + +static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_device, + u32 power) +{ + int i; + struct power_table *pt = cpufreq_device->dyn_power_table; + + for (i = 1; i < cpufreq_device->dyn_power_table_entries; i++) + if (power < pt[i].power) + break; + + return pt[i - 1].frequency; +} + +/** + * get_load() - get load for a cpu since last updated + * @cpufreq_device: &struct cpufreq_cooling_device for this cpu + * @cpu: cpu number + * + * Return: The average load of cpu @cpu in percentage since this + * function was last called. + */ +static u32 get_load(struct cpufreq_cooling_device *cpufreq_device, int cpu) +{ + u32 load; + u64 now, now_idle, delta_time, delta_idle; + + now_idle = get_cpu_idle_time(cpu, &now, 0); + delta_idle = now_idle - cpufreq_device->time_in_idle[cpu]; + delta_time = now - cpufreq_device->time_in_idle_timestamp[cpu]; + + if (delta_time <= delta_idle) + load = 0; + else + load = div64_u64(100 * (delta_time - delta_idle), delta_time); + + cpufreq_device->time_in_idle[cpu] = now_idle; + cpufreq_device->time_in_idle_timestamp[cpu] = now; + + return load; +} + +/** + * get_static_power() - calculate the static power consumed by the cpus + * @cpufreq_device: struct &cpufreq_cooling_device for this cpu cdev + * @tz: thermal zone device in which we're operating + * @freq: frequency in KHz + * @power: pointer in which to store the calculated static power + * + * Calculate the static power consumed by the cpus described by + * @cpu_actor running at frequency @freq. This function relies on a + * platform specific function that should have been provided when the + * actor was registered. If it wasn't, the static power is assumed to + * be negligible. The calculated static power is stored in @power. + * + * Return: 0 on success, -E* on failure. + */ +static int get_static_power(struct cpufreq_cooling_device *cpufreq_device, + struct thermal_zone_device *tz, unsigned long freq, + u32 *power) +{ + struct dev_pm_opp *opp; + unsigned long voltage; + struct cpumask *cpumask = &cpufreq_device->allowed_cpus; + unsigned long freq_hz = freq * 1000; + + if (!cpufreq_device->plat_get_static_power || + !cpufreq_device->cpu_dev) { + *power = 0; + return 0; + } + + rcu_read_lock(); + + opp = dev_pm_opp_find_freq_exact(cpufreq_device->cpu_dev, freq_hz, + true); + voltage = dev_pm_opp_get_voltage(opp); + + rcu_read_unlock(); + + if (voltage == 0) { + dev_warn_ratelimited(cpufreq_device->cpu_dev, + "Failed to get voltage for frequency %lu: %ld\n", + freq_hz, IS_ERR(opp) ? PTR_ERR(opp) : 0); + return -EINVAL; + } + + return cpufreq_device->plat_get_static_power(cpumask, tz->passive_delay, + voltage, power); +} + +/** + * get_dynamic_power() - calculate the dynamic power + * @cpufreq_device: &cpufreq_cooling_device for this cdev + * @freq: current frequency + * + * Return: the dynamic power consumed by the cpus described by + * @cpufreq_device. + */ +static u32 get_dynamic_power(struct cpufreq_cooling_device *cpufreq_device, + unsigned long freq) +{ + u32 raw_cpu_power; + + raw_cpu_power = cpu_freq_to_power(cpufreq_device, freq); + return (raw_cpu_power * cpufreq_device->last_load) / 100; } /* cpufreq cooling device callback functions are defined below */ @@ -280,8 +560,169 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev, return 0; } +/** + * cpufreq_get_requested_power() - get the current power + * @cdev: &thermal_cooling_device pointer + * @tz: a valid thermal zone device pointer + * @power: pointer in which to store the resulting power + * + * Calculate the current power consumption of the cpus in milliwatts + * and store it in @power. This function should actually calculate + * the requested power, but it's hard to get the frequency that + * cpufreq would have assigned if there were no thermal limits. + * Instead, we calculate the current power on the assumption that the + * immediate future will look like the immediate past. + * + * We use the current frequency and the average load since this + * function was last called. In reality, there could have been + * multiple opps since this function was last called and that affects + * the load calculation. While it's not perfectly accurate, this + * simplification is good enough and works. REVISIT this, as more + * complex code may be needed if experiments show that it's not + * accurate enough. + * + * Return: 0 on success, -E* if getting the static power failed. + */ +static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev, + struct thermal_zone_device *tz, + u32 *power) +{ + unsigned long freq; + int cpu, ret; + u32 static_power, dynamic_power, total_load = 0; + struct cpufreq_cooling_device *cpufreq_device = cdev->devdata; + + freq = cpufreq_quick_get(cpumask_any(&cpufreq_device->allowed_cpus)); + + for_each_cpu(cpu, &cpufreq_device->allowed_cpus) { + u32 load; + + if (cpu_online(cpu)) + load = get_load(cpufreq_device, cpu); + else + load = 0; + + total_load += load; + } + + cpufreq_device->last_load = total_load; + + dynamic_power = get_dynamic_power(cpufreq_device, freq); + ret = get_static_power(cpufreq_device, tz, freq, &static_power); + if (ret) + return ret; + + *power = static_power + dynamic_power; + return 0; +} + +/** + * cpufreq_state2power() - convert a cpu cdev state to power consumed + * @cdev: &thermal_cooling_device pointer + * @tz: a valid thermal zone device pointer + * @state: cooling device state to be converted + * @power: pointer in which to store the resulting power + * + * Convert cooling device state @state into power consumption in + * milliwatts assuming 100% load. Store the calculated power in + * @power. + * + * Return: 0 on success, -EINVAL if the cooling device state could not + * be converted into a frequency or other -E* if there was an error + * when calculating the static power. + */ +static int cpufreq_state2power(struct thermal_cooling_device *cdev, + struct thermal_zone_device *tz, + unsigned long state, u32 *power) +{ + unsigned int freq, num_cpus; + cpumask_t cpumask; + u32 static_power, dynamic_power; + int ret; + struct cpufreq_cooling_device *cpufreq_device = cdev->devdata; + + cpumask_and(&cpumask, &cpufreq_device->allowed_cpus, cpu_online_mask); + num_cpus = cpumask_weight(&cpumask); + + /* None of our cpus are online, so no power */ + if (num_cpus == 0) { + *power = 0; + return 0; + } + + freq = cpufreq_device->freq_table[state]; + if (!freq) + return -EINVAL; + + dynamic_power = cpu_freq_to_power(cpufreq_device, freq) * num_cpus; + ret = get_static_power(cpufreq_device, tz, freq, &static_power); + if (ret) + return ret; + + *power = static_power + dynamic_power; + return 0; +} + +/** + * cpufreq_power2state() - convert power to a cooling device state + * @cdev: &thermal_cooling_device pointer + * @tz: a valid thermal zone device pointer + * @power: power in milliwatts to be converted + * @state: pointer in which to store the resulting state + * + * Calculate a cooling device state for the cpus described by @cdev + * that would allow them to consume at most @power mW and store it in + * @state. Note that this calculation depends on external factors + * such as the cpu load or the current static power. Calling this + * function with the same power as input can yield different cooling + * device states depending on those external factors. + * + * Return: 0 on success, -ENODEV if no cpus are online or -EINVAL if + * the calculated frequency could not be converted to a valid state. + * The latter should not happen unless the frequencies available to + * cpufreq have changed since the initialization of the cpu cooling + * device. + */ +static int cpufreq_power2state(struct thermal_cooling_device *cdev, + struct thermal_zone_device *tz, u32 power, + unsigned long *state) +{ + unsigned int cpu, cur_freq, target_freq; + int ret; + s32 dyn_power; + u32 last_load, normalised_power, static_power; + struct cpufreq_cooling_device *cpufreq_device = cdev->devdata; + + cpu = cpumask_any_and(&cpufreq_device->allowed_cpus, cpu_online_mask); + + /* None of our cpus are online */ + if (cpu >= nr_cpu_ids) + return -ENODEV; + + cur_freq = cpufreq_quick_get(cpu); + ret = get_static_power(cpufreq_device, tz, cur_freq, &static_power); + if (ret) + return ret; + + dyn_power = power - static_power; + dyn_power = dyn_power > 0 ? dyn_power : 0; + last_load = cpufreq_device->last_load ?: 1; + normalised_power = (dyn_power * 100) / last_load; + target_freq = cpu_power_to_freq(cpufreq_device, normalised_power); + + *state = cpufreq_cooling_get_level(cpu, target_freq); + if (*state == THERMAL_CSTATE_INVALID) { + dev_warn_ratelimited(&cdev->device, + "Failed to convert %dKHz for cpu %d into a cdev state\n", + target_freq, cpu); + return -EINVAL; + } + + return 0; +} + /* Bind cpufreq callbacks to thermal cooling device ops */ -static struct thermal_cooling_device_ops const cpufreq_cooling_ops = { +static struct thermal_cooling_device_ops cpufreq_cooling_ops = { .get_max_state = cpufreq_get_max_state, .get_cur_state = cpufreq_get_cur_state, .set_cur_state = cpufreq_set_cur_state, @@ -311,6 +752,9 @@ static unsigned int find_next_max(struct cpufreq_frequency_table *table, * @np: a valid struct device_node to the cooling device device tree node * @clip_cpus: cpumask of cpus where the frequency constraints will happen. * Normally this should be same as cpufreq policy->related_cpus. + * @capacitance: dynamic power coefficient for these cpus + * @plat_static_func: function to calculate the static power consumed by these + * cpus (optional) * * This interface function registers the cpufreq cooling device with the name * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq @@ -322,13 +766,14 @@ static unsigned int find_next_max(struct cpufreq_frequency_table *table, */ static struct thermal_cooling_device * __cpufreq_cooling_register(struct device_node *np, - const struct cpumask *clip_cpus) + const struct cpumask *clip_cpus, u32 capacitance, + get_static_t plat_static_func) { struct thermal_cooling_device *cool_dev; struct cpufreq_cooling_device *cpufreq_dev; char dev_name[THERMAL_NAME_LENGTH]; struct cpufreq_frequency_table *pos, *table; - unsigned int freq, i; + unsigned int freq, i, num_cpus; int ret; table = cpufreq_frequency_get_table(cpumask_first(clip_cpus)); @@ -341,6 +786,23 @@ __cpufreq_cooling_register(struct device_node *np, if (!cpufreq_dev) return ERR_PTR(-ENOMEM); + num_cpus = cpumask_weight(clip_cpus); + cpufreq_dev->time_in_idle = kcalloc(num_cpus, + sizeof(*cpufreq_dev->time_in_idle), + GFP_KERNEL); + if (!cpufreq_dev->time_in_idle) { + cool_dev = ERR_PTR(-ENOMEM); + goto free_cdev; + } + + cpufreq_dev->time_in_idle_timestamp = + kcalloc(num_cpus, sizeof(*cpufreq_dev->time_in_idle_timestamp), + GFP_KERNEL); + if (!cpufreq_dev->time_in_idle_timestamp) { + cool_dev = ERR_PTR(-ENOMEM); + goto free_time_in_idle; + } + /* Find max levels */ cpufreq_for_each_valid_entry(pos, table) cpufreq_dev->max_level++; @@ -349,7 +811,7 @@ __cpufreq_cooling_register(struct device_node *np, cpufreq_dev->max_level, GFP_KERNEL); if (!cpufreq_dev->freq_table) { cool_dev = ERR_PTR(-ENOMEM); - goto free_cdev; + goto free_time_in_idle_timestamp; } /* max_level is an index, not a counter */ @@ -357,6 +819,20 @@ __cpufreq_cooling_register(struct device_node *np, cpumask_copy(&cpufreq_dev->allowed_cpus, clip_cpus); + if (capacitance) { + cpufreq_cooling_ops.get_requested_power = + cpufreq_get_requested_power; + cpufreq_cooling_ops.state2power = cpufreq_state2power; + cpufreq_cooling_ops.power2state = cpufreq_power2state; + cpufreq_dev->plat_get_static_power = plat_static_func; + + ret = build_dyn_power_table(cpufreq_dev, capacitance); + if (ret) { + cool_dev = ERR_PTR(ret); + goto free_table; + } + } + ret = get_idr(&cpufreq_idr, &cpufreq_dev->id); if (ret) { cool_dev = ERR_PTR(ret); @@ -402,6 +878,10 @@ remove_idr: release_idr(&cpufreq_idr, cpufreq_dev->id); free_table: kfree(cpufreq_dev->freq_table); +free_time_in_idle_timestamp: + kfree(cpufreq_dev->time_in_idle_timestamp); +free_time_in_idle: + kfree(cpufreq_dev->time_in_idle); free_cdev: kfree(cpufreq_dev); @@ -422,7 +902,7 @@ free_cdev: struct thermal_cooling_device * cpufreq_cooling_register(const struct cpumask *clip_cpus) { - return __cpufreq_cooling_register(NULL, clip_cpus); + return __cpufreq_cooling_register(NULL, clip_cpus, 0, NULL); } EXPORT_SYMBOL_GPL(cpufreq_cooling_register); @@ -446,10 +926,77 @@ of_cpufreq_cooling_register(struct device_node *np, if (!np) return ERR_PTR(-EINVAL); - return __cpufreq_cooling_register(np, clip_cpus); + return __cpufreq_cooling_register(np, clip_cpus, 0, NULL); } EXPORT_SYMBOL_GPL(of_cpufreq_cooling_register); +/** + * cpufreq_power_cooling_register() - create cpufreq cooling device with power extensions + * @clip_cpus: cpumask of cpus where the frequency constraints will happen + * @capacitance: dynamic power coefficient for these cpus + * @plat_static_func: function to calculate the static power consumed by these + * cpus (optional) + * + * This interface function registers the cpufreq cooling device with + * the name "thermal-cpufreq-%x". This api can support multiple + * instances of cpufreq cooling devices. Using this function, the + * cooling device will implement the power extensions by using a + * simple cpu power model. The cpus must have registered their OPPs + * using the OPP library. + * + * An optional @plat_static_func may be provided to calculate the + * static power consumed by these cpus. If the platform's static + * power consumption is unknown or negligible, make it NULL. + * + * Return: a valid struct thermal_cooling_device pointer on success, + * on failure, it returns a corresponding ERR_PTR(). + */ +struct thermal_cooling_device * +cpufreq_power_cooling_register(const struct cpumask *clip_cpus, u32 capacitance, + get_static_t plat_static_func) +{ + return __cpufreq_cooling_register(NULL, clip_cpus, capacitance, + plat_static_func); +} +EXPORT_SYMBOL(cpufreq_power_cooling_register); + +/** + * of_cpufreq_power_cooling_register() - create cpufreq cooling device with power extensions + * @np: a valid struct device_node to the cooling device device tree node + * @clip_cpus: cpumask of cpus where the frequency constraints will happen + * @capacitance: dynamic power coefficient for these cpus + * @plat_static_func: function to calculate the static power consumed by these + * cpus (optional) + * + * This interface function registers the cpufreq cooling device with + * the name "thermal-cpufreq-%x". This api can support multiple + * instances of cpufreq cooling devices. Using this API, the cpufreq + * cooling device will be linked to the device tree node provided. + * Using this function, the cooling device will implement the power + * extensions by using a simple cpu power model. The cpus must have + * registered their OPPs using the OPP library. + * + * An optional @plat_static_func may be provided to calculate the + * static power consumed by these cpus. If the platform's static + * power consumption is unknown or negligible, make it NULL. + * + * Return: a valid struct thermal_cooling_device pointer on success, + * on failure, it returns a corresponding ERR_PTR(). + */ +struct thermal_cooling_device * +of_cpufreq_power_cooling_register(struct device_node *np, + const struct cpumask *clip_cpus, + u32 capacitance, + get_static_t plat_static_func) +{ + if (!np) + return ERR_PTR(-EINVAL); + + return __cpufreq_cooling_register(np, clip_cpus, capacitance, + plat_static_func); +} +EXPORT_SYMBOL(of_cpufreq_power_cooling_register); + /** * cpufreq_cooling_unregister - function to remove cpufreq cooling device. * @cdev: thermal cooling device pointer. @@ -475,6 +1022,8 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) thermal_cooling_device_unregister(cpufreq_dev->cool_dev); release_idr(&cpufreq_idr, cpufreq_dev->id); + kfree(cpufreq_dev->time_in_idle_timestamp); + kfree(cpufreq_dev->time_in_idle); kfree(cpufreq_dev->freq_table); kfree(cpufreq_dev); } diff --git a/include/linux/cpu_cooling.h b/include/linux/cpu_cooling.h index bd955270d5aa..c156f5082758 100644 --- a/include/linux/cpu_cooling.h +++ b/include/linux/cpu_cooling.h @@ -28,6 +28,9 @@ #include #include +typedef int (*get_static_t)(cpumask_t *cpumask, int interval, + unsigned long voltage, u32 *power); + #ifdef CONFIG_CPU_THERMAL /** * cpufreq_cooling_register - function to create cpufreq cooling device. @@ -36,6 +39,10 @@ struct thermal_cooling_device * cpufreq_cooling_register(const struct cpumask *clip_cpus); +struct thermal_cooling_device * +cpufreq_power_cooling_register(const struct cpumask *clip_cpus, + u32 capacitance, get_static_t plat_static_func); + /** * of_cpufreq_cooling_register - create cpufreq cooling device based on DT. * @np: a valid struct device_node to the cooling device device tree node. @@ -45,6 +52,12 @@ cpufreq_cooling_register(const struct cpumask *clip_cpus); struct thermal_cooling_device * of_cpufreq_cooling_register(struct device_node *np, const struct cpumask *clip_cpus); + +struct thermal_cooling_device * +of_cpufreq_power_cooling_register(struct device_node *np, + const struct cpumask *clip_cpus, + u32 capacitance, + get_static_t plat_static_func); #else static inline struct thermal_cooling_device * of_cpufreq_cooling_register(struct device_node *np, @@ -52,6 +65,15 @@ of_cpufreq_cooling_register(struct device_node *np, { return ERR_PTR(-ENOSYS); } + +static inline struct thermal_cooling_device * +of_cpufreq_power_cooling_register(struct device_node *np, + const struct cpumask *clip_cpus, + u32 capacitance, + get_static_t plat_static_func) +{ + return NULL; +} #endif /** @@ -67,12 +89,29 @@ cpufreq_cooling_register(const struct cpumask *clip_cpus) { return ERR_PTR(-ENOSYS); } +static inline struct thermal_cooling_device * +cpufreq_power_cooling_register(const struct cpumask *clip_cpus, + u32 capacitance, get_static_t plat_static_func) +{ + return NULL; +} + static inline struct thermal_cooling_device * of_cpufreq_cooling_register(struct device_node *np, const struct cpumask *clip_cpus) { return ERR_PTR(-ENOSYS); } + +static inline struct thermal_cooling_device * +of_cpufreq_power_cooling_register(struct device_node *np, + const struct cpumask *clip_cpus, + u32 capacitance, + get_static_t plat_static_func) +{ + return NULL; +} + static inline void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) { -- cgit v1.2.3 From 6b775e870c56c59c3e16531ea2307b797395f9f7 Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Mon, 2 Mar 2015 17:17:19 +0000 Subject: thermal: introduce the Power Allocator governor The power allocator governor is a thermal governor that controls system and device power allocation to control temperature. Conceptually, the implementation divides the sustainable power of a thermal zone among all the heat sources in that zone. This governor relies on "power actors", entities that represent heat sources. They can report current and maximum power consumption and can set a given maximum power consumption, usually via a cooling device. The governor uses a Proportional Integral Derivative (PID) controller driven by the temperature of the thermal zone. The output of the controller is a power budget that is then allocated to each power actor that can have bearing on the temperature we are trying to control. It decides how much power to give each cooling device based on the performance they are requesting. The PID controller ensures that the total power budget does not exceed the control temperature. Cc: Zhang Rui Cc: Eduardo Valentin Signed-off-by: Punit Agrawal Signed-off-by: Javi Merino Signed-off-by: Eduardo Valentin --- Documentation/thermal/power_allocator.txt | 247 ++++++++++++++ drivers/thermal/Kconfig | 15 + drivers/thermal/Makefile | 1 + drivers/thermal/power_allocator.c | 520 ++++++++++++++++++++++++++++++ drivers/thermal/thermal_core.c | 9 +- drivers/thermal/thermal_core.h | 8 + include/linux/thermal.h | 37 ++- 7 files changed, 830 insertions(+), 7 deletions(-) create mode 100644 Documentation/thermal/power_allocator.txt create mode 100644 drivers/thermal/power_allocator.c (limited to 'include/linux') diff --git a/Documentation/thermal/power_allocator.txt b/Documentation/thermal/power_allocator.txt new file mode 100644 index 000000000000..c3797b529991 --- /dev/null +++ b/Documentation/thermal/power_allocator.txt @@ -0,0 +1,247 @@ +Power allocator governor tunables +================================= + +Trip points +----------- + +The governor requires the following two passive trip points: + +1. "switch on" trip point: temperature above which the governor + control loop starts operating. This is the first passive trip + point of the thermal zone. + +2. "desired temperature" trip point: it should be higher than the + "switch on" trip point. This the target temperature the governor + is controlling for. This is the last passive trip point of the + thermal zone. + +PID Controller +-------------- + +The power allocator governor implements a +Proportional-Integral-Derivative controller (PID controller) with +temperature as the control input and power as the controlled output: + + P_max = k_p * e + k_i * err_integral + k_d * diff_err + sustainable_power + +where + e = desired_temperature - current_temperature + err_integral is the sum of previous errors + diff_err = e - previous_error + +It is similar to the one depicted below: + + k_d + | +current_temp | + | v + | +----------+ +---+ + | +----->| diff_err |-->| X |------+ + | | +----------+ +---+ | + | | | tdp actor + | | k_i | | get_requested_power() + | | | | | | | + | | | | | | | ... + v | v v v v v + +---+ | +-------+ +---+ +---+ +---+ +----------+ + | S |-------+----->| sum e |----->| X |--->| S |-->| S |-->|power | + +---+ | +-------+ +---+ +---+ +---+ |allocation| + ^ | ^ +----------+ + | | | | | + | | +---+ | | | + | +------->| X |-------------------+ v v + | +---+ granted performance +desired_temperature ^ + | + | + k_po/k_pu + +Sustainable power +----------------- + +An estimate of the sustainable dissipatable power (in mW) should be +provided while registering the thermal zone. This estimates the +sustained power that can be dissipated at the desired control +temperature. This is the maximum sustained power for allocation at +the desired maximum temperature. The actual sustained power can vary +for a number of reasons. The closed loop controller will take care of +variations such as environmental conditions, and some factors related +to the speed-grade of the silicon. `sustainable_power` is therefore +simply an estimate, and may be tuned to affect the aggressiveness of +the thermal ramp. For reference, the sustainable power of a 4" phone +is typically 2000mW, while on a 10" tablet is around 4500mW (may vary +depending on screen size). + +If you are using device tree, do add it as a property of the +thermal-zone. For example: + + thermal-zones { + soc_thermal { + polling-delay = <1000>; + polling-delay-passive = <100>; + sustainable-power = <2500>; + ... + +Instead, if the thermal zone is registered from the platform code, pass a +`thermal_zone_params` that has a `sustainable_power`. If no +`thermal_zone_params` were being passed, then something like below +will suffice: + + static const struct thermal_zone_params tz_params = { + .sustainable_power = 3500, + }; + +and then pass `tz_params` as the 5th parameter to +`thermal_zone_device_register()` + +k_po and k_pu +------------- + +The implementation of the PID controller in the power allocator +thermal governor allows the configuration of two proportional term +constants: `k_po` and `k_pu`. `k_po` is the proportional term +constant during temperature overshoot periods (current temperature is +above "desired temperature" trip point). Conversely, `k_pu` is the +proportional term constant during temperature undershoot periods +(current temperature below "desired temperature" trip point). + +These controls are intended as the primary mechanism for configuring +the permitted thermal "ramp" of the system. For instance, a lower +`k_pu` value will provide a slower ramp, at the cost of capping +available capacity at a low temperature. On the other hand, a high +value of `k_pu` will result in the governor granting very high power +whilst temperature is low, and may lead to temperature overshooting. + +The default value for `k_pu` is: + + 2 * sustainable_power / (desired_temperature - switch_on_temp) + +This means that at `switch_on_temp` the output of the controller's +proportional term will be 2 * `sustainable_power`. The default value +for `k_po` is: + + sustainable_power / (desired_temperature - switch_on_temp) + +Focusing on the proportional and feed forward values of the PID +controller equation we have: + + P_max = k_p * e + sustainable_power + +The proportional term is proportional to the difference between the +desired temperature and the current one. When the current temperature +is the desired one, then the proportional component is zero and +`P_max` = `sustainable_power`. That is, the system should operate in +thermal equilibrium under constant load. `sustainable_power` is only +an estimate, which is the reason for closed-loop control such as this. + +Expanding `k_pu` we get: + P_max = 2 * sustainable_power * (T_set - T) / (T_set - T_on) + + sustainable_power + +where + T_set is the desired temperature + T is the current temperature + T_on is the switch on temperature + +When the current temperature is the switch_on temperature, the above +formula becomes: + + P_max = 2 * sustainable_power * (T_set - T_on) / (T_set - T_on) + + sustainable_power = 2 * sustainable_power + sustainable_power = + 3 * sustainable_power + +Therefore, the proportional term alone linearly decreases power from +3 * `sustainable_power` to `sustainable_power` as the temperature +rises from the switch on temperature to the desired temperature. + +k_i and integral_cutoff +----------------------- + +`k_i` configures the PID loop's integral term constant. This term +allows the PID controller to compensate for long term drift and for +the quantized nature of the output control: cooling devices can't set +the exact power that the governor requests. When the temperature +error is below `integral_cutoff`, errors are accumulated in the +integral term. This term is then multiplied by `k_i` and the result +added to the output of the controller. Typically `k_i` is set low (1 +or 2) and `integral_cutoff` is 0. + +k_d +--- + +`k_d` configures the PID loop's derivative term constant. It's +recommended to leave it as the default: 0. + +Cooling device power API +======================== + +Cooling devices controlled by this governor must supply the additional +"power" API in their `cooling_device_ops`. It consists on three ops: + +1. int get_requested_power(struct thermal_cooling_device *cdev, + struct thermal_zone_device *tz, u32 *power); +@cdev: The `struct thermal_cooling_device` pointer +@tz: thermal zone in which we are currently operating +@power: pointer in which to store the calculated power + +`get_requested_power()` calculates the power requested by the device +in milliwatts and stores it in @power . It should return 0 on +success, -E* on failure. This is currently used by the power +allocator governor to calculate how much power to give to each cooling +device. + +2. int state2power(struct thermal_cooling_device *cdev, struct + thermal_zone_device *tz, unsigned long state, u32 *power); +@cdev: The `struct thermal_cooling_device` pointer +@tz: thermal zone in which we are currently operating +@state: A cooling device state +@power: pointer in which to store the equivalent power + +Convert cooling device state @state into power consumption in +milliwatts and store it in @power. It should return 0 on success, -E* +on failure. This is currently used by thermal core to calculate the +maximum power that an actor can consume. + +3. int power2state(struct thermal_cooling_device *cdev, u32 power, + unsigned long *state); +@cdev: The `struct thermal_cooling_device` pointer +@power: power in milliwatts +@state: pointer in which to store the resulting state + +Calculate a cooling device state that would make the device consume at +most @power mW and store it in @state. It should return 0 on success, +-E* on failure. This is currently used by the thermal core to convert +a given power set by the power allocator governor to a state that the +cooling device can set. It is a function because this conversion may +depend on external factors that may change so this function should the +best conversion given "current circumstances". + +Cooling device weights +---------------------- + +Weights are a mechanism to bias the allocation among cooling +devices. They express the relative power efficiency of different +cooling devices. Higher weight can be used to express higher power +efficiency. Weighting is relative such that if each cooling device +has a weight of one they are considered equal. This is particularly +useful in heterogeneous systems where two cooling devices may perform +the same kind of compute, but with different efficiency. For example, +a system with two different types of processors. + +If the thermal zone is registered using +`thermal_zone_device_register()` (i.e., platform code), then weights +are passed as part of the thermal zone's `thermal_bind_parameters`. +If the platform is registered using device tree, then they are passed +as the `contribution` property of each map in the `cooling-maps` node. + +Limitations of the power allocator governor +=========================================== + +The power allocator governor's PID controller works best if there is a +periodic tick. If you have a driver that calls +`thermal_zone_device_update()` (or anything that ends up calling the +governor's `throttle()` function) repetitively, the governor response +won't be very good. Note that this is not particular to this +governor, step-wise will also misbehave if you call its throttle() +faster than the normal thermal framework tick (due to interrupts for +example) as it will overreact. diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index 30aee81e9f5b..a1b43eab0a70 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -71,6 +71,14 @@ config THERMAL_DEFAULT_GOV_USER_SPACE Select this if you want to let the user space manage the platform thermals. +config THERMAL_DEFAULT_GOV_POWER_ALLOCATOR + bool "power_allocator" + select THERMAL_GOV_POWER_ALLOCATOR + help + Select this if you want to control temperature based on + system and device power allocation. This governor can only + operate on cooling devices that implement the power API. + endchoice config THERMAL_GOV_FAIR_SHARE @@ -99,6 +107,13 @@ config THERMAL_GOV_USER_SPACE help Enable this to let the user space manage the platform thermals. +config THERMAL_GOV_POWER_ALLOCATOR + bool "Power allocator thermal governor" + select THERMAL_POWER_ACTOR + help + Enable this to manage platform thermals by dynamically + allocating and limiting power to devices. + config CPU_THERMAL bool "generic cpu cooling support" depends on CPU_FREQ diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile index 1fe86652cfb6..b1783cf37ed2 100644 --- a/drivers/thermal/Makefile +++ b/drivers/thermal/Makefile @@ -14,6 +14,7 @@ thermal_sys-$(CONFIG_THERMAL_GOV_FAIR_SHARE) += fair_share.o thermal_sys-$(CONFIG_THERMAL_GOV_BANG_BANG) += gov_bang_bang.o thermal_sys-$(CONFIG_THERMAL_GOV_STEP_WISE) += step_wise.o thermal_sys-$(CONFIG_THERMAL_GOV_USER_SPACE) += user_space.o +thermal_sys-$(CONFIG_THERMAL_GOV_POWER_ALLOCATOR) += power_allocator.o # cpufreq cooling thermal_sys-$(CONFIG_CPU_THERMAL) += cpu_cooling.o diff --git a/drivers/thermal/power_allocator.c b/drivers/thermal/power_allocator.c new file mode 100644 index 000000000000..67982d79b76c --- /dev/null +++ b/drivers/thermal/power_allocator.c @@ -0,0 +1,520 @@ +/* + * A power allocator to manage temperature + * + * Copyright (C) 2014 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed "as is" WITHOUT ANY WARRANTY of any + * kind, whether express or implied; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#define pr_fmt(fmt) "Power allocator: " fmt + +#include +#include +#include + +#include "thermal_core.h" + +#define FRAC_BITS 10 +#define int_to_frac(x) ((x) << FRAC_BITS) +#define frac_to_int(x) ((x) >> FRAC_BITS) + +/** + * mul_frac() - multiply two fixed-point numbers + * @x: first multiplicand + * @y: second multiplicand + * + * Return: the result of multiplying two fixed-point numbers. The + * result is also a fixed-point number. + */ +static inline s64 mul_frac(s64 x, s64 y) +{ + return (x * y) >> FRAC_BITS; +} + +/** + * div_frac() - divide two fixed-point numbers + * @x: the dividend + * @y: the divisor + * + * Return: the result of dividing two fixed-point numbers. The + * result is also a fixed-point number. + */ +static inline s64 div_frac(s64 x, s64 y) +{ + return div_s64(x << FRAC_BITS, y); +} + +/** + * struct power_allocator_params - parameters for the power allocator governor + * @err_integral: accumulated error in the PID controller. + * @prev_err: error in the previous iteration of the PID controller. + * Used to calculate the derivative term. + * @trip_switch_on: first passive trip point of the thermal zone. The + * governor switches on when this trip point is crossed. + * @trip_max_desired_temperature: last passive trip point of the thermal + * zone. The temperature we are + * controlling for. + */ +struct power_allocator_params { + s64 err_integral; + s32 prev_err; + int trip_switch_on; + int trip_max_desired_temperature; +}; + +/** + * pid_controller() - PID controller + * @tz: thermal zone we are operating in + * @current_temp: the current temperature in millicelsius + * @control_temp: the target temperature in millicelsius + * @max_allocatable_power: maximum allocatable power for this thermal zone + * + * This PID controller increases the available power budget so that the + * temperature of the thermal zone gets as close as possible to + * @control_temp and limits the power if it exceeds it. k_po is the + * proportional term when we are overshooting, k_pu is the + * proportional term when we are undershooting. integral_cutoff is a + * threshold below which we stop accumulating the error. The + * accumulated error is only valid if the requested power will make + * the system warmer. If the system is mostly idle, there's no point + * in accumulating positive error. + * + * Return: The power budget for the next period. + */ +static u32 pid_controller(struct thermal_zone_device *tz, + unsigned long current_temp, + unsigned long control_temp, + u32 max_allocatable_power) +{ + s64 p, i, d, power_range; + s32 err, max_power_frac; + struct power_allocator_params *params = tz->governor_data; + + max_power_frac = int_to_frac(max_allocatable_power); + + err = ((s32)control_temp - (s32)current_temp); + err = int_to_frac(err); + + /* Calculate the proportional term */ + p = mul_frac(err < 0 ? tz->tzp->k_po : tz->tzp->k_pu, err); + + /* + * Calculate the integral term + * + * if the error is less than cut off allow integration (but + * the integral is limited to max power) + */ + i = mul_frac(tz->tzp->k_i, params->err_integral); + + if (err < int_to_frac(tz->tzp->integral_cutoff)) { + s64 i_next = i + mul_frac(tz->tzp->k_i, err); + + if (abs64(i_next) < max_power_frac) { + i = i_next; + params->err_integral += err; + } + } + + /* + * Calculate the derivative term + * + * We do err - prev_err, so with a positive k_d, a decreasing + * error (i.e. driving closer to the line) results in less + * power being applied, slowing down the controller) + */ + d = mul_frac(tz->tzp->k_d, err - params->prev_err); + d = div_frac(d, tz->passive_delay); + params->prev_err = err; + + power_range = p + i + d; + + /* feed-forward the known sustainable dissipatable power */ + power_range = tz->tzp->sustainable_power + frac_to_int(power_range); + + return clamp(power_range, (s64)0, (s64)max_allocatable_power); +} + +/** + * divvy_up_power() - divvy the allocated power between the actors + * @req_power: each actor's requested power + * @max_power: each actor's maximum available power + * @num_actors: size of the @req_power, @max_power and @granted_power's array + * @total_req_power: sum of @req_power + * @power_range: total allocated power + * @granted_power: output array: each actor's granted power + * @extra_actor_power: an appropriately sized array to be used in the + * function as temporary storage of the extra power given + * to the actors + * + * This function divides the total allocated power (@power_range) + * fairly between the actors. It first tries to give each actor a + * share of the @power_range according to how much power it requested + * compared to the rest of the actors. For example, if only one actor + * requests power, then it receives all the @power_range. If + * three actors each requests 1mW, each receives a third of the + * @power_range. + * + * If any actor received more than their maximum power, then that + * surplus is re-divvied among the actors based on how far they are + * from their respective maximums. + * + * Granted power for each actor is written to @granted_power, which + * should've been allocated by the calling function. + */ +static void divvy_up_power(u32 *req_power, u32 *max_power, int num_actors, + u32 total_req_power, u32 power_range, + u32 *granted_power, u32 *extra_actor_power) +{ + u32 extra_power, capped_extra_power; + int i; + + /* + * Prevent division by 0 if none of the actors request power. + */ + if (!total_req_power) + total_req_power = 1; + + capped_extra_power = 0; + extra_power = 0; + for (i = 0; i < num_actors; i++) { + u64 req_range = req_power[i] * power_range; + + granted_power[i] = div_u64(req_range, total_req_power); + + if (granted_power[i] > max_power[i]) { + extra_power += granted_power[i] - max_power[i]; + granted_power[i] = max_power[i]; + } + + extra_actor_power[i] = max_power[i] - granted_power[i]; + capped_extra_power += extra_actor_power[i]; + } + + if (!extra_power) + return; + + /* + * Re-divvy the reclaimed extra among actors based on + * how far they are from the max + */ + extra_power = min(extra_power, capped_extra_power); + if (capped_extra_power > 0) + for (i = 0; i < num_actors; i++) + granted_power[i] += (extra_actor_power[i] * + extra_power) / capped_extra_power; +} + +static int allocate_power(struct thermal_zone_device *tz, + unsigned long current_temp, + unsigned long control_temp) +{ + struct thermal_instance *instance; + struct power_allocator_params *params = tz->governor_data; + u32 *req_power, *max_power, *granted_power, *extra_actor_power; + u32 total_req_power, max_allocatable_power; + u32 power_range; + int i, num_actors, total_weight, ret = 0; + int trip_max_desired_temperature = params->trip_max_desired_temperature; + + mutex_lock(&tz->lock); + + num_actors = 0; + total_weight = 0; + list_for_each_entry(instance, &tz->thermal_instances, tz_node) { + if ((instance->trip == trip_max_desired_temperature) && + cdev_is_power_actor(instance->cdev)) { + num_actors++; + total_weight += instance->weight; + } + } + + /* + * We need to allocate three arrays of the same size: + * req_power, max_power and granted_power. They are going to + * be needed until this function returns. Allocate them all + * in one go to simplify the allocation and deallocation + * logic. + */ + BUILD_BUG_ON(sizeof(*req_power) != sizeof(*max_power)); + BUILD_BUG_ON(sizeof(*req_power) != sizeof(*granted_power)); + BUILD_BUG_ON(sizeof(*req_power) != sizeof(*extra_actor_power)); + req_power = devm_kcalloc(&tz->device, num_actors * 4, + sizeof(*req_power), GFP_KERNEL); + if (!req_power) { + ret = -ENOMEM; + goto unlock; + } + + max_power = &req_power[num_actors]; + granted_power = &req_power[2 * num_actors]; + extra_actor_power = &req_power[3 * num_actors]; + + i = 0; + total_req_power = 0; + max_allocatable_power = 0; + + list_for_each_entry(instance, &tz->thermal_instances, tz_node) { + int weight; + struct thermal_cooling_device *cdev = instance->cdev; + + if (instance->trip != trip_max_desired_temperature) + continue; + + if (!cdev_is_power_actor(cdev)) + continue; + + if (cdev->ops->get_requested_power(cdev, tz, &req_power[i])) + continue; + + if (!total_weight) + weight = 1 << FRAC_BITS; + else + weight = instance->weight; + + req_power[i] = frac_to_int(weight * req_power[i]); + + if (power_actor_get_max_power(cdev, tz, &max_power[i])) + continue; + + total_req_power += req_power[i]; + max_allocatable_power += max_power[i]; + + i++; + } + + power_range = pid_controller(tz, current_temp, control_temp, + max_allocatable_power); + + divvy_up_power(req_power, max_power, num_actors, total_req_power, + power_range, granted_power, extra_actor_power); + + i = 0; + list_for_each_entry(instance, &tz->thermal_instances, tz_node) { + if (instance->trip != trip_max_desired_temperature) + continue; + + if (!cdev_is_power_actor(instance->cdev)) + continue; + + power_actor_set_power(instance->cdev, instance, + granted_power[i]); + + i++; + } + + devm_kfree(&tz->device, req_power); +unlock: + mutex_unlock(&tz->lock); + + return ret; +} + +static int get_governor_trips(struct thermal_zone_device *tz, + struct power_allocator_params *params) +{ + int i, ret, last_passive; + bool found_first_passive; + + found_first_passive = false; + last_passive = -1; + ret = -EINVAL; + + for (i = 0; i < tz->trips; i++) { + enum thermal_trip_type type; + + ret = tz->ops->get_trip_type(tz, i, &type); + if (ret) + return ret; + + if (!found_first_passive) { + if (type == THERMAL_TRIP_PASSIVE) { + params->trip_switch_on = i; + found_first_passive = true; + } + } else if (type == THERMAL_TRIP_PASSIVE) { + last_passive = i; + } else { + break; + } + } + + if (last_passive != -1) { + params->trip_max_desired_temperature = last_passive; + ret = 0; + } else { + ret = -EINVAL; + } + + return ret; +} + +static void reset_pid_controller(struct power_allocator_params *params) +{ + params->err_integral = 0; + params->prev_err = 0; +} + +static void allow_maximum_power(struct thermal_zone_device *tz) +{ + struct thermal_instance *instance; + struct power_allocator_params *params = tz->governor_data; + + list_for_each_entry(instance, &tz->thermal_instances, tz_node) { + if ((instance->trip != params->trip_max_desired_temperature) || + (!cdev_is_power_actor(instance->cdev))) + continue; + + instance->target = 0; + instance->cdev->updated = false; + thermal_cdev_update(instance->cdev); + } +} + +/** + * power_allocator_bind() - bind the power_allocator governor to a thermal zone + * @tz: thermal zone to bind it to + * + * Check that the thermal zone is valid for this governor, that is, it + * has two thermal trips. If so, initialize the PID controller + * parameters and bind it to the thermal zone. + * + * Return: 0 on success, -EINVAL if the trips were invalid or -ENOMEM + * if we ran out of memory. + */ +static int power_allocator_bind(struct thermal_zone_device *tz) +{ + int ret; + struct power_allocator_params *params; + unsigned long switch_on_temp, control_temp; + u32 temperature_threshold; + + if (!tz->tzp || !tz->tzp->sustainable_power) { + dev_err(&tz->device, + "power_allocator: missing sustainable_power\n"); + return -EINVAL; + } + + params = devm_kzalloc(&tz->device, sizeof(*params), GFP_KERNEL); + if (!params) + return -ENOMEM; + + ret = get_governor_trips(tz, params); + if (ret) { + dev_err(&tz->device, + "thermal zone %s has wrong trip setup for power allocator\n", + tz->type); + goto free; + } + + ret = tz->ops->get_trip_temp(tz, params->trip_switch_on, + &switch_on_temp); + if (ret) + goto free; + + ret = tz->ops->get_trip_temp(tz, params->trip_max_desired_temperature, + &control_temp); + if (ret) + goto free; + + temperature_threshold = control_temp - switch_on_temp; + + tz->tzp->k_po = tz->tzp->k_po ?: + int_to_frac(tz->tzp->sustainable_power) / temperature_threshold; + tz->tzp->k_pu = tz->tzp->k_pu ?: + int_to_frac(2 * tz->tzp->sustainable_power) / + temperature_threshold; + tz->tzp->k_i = tz->tzp->k_i ?: int_to_frac(10) / 1000; + /* + * The default for k_d and integral_cutoff is 0, so we can + * leave them as they are. + */ + + reset_pid_controller(params); + + tz->governor_data = params; + + return 0; + +free: + devm_kfree(&tz->device, params); + return ret; +} + +static void power_allocator_unbind(struct thermal_zone_device *tz) +{ + dev_dbg(&tz->device, "Unbinding from thermal zone %d\n", tz->id); + devm_kfree(&tz->device, tz->governor_data); + tz->governor_data = NULL; +} + +static int power_allocator_throttle(struct thermal_zone_device *tz, int trip) +{ + int ret; + unsigned long switch_on_temp, control_temp, current_temp; + struct power_allocator_params *params = tz->governor_data; + + /* + * We get called for every trip point but we only need to do + * our calculations once + */ + if (trip != params->trip_max_desired_temperature) + return 0; + + ret = thermal_zone_get_temp(tz, ¤t_temp); + if (ret) { + dev_warn(&tz->device, "Failed to get temperature: %d\n", ret); + return ret; + } + + ret = tz->ops->get_trip_temp(tz, params->trip_switch_on, + &switch_on_temp); + if (ret) { + dev_warn(&tz->device, + "Failed to get switch on temperature: %d\n", ret); + return ret; + } + + if (current_temp < switch_on_temp) { + tz->passive = 0; + reset_pid_controller(params); + allow_maximum_power(tz); + return 0; + } + + tz->passive = 1; + + ret = tz->ops->get_trip_temp(tz, params->trip_max_desired_temperature, + &control_temp); + if (ret) { + dev_warn(&tz->device, + "Failed to get the maximum desired temperature: %d\n", + ret); + return ret; + } + + return allocate_power(tz, current_temp, control_temp); +} + +static struct thermal_governor thermal_gov_power_allocator = { + .name = "power_allocator", + .bind_to_tz = power_allocator_bind, + .unbind_from_tz = power_allocator_unbind, + .throttle = power_allocator_throttle, +}; + +int thermal_gov_power_allocator_register(void) +{ + return thermal_register_governor(&thermal_gov_power_allocator); +} + +void thermal_gov_power_allocator_unregister(void) +{ + thermal_unregister_governor(&thermal_gov_power_allocator); +} diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 263628b0e862..b389bc2ec0fa 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1616,7 +1616,7 @@ static void remove_trip_attrs(struct thermal_zone_device *tz) struct thermal_zone_device *thermal_zone_device_register(const char *type, int trips, int mask, void *devdata, struct thermal_zone_device_ops *ops, - const struct thermal_zone_params *tzp, + struct thermal_zone_params *tzp, int passive_delay, int polling_delay) { struct thermal_zone_device *tz; @@ -1968,7 +1968,11 @@ static int __init thermal_register_governors(void) if (result) return result; - return thermal_gov_user_space_register(); + result = thermal_gov_user_space_register(); + if (result) + return result; + + return thermal_gov_power_allocator_register(); } static void thermal_unregister_governors(void) @@ -1977,6 +1981,7 @@ static void thermal_unregister_governors(void) thermal_gov_fair_share_unregister(); thermal_gov_bang_bang_unregister(); thermal_gov_user_space_unregister(); + thermal_gov_power_allocator_unregister(); } static int __init thermal_init(void) diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index faebe881f062..8a6624488cc5 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -88,6 +88,14 @@ static inline int thermal_gov_user_space_register(void) { return 0; } static inline void thermal_gov_user_space_unregister(void) {} #endif /* CONFIG_THERMAL_GOV_USER_SPACE */ +#ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR +int thermal_gov_power_allocator_register(void); +void thermal_gov_power_allocator_unregister(void); +#else +static inline int thermal_gov_power_allocator_register(void) { return 0; } +static inline void thermal_gov_power_allocator_unregister(void) {} +#endif /* CONFIG_THERMAL_GOV_POWER_ALLOCATOR */ + /* device tree support */ #ifdef CONFIG_THERMAL_OF int of_parse_thermal_zones(void); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index bf3c55f405c2..6bbe11c97cea 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -59,6 +59,8 @@ #define DEFAULT_THERMAL_GOVERNOR "fair_share" #elif defined(CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE) #define DEFAULT_THERMAL_GOVERNOR "user_space" +#elif defined(CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR) +#define DEFAULT_THERMAL_GOVERNOR "power_allocator" #endif struct thermal_zone_device; @@ -154,8 +156,7 @@ struct thermal_attr { * @devdata: private pointer for device private data * @trips: number of trip points the thermal zone supports * @passive_delay: number of milliseconds to wait between polls when - * performing passive cooling. Currenty only used by the - * step-wise governor + * performing passive cooling. * @polling_delay: number of milliseconds to wait between polls when * checking whether trip points have been crossed (0 for * interrupt driven systems) @@ -165,7 +166,6 @@ struct thermal_attr { * @last_temperature: previous temperature read * @emul_temperature: emulated temperature when using CONFIG_THERMAL_EMULATION * @passive: 1 if you've crossed a passive trip point, 0 otherwise. - * Currenty only used by the step-wise governor. * @forced_passive: If > 0, temperature at which to switch on all ACPI * processor cooling devices. Currently only used by the * step-wise governor. @@ -197,7 +197,7 @@ struct thermal_zone_device { int passive; unsigned int forced_passive; struct thermal_zone_device_ops *ops; - const struct thermal_zone_params *tzp; + struct thermal_zone_params *tzp; struct thermal_governor *governor; void *governor_data; struct list_head thermal_instances; @@ -275,6 +275,33 @@ struct thermal_zone_params { int num_tbps; /* Number of tbp entries */ struct thermal_bind_params *tbp; + + /* + * Sustainable power (heat) that this thermal zone can dissipate in + * mW + */ + u32 sustainable_power; + + /* + * Proportional parameter of the PID controller when + * overshooting (i.e., when temperature is below the target) + */ + s32 k_po; + + /* + * Proportional parameter of the PID controller when + * undershooting + */ + s32 k_pu; + + /* Integral parameter of the PID controller */ + s32 k_i; + + /* Derivative parameter of the PID controller */ + s32 k_d; + + /* threshold below which the error is no longer accumulated */ + s32 integral_cutoff; }; struct thermal_genl_event { @@ -350,7 +377,7 @@ int power_actor_set_power(struct thermal_cooling_device *, struct thermal_instance *, u32); struct thermal_zone_device *thermal_zone_device_register(const char *, int, int, void *, struct thermal_zone_device_ops *, - const struct thermal_zone_params *, int, int); + struct thermal_zone_params *, int, int); void thermal_zone_device_unregister(struct thermal_zone_device *); int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int, -- cgit v1.2.3 From f31105347cc56c13d552b844ada04418769d875d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 28 Apr 2015 12:17:50 +0200 Subject: irqchip: irqc: Remove platform data support As of commit 914d7d148411997c ("ARM: shmobile: r8a73a4: Remove legacy code"), the Renesas R-Mobile/R-Car interrupt controller is used with DT only, and interrupt numbers are thus always assigned automatically. Drop the platform data declaration and all related support code. Signed-off-by: Geert Uytterhoeven Cc: Magnus Damm Cc: Jason Cooper Link: http://lkml.kernel.org/r/1430216270-31929-1-git-send-email-geert%2Brenesas@glider.be Signed-off-by: Thomas Gleixner --- drivers/irqchip/irq-renesas-irqc.c | 17 +--------------- include/linux/platform_data/irq-renesas-irqc.h | 27 -------------------------- 2 files changed, 1 insertion(+), 43 deletions(-) delete mode 100644 include/linux/platform_data/irq-renesas-irqc.h (limited to 'include/linux') diff --git a/drivers/irqchip/irq-renesas-irqc.c b/drivers/irqchip/irq-renesas-irqc.c index df5bf21a701f..778bd076aeea 100644 --- a/drivers/irqchip/irq-renesas-irqc.c +++ b/drivers/irqchip/irq-renesas-irqc.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #define IRQC_IRQ_MAX 32 /* maximum 32 interrupts per driver instance */ @@ -62,7 +61,6 @@ struct irqc_priv { void __iomem *iomem; void __iomem *cpu_int_base; struct irqc_irq irq[IRQC_IRQ_MAX]; - struct renesas_irqc_config config; unsigned int number_of_irqs; struct platform_device *pdev; struct irq_chip irq_chip; @@ -175,7 +173,6 @@ static const struct irq_domain_ops irqc_irq_domain_ops = { static int irqc_probe(struct platform_device *pdev) { - struct renesas_irqc_config *pdata = pdev->dev.platform_data; struct irqc_priv *p; struct resource *io; struct resource *irq; @@ -191,10 +188,6 @@ static int irqc_probe(struct platform_device *pdev) goto err0; } - /* deal with driver instance configuration */ - if (pdata) - memcpy(&p->config, pdata, sizeof(*pdata)); - p->pdev = pdev; platform_set_drvdata(pdev, p); @@ -251,8 +244,7 @@ static int irqc_probe(struct platform_device *pdev) irq_chip->flags = IRQCHIP_MASK_ON_SUSPEND; p->irq_domain = irq_domain_add_simple(pdev->dev.of_node, - p->number_of_irqs, - p->config.irq_base, + p->number_of_irqs, 0, &irqc_irq_domain_ops, p); if (!p->irq_domain) { ret = -ENXIO; @@ -272,13 +264,6 @@ static int irqc_probe(struct platform_device *pdev) dev_info(&pdev->dev, "driving %d irqs\n", p->number_of_irqs); - /* warn in case of mismatch if irq base is specified */ - if (p->config.irq_base) { - if (p->config.irq_base != p->irq[0].domain_irq) - dev_warn(&pdev->dev, "irq base mismatch (%d/%d)\n", - p->config.irq_base, p->irq[0].domain_irq); - } - return 0; err3: while (--k >= 0) diff --git a/include/linux/platform_data/irq-renesas-irqc.h b/include/linux/platform_data/irq-renesas-irqc.h deleted file mode 100644 index 3ae17b3e00ed..000000000000 --- a/include/linux/platform_data/irq-renesas-irqc.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Renesas IRQC Driver - * - * Copyright (C) 2013 Magnus Damm - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef __IRQ_RENESAS_IRQC_H__ -#define __IRQ_RENESAS_IRQC_H__ - -struct renesas_irqc_config { - unsigned int irq_base; -}; - -#endif /* __IRQ_RENESAS_IRQC_H__ */ -- cgit v1.2.3 From 406c057c4e00744453d5b0731eb23629ec14dcdf Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 4 May 2015 21:54:20 -0400 Subject: libata: READ LOG DMA EXT support can be in either page 119 or 120 Support for the READ/WRITE LOG DMA EXT commands can be signaled either in page 119 or page 120. We should check both pages. Signed-off-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Tejun Heo --- include/linux/ata.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index b666b773e111..fed36418dd1c 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -704,9 +704,19 @@ static inline bool ata_id_wcache_enabled(const u16 *id) static inline bool ata_id_has_read_log_dma_ext(const u16 *id) { + /* Word 86 must have bit 15 set */ if (!(id[ATA_ID_CFS_ENABLE_2] & (1 << 15))) return false; - return id[ATA_ID_COMMAND_SET_3] & (1 << 3); + + /* READ LOG DMA EXT support can be signaled either from word 119 + * or from word 120. The format is the same for both words: Bit + * 15 must be cleared, bit 14 set and bit 3 set. + */ + if ((id[ATA_ID_COMMAND_SET_3] & 0xC008) == 0x4008 || + (id[ATA_ID_COMMAND_SET_4] & 0xC008) == 0x4008) + return true; + + return false; } static inline bool ata_id_has_sense_reporting(const u16 *id) -- cgit v1.2.3 From 5d3abf8ff67f49271a42c0f7fa4f20f9e046bf0e Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 4 May 2015 21:54:21 -0400 Subject: libata: Fall back to unqueued READ LOG EXT if the DMA variant fails Some devices advertise support for the READ/WRITE LOG DMA EXT commands but fail when we try to issue them. This can lead to queued TRIM being unintentionally disabled since the relevant feature flag is located in a general purpose log page. Fall back to unqueued READ LOG EXT if the DMA variant fails while reading a log page. Signed-off-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Tejun Heo --- drivers/ata/libata-eh.c | 12 +++++++++++- include/linux/libata.h | 1 + 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 07f41be38fbe..2893563d0537 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -1507,13 +1507,17 @@ unsigned int ata_read_log_page(struct ata_device *dev, u8 log, { struct ata_taskfile tf; unsigned int err_mask; + bool dma = false; DPRINTK("read log page - log 0x%x, page 0x%x\n", log, page); +retry: ata_tf_init(dev, &tf); - if (dev->dma_mode && ata_id_has_read_log_dma_ext(dev->id)) { + if (dev->dma_mode && ata_id_has_read_log_dma_ext(dev->id) && + !(dev->horkage & ATA_HORKAGE_NO_NCQ_LOG)) { tf.command = ATA_CMD_READ_LOG_DMA_EXT; tf.protocol = ATA_PROT_DMA; + dma = true; } else { tf.command = ATA_CMD_READ_LOG_EXT; tf.protocol = ATA_PROT_PIO; @@ -1527,6 +1531,12 @@ unsigned int ata_read_log_page(struct ata_device *dev, u8 log, err_mask = ata_exec_internal(dev, &tf, NULL, DMA_FROM_DEVICE, buf, sectors * ATA_SECT_SIZE, 0); + if (err_mask && dma) { + dev->horkage |= ATA_HORKAGE_NO_NCQ_LOG; + ata_dev_warn(dev, "READ LOG DMA EXT failed, trying unqueued\n"); + goto retry; + } + DPRINTK("EXIT, err_mask=%x\n", err_mask); return err_mask; } diff --git a/include/linux/libata.h b/include/linux/libata.h index 8dad4a307bb8..c3ef58014b33 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -424,6 +424,7 @@ enum { ATA_HORKAGE_NOLPM = (1 << 20), /* don't use LPM */ ATA_HORKAGE_WD_BROKEN_LPM = (1 << 21), /* some WDs have broken LPM */ ATA_HORKAGE_ZERO_AFTER_TRIM = (1 << 22),/* guarantees zero after trim */ + ATA_HORKAGE_NO_NCQ_LOG = (1 << 23), /* don't use NCQ for log read */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3 From c4cf5261f8bffd9de132b50660a69148e7575bd6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Apr 2015 16:15:18 -0600 Subject: bio: skip atomic inc/dec of ->bi_remaining for non-chains Struct bio has an atomic ref count for chained bio's, and we use this to know when to end IO on the bio. However, most bio's are not chained, so we don't need to always introduce this atomic operation as part of ending IO. Add a helper to elevate the bi_remaining count, and flag the bio as now actually needing the decrement at end_io time. Rename the field to __bi_remaining to catch any current users of this doing the incrementing manually. For high IOPS workloads, this reduces the overhead of bio_endio() substantially. Tested-by: Robert Elliott Acked-by: Kent Overstreet Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- block/bio.c | 38 +++++++++++++++++++++++++++++--------- drivers/md/dm-cache-target.c | 2 +- drivers/md/dm-raid1.c | 2 +- drivers/md/dm-snap.c | 2 +- drivers/md/dm-thin.c | 4 ++-- include/linux/bio.h | 11 +++++++++++ include/linux/blk_types.h | 3 ++- 7 files changed, 47 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index f66a4eae16ee..117da319afb6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -270,7 +270,7 @@ void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; - atomic_set(&bio->bi_remaining, 1); + atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->bi_cnt, 1); } EXPORT_SYMBOL(bio_init); @@ -292,8 +292,8 @@ void bio_reset(struct bio *bio) __bio_free(bio); memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags|(1 << BIO_UPTODATE); - atomic_set(&bio->bi_remaining, 1); + bio->bi_flags = flags | (1 << BIO_UPTODATE); + atomic_set(&bio->__bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); @@ -320,7 +320,7 @@ void bio_chain(struct bio *bio, struct bio *parent) bio->bi_private = parent; bio->bi_end_io = bio_chain_endio; - atomic_inc(&parent->bi_remaining); + bio_inc_remaining(parent); } EXPORT_SYMBOL(bio_chain); @@ -1741,6 +1741,23 @@ void bio_flush_dcache_pages(struct bio *bi) EXPORT_SYMBOL(bio_flush_dcache_pages); #endif +static inline bool bio_remaining_done(struct bio *bio) +{ + /* + * If we're not chaining, then ->__bi_remaining is always 1 and + * we always end io on the first invocation. + */ + if (!bio_flagged(bio, BIO_CHAIN)) + return true; + + BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); + + if (atomic_dec_and_test(&bio->__bi_remaining)) + return true; + + return false; +} + /** * bio_endio - end I/O on a bio * @bio: bio @@ -1758,15 +1775,13 @@ EXPORT_SYMBOL(bio_flush_dcache_pages); void bio_endio(struct bio *bio, int error) { while (bio) { - BUG_ON(atomic_read(&bio->bi_remaining) <= 0); - if (error) clear_bit(BIO_UPTODATE, &bio->bi_flags); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) error = -EIO; - if (!atomic_dec_and_test(&bio->bi_remaining)) - return; + if (unlikely(!bio_remaining_done(bio))) + break; /* * Need to have a real endio function for chained bios, @@ -1799,7 +1814,12 @@ EXPORT_SYMBOL(bio_endio); **/ void bio_endio_nodec(struct bio *bio, int error) { - atomic_inc(&bio->bi_remaining); + /* + * If it's not flagged as a chain, we are not going to dec the count + */ + if (bio_flagged(bio, BIO_CHAIN)) + bio_inc_remaining(bio); + bio_endio(bio, error); } EXPORT_SYMBOL(bio_endio_nodec); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 7755af351867..705eb7b99d69 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -91,7 +91,7 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) * Must bump bi_remaining to allow bio to complete with * restored bi_end_io. */ - atomic_inc(&bio->bi_remaining); + bio_inc_remaining(bio); } /*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 089d62751f7f..d6a1c096b777 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1254,7 +1254,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) dm_bio_restore(bd, bio); bio_record->details.bi_bdev = NULL; - atomic_inc(&bio->bi_remaining); + bio_inc_remaining(bio); queue_bio(ms, bio, rw); return DM_ENDIO_INCOMPLETE; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index f83a0f3fc365..8bfeae218531 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1478,7 +1478,7 @@ out: if (full_bio) { full_bio->bi_end_io = pe->full_bio_end_io; full_bio->bi_private = pe->full_bio_private; - atomic_inc(&full_bio->bi_remaining); + bio_inc_remaining(full_bio); } increment_pending_exceptions_done_count(); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 921aafd12aee..342dbdad6131 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -795,7 +795,7 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) { if (m->bio) { m->bio->bi_end_io = m->saved_bi_end_io; - atomic_inc(&m->bio->bi_remaining); + bio_inc_remaining(m->bio); } cell_error(m->tc->pool, m->cell); list_del(&m->list); @@ -812,7 +812,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) bio = m->bio; if (bio) { bio->bi_end_io = m->saved_bi_end_io; - atomic_inc(&bio->bi_remaining); + bio_inc_remaining(bio); } if (m->err) { diff --git a/include/linux/bio.h b/include/linux/bio.h index da3a127c9958..8bfe9eee6d1a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -644,6 +644,17 @@ static inline struct bio *bio_list_get(struct bio_list *bl) return bio; } +/* + * Increment chain count for the bio. Make sure the CHAIN flag update + * is visible before the raised count. + */ +static inline void bio_inc_remaining(struct bio *bio) +{ + bio->bi_flags |= (1 << BIO_CHAIN); + smp_mb__before_atomic(); + atomic_inc(&bio->__bi_remaining); +} + /* * bio_set is used to allow other portions of the IO system to * allocate their own private memory pools for bio and iovec structures. diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a1b25e35ea5f..8b07e0603887 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -65,7 +65,7 @@ struct bio { unsigned int bi_seg_front_size; unsigned int bi_seg_back_size; - atomic_t bi_remaining; + atomic_t __bi_remaining; bio_end_io_t *bi_end_io; @@ -122,6 +122,7 @@ struct bio { #define BIO_NULL_MAPPED 8 /* contains invalid user pages */ #define BIO_QUIET 9 /* Make BIO Quiet */ #define BIO_SNAP_STABLE 10 /* bio data must be snapshotted during write */ +#define BIO_CHAIN 11 /* chained bio, ->bi_remaining in effect */ /* * Flags starting here get preserved by bio_reset() - this includes -- cgit v1.2.3 From dac56212e8127dbc0bff7be35c508bc280213309 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Apr 2015 16:23:59 -0600 Subject: bio: skip atomic inc/dec of ->bi_cnt for most use cases Struct bio has a reference count that controls when it can be freed. Most uses cases is allocating the bio, which then returns with a single reference to it, doing IO, and then dropping that single reference. We can remove this atomic_dec_and_test() in the completion path, if nobody else is holding a reference to the bio. If someone does call bio_get() on the bio, then we flag the bio as now having valid count and that we must properly honor the reference count when it's being put. Tested-by: Robert Elliott Signed-off-by: Jens Axboe --- block/bio.c | 18 +++++++++++------- drivers/md/bcache/request.c | 2 +- fs/btrfs/volumes.c | 2 +- fs/xfs/xfs_aops.c | 1 - include/linux/bio.h | 16 +++++++++++++++- include/linux/blk_types.h | 3 ++- 6 files changed, 30 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 117da319afb6..c2ff8a88aef1 100644 --- a/block/bio.c +++ b/block/bio.c @@ -271,7 +271,7 @@ void bio_init(struct bio *bio) memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; atomic_set(&bio->__bi_remaining, 1); - atomic_set(&bio->bi_cnt, 1); + atomic_set(&bio->__bi_cnt, 1); } EXPORT_SYMBOL(bio_init); @@ -524,13 +524,17 @@ EXPORT_SYMBOL(zero_fill_bio); **/ void bio_put(struct bio *bio) { - BIO_BUG_ON(!atomic_read(&bio->bi_cnt)); - - /* - * last put frees it - */ - if (atomic_dec_and_test(&bio->bi_cnt)) + if (!bio_flagged(bio, BIO_REFFED)) bio_free(bio); + else { + BIO_BUG_ON(!atomic_read(&bio->__bi_cnt)); + + /* + * last put frees it + */ + if (atomic_dec_and_test(&bio->__bi_cnt)) + bio_free(bio); + } } EXPORT_SYMBOL(bio_put); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index ab43faddb447..1616f668a4cb 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -619,7 +619,7 @@ static void do_bio_hook(struct search *s, struct bio *orig_bio) bio->bi_end_io = request_endio; bio->bi_private = &s->cl; - atomic_set(&bio->bi_cnt, 3); + bio_cnt_set(bio, 3); } static void search_free(struct closure *cl) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 96aebf3bcd5b..8e8d1d1e28a5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -345,7 +345,7 @@ loop_lock: waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); - BUG_ON(atomic_read(&cur->bi_cnt) == 0); + BUG_ON(atomic_read(&cur->__bi_cnt) == 0); /* * if we're doing the sync list, record that our diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a56960dd1684..095f94c2d8b5 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -356,7 +356,6 @@ xfs_end_bio( { xfs_ioend_t *ioend = bio->bi_private; - ASSERT(atomic_read(&bio->bi_cnt) >= 1); ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error; /* Toss bio and pass work off to an xfsdatad thread */ diff --git a/include/linux/bio.h b/include/linux/bio.h index 8bfe9eee6d1a..7486ea103f6e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -290,7 +290,21 @@ static inline unsigned bio_segments(struct bio *bio) * returns. and then bio would be freed memory when if (bio->bi_flags ...) * runs */ -#define bio_get(bio) atomic_inc(&(bio)->bi_cnt) +static inline void bio_get(struct bio *bio) +{ + bio->bi_flags |= (1 << BIO_REFFED); + smp_mb__before_atomic(); + atomic_inc(&bio->__bi_cnt); +} + +static inline void bio_cnt_set(struct bio *bio, unsigned int count) +{ + if (count != 1) { + bio->bi_flags |= (1 << BIO_REFFED); + smp_mb__before_atomic(); + } + atomic_set(&bio->__bi_cnt, count); +} enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8b07e0603887..93d2e7153816 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -92,7 +92,7 @@ struct bio { unsigned short bi_max_vecs; /* max bvl_vecs we can hold */ - atomic_t bi_cnt; /* pin count */ + atomic_t __bi_cnt; /* pin count */ struct bio_vec *bi_io_vec; /* the actual vec list */ @@ -123,6 +123,7 @@ struct bio { #define BIO_QUIET 9 /* Make BIO Quiet */ #define BIO_SNAP_STABLE 10 /* bio data must be snapshotted during write */ #define BIO_CHAIN 11 /* chained bio, ->bi_remaining in effect */ +#define BIO_REFFED 12 /* bio has elevated ->bi_cnt */ /* * Flags starting here get preserved by bio_reset() - this includes -- cgit v1.2.3 From 84be456f883c4685680fba8e5154b5f72e92957e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 1 May 2015 12:46:15 +0200 Subject: remove We don't have any arch specific scatterlist now that parisc switched over to the generic one. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- arch/alpha/include/asm/pci.h | 2 +- arch/arm/include/asm/dma.h | 2 +- arch/arm/mach-footbridge/dma.c | 2 +- arch/blackfin/include/asm/pci.h | 2 +- arch/cris/include/asm/dma-mapping.h | 2 +- arch/cris/include/asm/pci.h | 2 +- arch/frv/include/asm/dma-mapping.h | 2 +- arch/frv/include/asm/pci.h | 2 +- arch/ia64/include/asm/pci.h | 2 +- arch/microblaze/include/asm/pci.h | 2 +- arch/mips/include/asm/dma-mapping.h | 2 +- arch/mips/include/asm/pci.h | 2 +- arch/mn10300/include/asm/pci.h | 2 +- arch/parisc/include/asm/dma-mapping.h | 2 +- arch/parisc/include/asm/pci.h | 2 +- arch/powerpc/include/asm/pci.h | 2 +- arch/powerpc/include/asm/vio.h | 2 +- arch/sparc/kernel/iommu_common.h | 2 +- arch/x86/include/asm/pci.h | 2 +- arch/xtensa/include/asm/pci.h | 2 +- drivers/mmc/host/android-goldfish.c | 2 +- include/asm-generic/scatterlist.h | 34 ------------------------------ include/linux/blkdev.h | 3 +-- include/linux/dmapool.h | 2 +- include/linux/scatterlist.h | 39 ++++++++++++++++++++++++++++------- lib/swiotlb.c | 2 +- 26 files changed, 56 insertions(+), 66 deletions(-) delete mode 100644 include/asm-generic/scatterlist.h (limited to 'include/linux') diff --git a/arch/alpha/include/asm/pci.h b/arch/alpha/include/asm/pci.h index f7f680f7457d..bf7d97dce9e8 100644 --- a/arch/alpha/include/asm/pci.h +++ b/arch/alpha/include/asm/pci.h @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/include/asm/dma.h b/arch/arm/include/asm/dma.h index 99084431d6ae..bb4fa67da541 100644 --- a/arch/arm/include/asm/dma.h +++ b/arch/arm/include/asm/dma.h @@ -19,7 +19,7 @@ * It should not be re-used except for that purpose. */ #include -#include +#include #include diff --git a/arch/arm/mach-footbridge/dma.c b/arch/arm/mach-footbridge/dma.c index e2e0df8bcee2..22536b85a81d 100644 --- a/arch/arm/mach-footbridge/dma.c +++ b/arch/arm/mach-footbridge/dma.c @@ -13,9 +13,9 @@ #include #include #include +#include #include -#include #include #include diff --git a/arch/blackfin/include/asm/pci.h b/arch/blackfin/include/asm/pci.h index c737909fba47..14efc0db1ade 100644 --- a/arch/blackfin/include/asm/pci.h +++ b/arch/blackfin/include/asm/pci.h @@ -3,7 +3,7 @@ #ifndef _ASM_BFIN_PCI_H #define _ASM_BFIN_PCI_H -#include +#include #include #include diff --git a/arch/cris/include/asm/dma-mapping.h b/arch/cris/include/asm/dma-mapping.h index 2f0f654f1b44..57f794ee6039 100644 --- a/arch/cris/include/asm/dma-mapping.h +++ b/arch/cris/include/asm/dma-mapping.h @@ -5,10 +5,10 @@ #include #include +#include #include #include -#include #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) diff --git a/arch/cris/include/asm/pci.h b/arch/cris/include/asm/pci.h index cc2399c175e9..c15b4b4baafa 100644 --- a/arch/cris/include/asm/pci.h +++ b/arch/cris/include/asm/pci.h @@ -29,7 +29,7 @@ int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); #include #include -#include +#include #include #include diff --git a/arch/frv/include/asm/dma-mapping.h b/arch/frv/include/asm/dma-mapping.h index 1746a2b8e6e7..2840adcd6d92 100644 --- a/arch/frv/include/asm/dma-mapping.h +++ b/arch/frv/include/asm/dma-mapping.h @@ -2,9 +2,9 @@ #define _ASM_DMA_MAPPING_H #include +#include #include #include -#include #include /* diff --git a/arch/frv/include/asm/pci.h b/arch/frv/include/asm/pci.h index 2035a4d3f9b9..43d224685144 100644 --- a/arch/frv/include/asm/pci.h +++ b/arch/frv/include/asm/pci.h @@ -14,7 +14,7 @@ #define _ASM_FRV_PCI_H #include -#include +#include #include #include diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h index 52af5ed9f60b..0c0e25d6427a 100644 --- a/arch/ia64/include/asm/pci.h +++ b/arch/ia64/include/asm/pci.h @@ -6,9 +6,9 @@ #include #include #include +#include #include -#include #include struct pci_vector_struct { diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h index 468aca8cec0d..81f27aef979c 100644 --- a/arch/microblaze/include/asm/pci.h +++ b/arch/microblaze/include/asm/pci.h @@ -16,8 +16,8 @@ #include #include #include +#include -#include #include #include #include diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h index fd1b4a150759..360b3387182a 100644 --- a/arch/mips/include/asm/dma-mapping.h +++ b/arch/mips/include/asm/dma-mapping.h @@ -1,7 +1,7 @@ #ifndef _ASM_DMA_MAPPING_H #define _ASM_DMA_MAPPING_H -#include +#include #include #include #include diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h index d9692993fc83..3413b10d833b 100644 --- a/arch/mips/include/asm/pci.h +++ b/arch/mips/include/asm/pci.h @@ -99,7 +99,7 @@ static inline void pci_resource_to_user(const struct pci_dev *dev, int bar, #include #include -#include +#include #include #include #include diff --git a/arch/mn10300/include/asm/pci.h b/arch/mn10300/include/asm/pci.h index 5f70af25c7d0..393b51d38f36 100644 --- a/arch/mn10300/include/asm/pci.h +++ b/arch/mn10300/include/asm/pci.h @@ -55,7 +55,7 @@ void pcibios_set_master(struct pci_dev *dev); #include #include -#include +#include #include #include diff --git a/arch/parisc/include/asm/dma-mapping.h b/arch/parisc/include/asm/dma-mapping.h index d0eae5f2bd87..d8d60a57183f 100644 --- a/arch/parisc/include/asm/dma-mapping.h +++ b/arch/parisc/include/asm/dma-mapping.h @@ -2,8 +2,8 @@ #define _PARISC_DMA_MAPPING_H #include +#include #include -#include /* See Documentation/DMA-API-HOWTO.txt */ struct hppa_dma_ops { diff --git a/arch/parisc/include/asm/pci.h b/arch/parisc/include/asm/pci.h index 20df2b04fc09..794d8820db5d 100644 --- a/arch/parisc/include/asm/pci.h +++ b/arch/parisc/include/asm/pci.h @@ -1,7 +1,7 @@ #ifndef __ASM_PARISC_PCI_H #define __ASM_PARISC_PCI_H -#include +#include diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 4aef8d660999..3be43ab63181 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -13,9 +13,9 @@ #include #include #include +#include #include -#include #include #include #include diff --git a/arch/powerpc/include/asm/vio.h b/arch/powerpc/include/asm/vio.h index 4f9b7ca0710f..84286ec77b12 100644 --- a/arch/powerpc/include/asm/vio.h +++ b/arch/powerpc/include/asm/vio.h @@ -19,9 +19,9 @@ #include #include #include +#include #include -#include /* * Architecture-specific constants for drivers to diff --git a/arch/sparc/kernel/iommu_common.h b/arch/sparc/kernel/iommu_common.h index f4be0d724fc6..b40cec252905 100644 --- a/arch/sparc/kernel/iommu_common.h +++ b/arch/sparc/kernel/iommu_common.h @@ -13,9 +13,9 @@ #include #include #include +#include #include -#include /* * These give mapping size of each iommu pte/tlb. diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 4e370a5d8117..81da003df21b 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/xtensa/include/asm/pci.h b/arch/xtensa/include/asm/pci.h index 5d52dc43dfe7..e438a00fbd63 100644 --- a/arch/xtensa/include/asm/pci.h +++ b/arch/xtensa/include/asm/pci.h @@ -33,7 +33,7 @@ extern struct pci_controller* pcibios_alloc_controller(void); #include #include -#include +#include #include #include diff --git a/drivers/mmc/host/android-goldfish.c b/drivers/mmc/host/android-goldfish.c index 8b4e20a3f16c..b1eac719a4cc 100644 --- a/drivers/mmc/host/android-goldfish.c +++ b/drivers/mmc/host/android-goldfish.c @@ -42,10 +42,10 @@ #include #include #include +#include #include #include -#include #include #include diff --git a/include/asm-generic/scatterlist.h b/include/asm-generic/scatterlist.h deleted file mode 100644 index 5de07355fad4..000000000000 --- a/include/asm-generic/scatterlist.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef __ASM_GENERIC_SCATTERLIST_H -#define __ASM_GENERIC_SCATTERLIST_H - -#include - -struct scatterlist { -#ifdef CONFIG_DEBUG_SG - unsigned long sg_magic; -#endif - unsigned long page_link; - unsigned int offset; - unsigned int length; - dma_addr_t dma_address; -#ifdef CONFIG_NEED_SG_DMA_LENGTH - unsigned int dma_length; -#endif -}; - -/* - * These macros should be used after a dma_map_sg call has been done - * to get bus addresses of each of the SG entries and their lengths. - * You should only work with the number of sg entries pci_map_sg - * returns, or alternatively stop on the first sg_dma_len(sg) which - * is 0. - */ -#define sg_dma_address(sg) ((sg)->dma_address) - -#ifdef CONFIG_NEED_SG_DMA_LENGTH -#define sg_dma_len(sg) ((sg)->dma_length) -#else -#define sg_dma_len(sg) ((sg)->length) -#endif - -#endif /* __ASM_GENERIC_SCATTERLIST_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7f9a516f24de..504af1e65ce1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -22,8 +22,7 @@ #include #include #include - -#include +#include struct module; struct scsi_ioctl_command; diff --git a/include/linux/dmapool.h b/include/linux/dmapool.h index 52456aa566a0..e1043f79122f 100644 --- a/include/linux/dmapool.h +++ b/include/linux/dmapool.h @@ -11,8 +11,8 @@ #ifndef LINUX_DMAPOOL_H #define LINUX_DMAPOOL_H +#include #include -#include struct device; diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index ed8f9e70df9b..eca1ec93775c 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -2,13 +2,39 @@ #define _LINUX_SCATTERLIST_H #include +#include #include #include - -#include -#include #include +struct scatterlist { +#ifdef CONFIG_DEBUG_SG + unsigned long sg_magic; +#endif + unsigned long page_link; + unsigned int offset; + unsigned int length; + dma_addr_t dma_address; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + unsigned int dma_length; +#endif +}; + +/* + * These macros should be used after a dma_map_sg call has been done + * to get bus addresses of each of the SG entries and their lengths. + * You should only work with the number of sg entries dma_map_sg + * returns, or alternatively stop on the first sg_dma_len(sg) which + * is 0. + */ +#define sg_dma_address(sg) ((sg)->dma_address) + +#ifdef CONFIG_NEED_SG_DMA_LENGTH +#define sg_dma_len(sg) ((sg)->dma_length) +#else +#define sg_dma_len(sg) ((sg)->length) +#endif + struct sg_table { struct scatterlist *sgl; /* the list */ unsigned int nents; /* number of mapped entries */ @@ -18,10 +44,9 @@ struct sg_table { /* * Notes on SG table design. * - * Architectures must provide an unsigned long page_link field in the - * scatterlist struct. We use that to place the page pointer AND encode - * information about the sg table as well. The two lower bits are reserved - * for this information. + * We use the unsigned long page_link field in the scatterlist struct to place + * the page pointer AND encode information about the sg table as well. The two + * lower bits are reserved for this information. * * If bit 0 is set, then the page_link contains a pointer to the next sg * table list. Otherwise the next entry is at sg + 1. diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 4abda074ea45..341268841b31 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -29,10 +29,10 @@ #include #include #include +#include #include #include -#include #include #include -- cgit v1.2.3 From 4f8c9510ba71bb54477841bebb90154ef140860f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:16 +0200 Subject: block: rename REQ_TYPE_SPECIAL to REQ_TYPE_DRV_PRIV Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 +- drivers/block/paride/pd.c | 4 ++-- drivers/block/sx8.c | 4 ++-- drivers/block/virtio_blk.c | 6 +++--- drivers/ide/ide-atapi.c | 4 ++-- drivers/ide/ide-cd.c | 2 +- drivers/ide/ide-cd_ioctl.c | 2 +- drivers/ide/ide-devsets.c | 2 +- drivers/ide/ide-eh.c | 2 +- drivers/ide/ide-floppy.c | 6 +++--- drivers/ide/ide-io.c | 4 ++-- drivers/ide/ide-ioctls.c | 2 +- drivers/ide/ide-park.c | 4 ++-- drivers/ide/ide-tape.c | 4 ++-- include/linux/blkdev.h | 4 ++-- 15 files changed, 26 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 39e5f7fae3ef..9cf52ac328fe 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -592,7 +592,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, fsync_bdev(bdev); mutex_lock(&nbd->tx_lock); blk_rq_init(NULL, &sreq); - sreq.cmd_type = REQ_TYPE_SPECIAL; + sreq.cmd_type = REQ_TYPE_DRV_PRIV; nbd_cmd(&sreq) = NBD_CMD_DISC; /* Check again after getting mutex back. */ diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index d48715b287e6..dbb4da1cdca8 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -442,7 +442,7 @@ static char *pd_buf; /* buffer for request in progress */ static enum action do_pd_io_start(void) { - if (pd_req->cmd_type == REQ_TYPE_SPECIAL) { + if (pd_req->cmd_type == REQ_TYPE_DRV_PRIV) { phase = pd_special; return pd_special(); } @@ -725,7 +725,7 @@ static int pd_special_command(struct pd_unit *disk, if (IS_ERR(rq)) return PTR_ERR(rq); - rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->special = func; err = blk_execute_rq(disk->gd->queue, disk->gd, rq, 0); diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 5d552857de41..59c91d49b14b 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -620,7 +620,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx) spin_unlock_irq(&host->lock); DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); - crq->rq->cmd_type = REQ_TYPE_SPECIAL; + crq->rq->cmd_type = REQ_TYPE_DRV_PRIV; crq->rq->special = crq; blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); @@ -661,7 +661,7 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) crq->msg_bucket = (u32) rc; DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); - crq->rq->cmd_type = REQ_TYPE_SPECIAL; + crq->rq->cmd_type = REQ_TYPE_DRV_PRIV; crq->rq->special = crq; blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 5ea2f0bbbc7c..d4d05f064d39 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -124,7 +124,7 @@ static inline void virtblk_request_done(struct request *req) req->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual); req->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len); req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors); - } else if (req->cmd_type == REQ_TYPE_SPECIAL) { + } else if (req->cmd_type == REQ_TYPE_DRV_PRIV) { req->errors = (error != 0); } @@ -188,7 +188,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, vbr->out_hdr.sector = 0; vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); break; - case REQ_TYPE_SPECIAL: + case REQ_TYPE_DRV_PRIV: vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID); vbr->out_hdr.sector = 0; vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); @@ -251,7 +251,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) return PTR_ERR(req); } - req->cmd_type = REQ_TYPE_SPECIAL; + req->cmd_type = REQ_TYPE_DRV_PRIV; err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); blk_put_request(req); diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index fac3d9da2e07..b367300ce479 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -93,7 +93,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, int error; rq = blk_get_request(drive->queue, READ, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->special = (char *)pc; if (buf && bufflen) { @@ -477,7 +477,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) if (uptodate == 0) drive->failed_pc = NULL; - if (rq->cmd_type == REQ_TYPE_SPECIAL) { + if (rq->cmd_type == REQ_TYPE_DRV_PRIV) { rq->errors = 0; error = 0; } else { diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 0b510bafd90e..9a32603c6c74 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -799,7 +799,7 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, cdrom_do_block_pc(drive, rq); break; - case REQ_TYPE_SPECIAL: + case REQ_TYPE_DRV_PRIV: /* right now this can only be a reset... */ uptodate = 1; goto out_end; diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index 02caa7dd51c8..066e39036518 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ b/drivers/ide/ide-cd_ioctl.c @@ -304,7 +304,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi) int ret; rq = blk_get_request(drive->queue, READ, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd_flags = REQ_QUIET; ret = blk_execute_rq(drive->queue, cd->disk, rq, 0); blk_put_request(rq); diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index 9e98122f646e..b05a74d78ef5 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -166,7 +166,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, return setting->set(drive, arg); rq = blk_get_request(q, READ, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd_len = 5; rq->cmd[0] = REQ_DEVSET_EXEC; *(int *)&rq->cmd[1] = arg; diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index 32970664c275..19d809c48a8d 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -147,7 +147,7 @@ static inline void ide_complete_drive_reset(ide_drive_t *drive, int err) { struct request *rq = drive->hwif->rq; - if (rq && rq->cmd_type == REQ_TYPE_SPECIAL && + if (rq && rq->cmd_type == REQ_TYPE_DRV_PRIV && rq->cmd[0] == REQ_DRIVE_RESET) { if (err <= 0 && rq->errors == 0) rq->errors = -EIO; diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 8c6363cdd208..3dbfa5b6db6a 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -97,7 +97,7 @@ static int ide_floppy_callback(ide_drive_t *drive, int dsc) "Aborting request!\n"); } - if (rq->cmd_type == REQ_TYPE_SPECIAL) + if (rq->cmd_type == REQ_TYPE_DRV_PRIV) rq->errors = uptodate ? 0 : IDE_DRV_ERROR_GENERAL; return uptodate; @@ -246,7 +246,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, } else printk(KERN_ERR PFX "%s: I/O error\n", drive->name); - if (rq->cmd_type == REQ_TYPE_SPECIAL) { + if (rq->cmd_type == REQ_TYPE_DRV_PRIV) { rq->errors = 0; ide_complete_rq(drive, 0, blk_rq_bytes(rq)); return ide_stopped; @@ -265,7 +265,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, pc = &floppy->queued_pc; idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block); break; - case REQ_TYPE_SPECIAL: + case REQ_TYPE_DRV_PRIV: case REQ_TYPE_SENSE: pc = (struct ide_atapi_pc *)rq->special; break; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 177db6d5b2f5..8e55abd03a24 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -135,7 +135,7 @@ EXPORT_SYMBOL(ide_complete_rq); void ide_kill_rq(ide_drive_t *drive, struct request *rq) { - u8 drv_req = (rq->cmd_type == REQ_TYPE_SPECIAL) && rq->rq_disk; + u8 drv_req = (rq->cmd_type == REQ_TYPE_DRV_PRIV) && rq->rq_disk; u8 media = drive->media; drive->failed_pc = NULL; @@ -353,7 +353,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) pm->pm_step == IDE_PM_COMPLETED) ide_complete_pm_rq(drive, rq); return startstop; - } else if (!rq->rq_disk && rq->cmd_type == REQ_TYPE_SPECIAL) + } else if (!rq->rq_disk && rq->cmd_type == REQ_TYPE_DRV_PRIV) /* * TODO: Once all ULDs have been modified to * check for specific op codes rather than diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index 6233fa2cb8a9..aa2e9b77b20d 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -222,7 +222,7 @@ static int generic_drive_reset(ide_drive_t *drive) int ret = 0; rq = blk_get_request(drive->queue, READ, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd_len = 1; rq->cmd[0] = REQ_DRIVE_RESET; if (blk_execute_rq(drive->queue, NULL, rq, 1)) diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index ca958604cda2..c80868520488 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -34,7 +34,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) rq = blk_get_request(q, READ, __GFP_WAIT); rq->cmd[0] = REQ_PARK_HEADS; rq->cmd_len = 1; - rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->special = &timeout; rc = blk_execute_rq(q, NULL, rq, 1); blk_put_request(rq); @@ -51,7 +51,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) rq->cmd[0] = REQ_UNPARK_HEADS; rq->cmd_len = 1; - rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_type = REQ_TYPE_DRV_PRIV; elv_add_request(q, rq, ELEVATOR_INSERT_FRONT); out: diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 6eb738ca6d2f..2a5d543db9f5 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -576,7 +576,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, rq->cmd[0], (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq)); - BUG_ON(!(rq->cmd_type == REQ_TYPE_SPECIAL || + BUG_ON(!(rq->cmd_type == REQ_TYPE_DRV_PRIV || rq->cmd_type == REQ_TYPE_SENSE)); /* Retry a failed packet command */ @@ -853,7 +853,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) BUG_ON(size < 0 || size % tape->blk_size); rq = blk_get_request(drive->queue, READ, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd[13] = cmd; rq->rq_disk = tape->disk; rq->__sector = tape->first_frame; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7f9a516f24de..98c90272443b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -79,10 +79,10 @@ enum rq_cmd_type_bits { REQ_TYPE_PM_SUSPEND, /* suspend request */ REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ - REQ_TYPE_SPECIAL, /* driver defined type */ + REQ_TYPE_DRV_PRIV, /* driver defined type */ /* * for ATA/ATAPI devices. this really doesn't belong here, ide should - * use REQ_TYPE_SPECIAL and use rq->cmd[0] with the range of driver + * use REQ_TYPE_DRV_PRIV and use rq->cmd[0] with the range of driver * private REQ_LB opcodes to differentiate what type of request this is */ REQ_TYPE_ATA_TASKFILE, -- cgit v1.2.3 From b42171ef7d938a66fa52e66a3d911ed63770b5ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:17 +0200 Subject: block: move REQ_TYPE_ATA_TASKFILE and REQ_TYPE_ATA_PC to ide.h These values are only used by the IDE driver, so move them into it by allowing drivers to take cmd_type values after the first private one. Note that we have to turn cmd_type into a plain unsigned integer so that gcc doesn't complain about mismatching enum types. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 11 ++--------- include/linux/ide.h | 7 +++++++ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 98c90272443b..9cb4d80a4987 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -79,14 +79,7 @@ enum rq_cmd_type_bits { REQ_TYPE_PM_SUSPEND, /* suspend request */ REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ - REQ_TYPE_DRV_PRIV, /* driver defined type */ - /* - * for ATA/ATAPI devices. this really doesn't belong here, ide should - * use REQ_TYPE_DRV_PRIV and use rq->cmd[0] with the range of driver - * private REQ_LB opcodes to differentiate what type of request this is - */ - REQ_TYPE_ATA_TASKFILE, - REQ_TYPE_ATA_PC, + REQ_TYPE_DRV_PRIV, /* driver defined types from here */ }; #define BLK_MAX_CDB 16 @@ -108,7 +101,7 @@ struct request { struct blk_mq_ctx *mq_ctx; u64 cmd_flags; - enum rq_cmd_type_bits cmd_type; + unsigned cmd_type; unsigned long atomic_flags; int cpu; diff --git a/include/linux/ide.h b/include/linux/ide.h index 93b5ca754b5b..62ac399144a6 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -39,6 +39,12 @@ struct device; +/* IDE-specific values for req->cmd_type */ +enum ata_cmd_type_bits { + REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1, + REQ_TYPE_ATA_PC, +}; + /* Error codes returned in rq->errors to the higher part of the driver. */ enum { IDE_DRV_ERROR_GENERAL = 101, @@ -1551,4 +1557,5 @@ static inline void ide_set_drivedata(ide_drive_t *drive, void *data) #define ide_host_for_each_port(i, port, host) \ for ((i) = 0; ((port) = (host)->ports[i]) || (i) < MAX_HOST_PORTS; (i)++) + #endif /* _IDE_H */ -- cgit v1.2.3 From b0b93b48a30e809240ddd7449a6ad60a5ddf7b4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:18 +0200 Subject: block: move REQ_TYPE_SENSE to the ide driver Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/ide/ide-atapi.c | 6 +++--- drivers/ide/ide-cd.c | 8 ++++---- drivers/ide/ide-floppy.c | 2 +- drivers/ide/ide-tape.c | 2 +- include/linux/blkdev.h | 1 - include/linux/ide.h | 1 + 6 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index b367300ce479..1362ad80a76c 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -191,7 +191,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) BUG_ON(sense_len > sizeof(*sense)); - if (rq->cmd_type == REQ_TYPE_SENSE || drive->sense_rq_armed) + if (rq->cmd_type == REQ_TYPE_ATA_SENSE || drive->sense_rq_armed) return; memset(sense, 0, sizeof(*sense)); @@ -210,7 +210,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) sense_rq->rq_disk = rq->rq_disk; sense_rq->cmd[0] = GPCMD_REQUEST_SENSE; sense_rq->cmd[4] = cmd_len; - sense_rq->cmd_type = REQ_TYPE_SENSE; + sense_rq->cmd_type = REQ_TYPE_ATA_SENSE; sense_rq->cmd_flags |= REQ_PREEMPT; if (drive->media == ide_tape) @@ -310,7 +310,7 @@ int ide_cd_get_xferlen(struct request *rq) switch (rq->cmd_type) { case REQ_TYPE_FS: return 32768; - case REQ_TYPE_SENSE: + case REQ_TYPE_ATA_SENSE: case REQ_TYPE_BLOCK_PC: case REQ_TYPE_ATA_PC: return blk_rq_bytes(rq); diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 9a32603c6c74..64a6b827b3dd 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -210,7 +210,7 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive, static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) { /* - * For REQ_TYPE_SENSE, "rq->special" points to the original + * For REQ_TYPE_ATA_SENSE, "rq->special" points to the original * failed request. Also, the sense data should be read * directly from rq which might be different from the original * sense buffer if it got copied during mapping. @@ -285,7 +285,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) "stat 0x%x", rq->cmd[0], rq->cmd_type, err, stat); - if (rq->cmd_type == REQ_TYPE_SENSE) { + if (rq->cmd_type == REQ_TYPE_ATA_SENSE) { /* * We got an error trying to get sense info from the drive * (probably while trying to recover from a former error). @@ -526,7 +526,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) ide_expiry_t *expiry = NULL; int dma_error = 0, dma, thislen, uptodate = 0; int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc = 0; - int sense = (rq->cmd_type == REQ_TYPE_SENSE); + int sense = (rq->cmd_type == REQ_TYPE_ATA_SENSE); unsigned int timeout; u16 len; u8 ireason, stat; @@ -791,7 +791,7 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, if (cdrom_start_rw(drive, rq) == ide_stopped) goto out_end; break; - case REQ_TYPE_SENSE: + case REQ_TYPE_ATA_SENSE: case REQ_TYPE_BLOCK_PC: case REQ_TYPE_ATA_PC: if (!rq->timeout) diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 3dbfa5b6db6a..2fb5350c5410 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -266,7 +266,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block); break; case REQ_TYPE_DRV_PRIV: - case REQ_TYPE_SENSE: + case REQ_TYPE_ATA_SENSE: pc = (struct ide_atapi_pc *)rq->special; break; case REQ_TYPE_BLOCK_PC: diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 2a5d543db9f5..f5d51d1d09ee 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -577,7 +577,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, blk_rq_sectors(rq)); BUG_ON(!(rq->cmd_type == REQ_TYPE_DRV_PRIV || - rq->cmd_type == REQ_TYPE_SENSE)); + rq->cmd_type == REQ_TYPE_ATA_SENSE)); /* Retry a failed packet command */ if (drive->failed_pc && drive->pc->c[0] == REQUEST_SENSE) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9cb4d80a4987..6076b9e18dcb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -75,7 +75,6 @@ struct request_list { enum rq_cmd_type_bits { REQ_TYPE_FS = 1, /* fs request */ REQ_TYPE_BLOCK_PC, /* scsi command */ - REQ_TYPE_SENSE, /* sense request */ REQ_TYPE_PM_SUSPEND, /* suspend request */ REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ diff --git a/include/linux/ide.h b/include/linux/ide.h index 62ac399144a6..9856b7d455d9 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -43,6 +43,7 @@ struct device; enum ata_cmd_type_bits { REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1, REQ_TYPE_ATA_PC, + REQ_TYPE_ATA_SENSE, /* sense request */ }; /* Error codes returned in rq->errors to the higher part of the driver. */ -- cgit v1.2.3 From ac7cdff00a33d48d27217560fa3b16d802e5f535 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:19 +0200 Subject: block: remove REQ_TYPE_PM_SHUTDOWN Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6076b9e18dcb..c2829ba5e738 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -77,7 +77,6 @@ enum rq_cmd_type_bits { REQ_TYPE_BLOCK_PC, /* scsi command */ REQ_TYPE_PM_SUSPEND, /* suspend request */ REQ_TYPE_PM_RESUME, /* resume request */ - REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ REQ_TYPE_DRV_PRIV, /* driver defined types from here */ }; -- cgit v1.2.3 From a7928c1578c550bd6f4dec62d65132e6db226c57 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:20 +0200 Subject: block: move PM request support to IDE This removes the request types and hacks from the block code and into the old IDE driver. There is a small amunt of code duplication due to this, but it's not too bad. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + block/blk-exec.c | 10 --------- block/blk.h | 2 -- drivers/ide/ide-eh.c | 2 +- drivers/ide/ide-io.c | 8 +++---- drivers/ide/ide-pm.c | 56 +++++++++++++++++++++++++++++++++++----------- drivers/ide/ide-taskfile.c | 2 +- include/linux/blkdev.h | 21 +---------------- include/linux/ide.h | 19 ++++++++++++++++ 9 files changed, 70 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index fd154b94447a..2e5020f37d55 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -285,6 +285,7 @@ inline void __blk_run_queue_uncond(struct request_queue *q) q->request_fn(q); q->request_fn_active--; } +EXPORT_SYMBOL_GPL(__blk_run_queue_uncond); /** * __blk_run_queue - run a single device queue diff --git a/block/blk-exec.c b/block/blk-exec.c index 9924725fa50d..3fec8a29d0fa 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -53,7 +53,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq_end_io_fn *done) { int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; - bool is_pm_resume; WARN_ON(irqs_disabled()); WARN_ON(rq->cmd_type == REQ_TYPE_FS); @@ -70,12 +69,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, return; } - /* - * need to check this before __blk_run_queue(), because rq can - * be freed before that returns. - */ - is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME; - spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_dying(q))) { @@ -88,9 +81,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, __elv_add_request(q, rq, where); __blk_run_queue(q); - /* the queue is stopped so it won't be run */ - if (is_pm_resume) - __blk_run_queue_uncond(q); spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); diff --git a/block/blk.h b/block/blk.h index 43b036185712..4b48d55e588e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -193,8 +193,6 @@ int blk_try_merge(struct request *rq, struct bio *bio); void blk_queue_congestion_threshold(struct request_queue *q); -void __blk_run_queue_uncond(struct request_queue *q); - int blk_dev_init(void); diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index 19d809c48a8d..d6da011299f5 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -129,7 +129,7 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat) if (cmd) ide_complete_cmd(drive, cmd, stat, err); - } else if (blk_pm_request(rq)) { + } else if (ata_pm_request(rq)) { rq->errors = 1; ide_complete_pm_rq(drive, rq); return ide_stopped; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 8e55abd03a24..669ea1e45795 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -320,7 +320,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) goto kill_rq; } - if (blk_pm_request(rq)) + if (ata_pm_request(rq)) ide_check_pm_state(drive, rq); drive->hwif->tp_ops->dev_select(drive); @@ -342,8 +342,8 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) return execute_drive_cmd(drive, rq); - else if (blk_pm_request(rq)) { - struct request_pm_state *pm = rq->special; + else if (ata_pm_request(rq)) { + struct ide_pm_state *pm = rq->special; #ifdef DEBUG_PM printk("%s: start_power_step(step: %d)\n", drive->name, pm->pm_step); @@ -538,7 +538,7 @@ repeat: * state machine. */ if ((drive->dev_flags & IDE_DFLAG_BLOCKED) && - blk_pm_request(rq) == 0 && + ata_pm_request(rq) == 0 && (rq->cmd_flags & REQ_PREEMPT) == 0) { /* there should be no pending command at this point */ ide_unlock_port(hwif); diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 8d1e32d7cd97..081e43458d50 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -8,7 +8,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) ide_drive_t *pair = ide_get_pair_dev(drive); ide_hwif_t *hwif = drive->hwif; struct request *rq; - struct request_pm_state rqpm; + struct ide_pm_state rqpm; int ret; if (ide_port_acpi(hwif)) { @@ -19,7 +19,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) memset(&rqpm, 0, sizeof(rqpm)); rq = blk_get_request(drive->queue, READ, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_PM_SUSPEND; + rq->cmd_type = REQ_TYPE_ATA_PM_SUSPEND; rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_SUSPEND; if (mesg.event == PM_EVENT_PRETHAW) @@ -38,13 +38,43 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) return ret; } +static void ide_end_sync_rq(struct request *rq, int error) +{ + complete(rq->end_io_data); +} + +static int ide_pm_execute_rq(struct request *rq) +{ + struct request_queue *q = rq->q; + DECLARE_COMPLETION_ONSTACK(wait); + + rq->end_io_data = &wait; + rq->end_io = ide_end_sync_rq; + + spin_lock_irq(q->queue_lock); + if (unlikely(blk_queue_dying(q))) { + rq->cmd_flags |= REQ_QUIET; + rq->errors = -ENXIO; + __blk_end_request_all(rq, rq->errors); + spin_unlock_irq(q->queue_lock); + return -ENXIO; + } + __elv_add_request(q, rq, ELEVATOR_INSERT_FRONT); + __blk_run_queue_uncond(q); + spin_unlock_irq(q->queue_lock); + + wait_for_completion_io(&wait); + + return rq->errors ? -EIO : 0; +} + int generic_ide_resume(struct device *dev) { ide_drive_t *drive = to_ide_device(dev); ide_drive_t *pair = ide_get_pair_dev(drive); ide_hwif_t *hwif = drive->hwif; struct request *rq; - struct request_pm_state rqpm; + struct ide_pm_state rqpm; int err; if (ide_port_acpi(hwif)) { @@ -59,13 +89,13 @@ int generic_ide_resume(struct device *dev) memset(&rqpm, 0, sizeof(rqpm)); rq = blk_get_request(drive->queue, READ, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_PM_RESUME; + rq->cmd_type = REQ_TYPE_ATA_PM_RESUME; rq->cmd_flags |= REQ_PREEMPT; rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_RESUME; rqpm.pm_state = PM_EVENT_ON; - err = blk_execute_rq(drive->queue, NULL, rq, 1); + err = ide_pm_execute_rq(rq); blk_put_request(rq); if (err == 0 && dev->driver) { @@ -80,7 +110,7 @@ int generic_ide_resume(struct device *dev) void ide_complete_power_step(ide_drive_t *drive, struct request *rq) { - struct request_pm_state *pm = rq->special; + struct ide_pm_state *pm = rq->special; #ifdef DEBUG_PM printk(KERN_INFO "%s: complete_power_step(step: %d)\n", @@ -110,7 +140,7 @@ void ide_complete_power_step(ide_drive_t *drive, struct request *rq) ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) { - struct request_pm_state *pm = rq->special; + struct ide_pm_state *pm = rq->special; struct ide_cmd cmd = { }; switch (pm->pm_step) { @@ -182,7 +212,7 @@ out_do_tf: void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) { struct request_queue *q = drive->queue; - struct request_pm_state *pm = rq->special; + struct ide_pm_state *pm = rq->special; unsigned long flags; ide_complete_power_step(drive, rq); @@ -191,10 +221,10 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) #ifdef DEBUG_PM printk("%s: completing PM request, %s\n", drive->name, - (rq->cmd_type == REQ_TYPE_PM_SUSPEND) ? "suspend" : "resume"); + (rq->cmd_type == REQ_TYPE_ATA_PM_SUSPEND) ? "suspend" : "resume"); #endif spin_lock_irqsave(q->queue_lock, flags); - if (rq->cmd_type == REQ_TYPE_PM_SUSPEND) + if (rq->cmd_type == REQ_TYPE_ATA_PM_SUSPEND) blk_stop_queue(q); else drive->dev_flags &= ~IDE_DFLAG_BLOCKED; @@ -208,13 +238,13 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) void ide_check_pm_state(ide_drive_t *drive, struct request *rq) { - struct request_pm_state *pm = rq->special; + struct ide_pm_state *pm = rq->special; - if (rq->cmd_type == REQ_TYPE_PM_SUSPEND && + if (rq->cmd_type == REQ_TYPE_ATA_PM_SUSPEND && pm->pm_step == IDE_PM_START_SUSPEND) /* Mark drive blocked when starting the suspend sequence. */ drive->dev_flags |= IDE_DFLAG_BLOCKED; - else if (rq->cmd_type == REQ_TYPE_PM_RESUME && + else if (rq->cmd_type == REQ_TYPE_ATA_PM_RESUME && pm->pm_step == IDE_PM_START_RESUME) { /* * The first thing we do on wakeup is to wait for BSY bit to diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index dabb88b1cbec..0979e126fff1 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -186,7 +186,7 @@ static ide_startstop_t task_no_data_intr(ide_drive_t *drive) tf->command == ATA_CMD_CHK_POWER) { struct request *rq = hwif->rq; - if (blk_pm_request(rq)) + if (ata_pm_request(rq)) ide_complete_pm_rq(drive, rq); else ide_finish_cmd(drive, cmd, stat); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c2829ba5e738..2da818a48097 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -30,7 +30,6 @@ struct scsi_ioctl_command; struct request_queue; struct elevator_queue; -struct request_pm_state; struct blk_trace; struct request; struct sg_io_hdr; @@ -75,8 +74,6 @@ struct request_list { enum rq_cmd_type_bits { REQ_TYPE_FS = 1, /* fs request */ REQ_TYPE_BLOCK_PC, /* scsi command */ - REQ_TYPE_PM_SUSPEND, /* suspend request */ - REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_DRV_PRIV, /* driver defined types from here */ }; @@ -207,19 +204,6 @@ static inline unsigned short req_get_ioprio(struct request *req) return req->ioprio; } -/* - * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME - * requests. Some step values could eventually be made generic. - */ -struct request_pm_state -{ - /* PM state machine step value, currently driver specific */ - int pm_step; - /* requested PM state value (S1, S2, S3, S4, ...) */ - u32 pm_state; - void* data; /* for driver use */ -}; - #include struct blk_queue_ctx; @@ -601,10 +585,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) (((rq)->cmd_flags & REQ_STARTED) && \ ((rq)->cmd_type == REQ_TYPE_FS)) -#define blk_pm_request(rq) \ - ((rq)->cmd_type == REQ_TYPE_PM_SUSPEND || \ - (rq)->cmd_type == REQ_TYPE_PM_RESUME) - #define blk_rq_cpu_valid(rq) ((rq)->cpu != -1) #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) /* rq->queuelist of dequeued request must be list_empty() */ @@ -838,6 +818,7 @@ extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); extern void __blk_stop_queue(struct request_queue *q); extern void __blk_run_queue(struct request_queue *q); +extern void __blk_run_queue_uncond(struct request_queue *q); extern void blk_run_queue(struct request_queue *); extern void blk_run_queue_async(struct request_queue *q); extern int blk_rq_map_user(struct request_queue *, struct request *, diff --git a/include/linux/ide.h b/include/linux/ide.h index 9856b7d455d9..a633898f36ac 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -44,8 +44,14 @@ enum ata_cmd_type_bits { REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1, REQ_TYPE_ATA_PC, REQ_TYPE_ATA_SENSE, /* sense request */ + REQ_TYPE_ATA_PM_SUSPEND,/* suspend request */ + REQ_TYPE_ATA_PM_RESUME, /* resume request */ }; +#define ata_pm_request(rq) \ + ((rq)->cmd_type == REQ_TYPE_ATA_PM_SUSPEND || \ + (rq)->cmd_type == REQ_TYPE_ATA_PM_RESUME) + /* Error codes returned in rq->errors to the higher part of the driver. */ enum { IDE_DRV_ERROR_GENERAL = 101, @@ -1321,6 +1327,19 @@ struct ide_port_info { u8 udma_mask; }; +/* + * State information carried for REQ_TYPE_ATA_PM_SUSPEND and REQ_TYPE_ATA_PM_RESUME + * requests. + */ +struct ide_pm_state { + /* PM state machine step value, currently driver specific */ + int pm_step; + /* requested PM state value (S1, S2, S3, S4, ...) */ + u32 pm_state; + void* data; /* for driver use */ +}; + + int ide_pci_init_one(struct pci_dev *, const struct ide_port_info *, void *); int ide_pci_init_two(struct pci_dev *, struct pci_dev *, const struct ide_port_info *, void *); -- cgit v1.2.3 From cd8ae85299d54155702a56811b2e035e63064d3d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 3 May 2015 21:34:46 -0700 Subject: tcp: provide SYN headers for passive connections This patch allows a server application to get the TCP SYN headers for its passive connections. This is useful if the server is doing fingerprinting of clients based on SYN packet contents. Two socket options are added: TCP_SAVE_SYN and TCP_SAVED_SYN. The first is used on a socket to enable saving the SYN headers for child connections. This can be set before or after the listen() call. The latter is used to retrieve the SYN headers for passive connections, if the parent listener has enabled TCP_SAVE_SYN. TCP_SAVED_SYN is read once, it frees the saved SYN headers. The data returned in TCP_SAVED_SYN are network (IPv4/IPv6) and TCP headers. Original patch was written by Tom Herbert, I changed it to not hold a full skb (and associated dst and conntracking reference). We have used such patch for about 3 years at Google. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 8 ++++++++ include/net/request_sock.h | 4 +++- include/uapi/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 35 +++++++++++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 18 ++++++++++++++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 3 +++ 7 files changed, 70 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3b2911502a8c..e6fb5df22db1 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -199,6 +199,7 @@ struct tcp_sock { syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ + save_syn:1, /* Save headers of SYN packet */ is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ @@ -326,6 +327,7 @@ struct tcp_sock { * socket. Used to retransmit SYNACKs etc. */ struct request_sock *fastopen_rsk; + u32 *saved_syn; }; enum tsq_flags { @@ -393,4 +395,10 @@ static inline int fastopen_init_queue(struct sock *sk, int backlog) return 0; } +static inline void tcp_saved_syn_free(struct tcp_sock *tp) +{ + kfree(tp->saved_syn); + tp->saved_syn = NULL; +} + #endif /* _LINUX_TCP_H */ diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 9f4265ce8892..87935cad2f7b 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -64,6 +64,7 @@ struct request_sock { struct timer_list rsk_timer; const struct request_sock_ops *rsk_ops; struct sock *sk; + u32 *saved_syn; u32 secid; u32 peer_secid; }; @@ -77,7 +78,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener) req->rsk_ops = ops; sock_hold(sk_listener); req->rsk_listener = sk_listener; - + req->saved_syn = NULL; /* Following is temporary. It is coupled with debugging * helpers in reqsk_put() & reqsk_free() */ @@ -104,6 +105,7 @@ static inline void reqsk_free(struct request_sock *req) req->rsk_ops->destructor(req); if (req->rsk_listener) sock_put(req->rsk_listener); + kfree(req->saved_syn); kmem_cache_free(req->rsk_ops->slab, req); } diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index faa72f4fa547..51ebedba577f 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -113,6 +113,8 @@ enum { #define TCP_TIMESTAMP 24 #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ #define TCP_CC_INFO 26 /* Get Congestion Control (optional) info */ +#define TCP_SAVE_SYN 27 /* Record SYN headers for new connections */ +#define TCP_SAVED_SYN 28 /* Get SYN headers recorded for connection */ struct tcp_repair_opt { __u32 opt_code; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 46efa03d2b11..ecccfdc50d76 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2482,6 +2482,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level, icsk->icsk_syn_retries = val; break; + case TCP_SAVE_SYN: + if (val < 0 || val > 1) + err = -EINVAL; + else + tp->save_syn = val; + break; + case TCP_LINGER2: if (val < 0) tp->linger2 = -1; @@ -2818,6 +2825,34 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_NOTSENT_LOWAT: val = tp->notsent_lowat; break; + case TCP_SAVE_SYN: + val = tp->save_syn; + break; + case TCP_SAVED_SYN: { + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + if (tp->saved_syn) { + len = min_t(unsigned int, tp->saved_syn[0], len); + if (put_user(len, optlen)) { + release_sock(sk); + return -EFAULT; + } + if (copy_to_user(optval, tp->saved_syn + 1, len)) { + release_sock(sk); + return -EFAULT; + } + tcp_saved_syn_free(tp); + release_sock(sk); + } else { + release_sock(sk); + len = 0; + if (put_user(len, optlen)) + return -EFAULT; + } + return 0; + } default: return -ENOPROTOOPT; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 09bdc4abfcbb..df2ca615cd0c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6060,6 +6060,23 @@ static bool tcp_syn_flood_action(struct sock *sk, return want_cookie; } +static void tcp_reqsk_record_syn(const struct sock *sk, + struct request_sock *req, + const struct sk_buff *skb) +{ + if (tcp_sk(sk)->save_syn) { + u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb); + u32 *copy; + + copy = kmalloc(len + sizeof(u32), GFP_ATOMIC); + if (copy) { + copy[0] = len; + memcpy(©[1], skb_network_header(skb), len); + req->saved_syn = copy; + } + } +} + int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) @@ -6192,6 +6209,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->tfo_listener = false; af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); } + tcp_reqsk_record_syn(sk, req, skb); return 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fc1c658ec6c1..91cb4768a860 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1802,6 +1802,7 @@ void tcp_v4_destroy_sock(struct sock *sk) /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); + tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); sock_release_memcg(sk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index e5d7649136fc..ebe2ab2596ed 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -536,6 +536,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; + newtp->saved_syn = req->saved_syn; + req->saved_syn = NULL; + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); } return newsk; -- cgit v1.2.3 From 2c7a88c252bf3381958cf716f31b6b2e0f2f3fa7 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 4 May 2015 14:33:48 -0700 Subject: etherdev: Fix sparse error, make test usable by other functions This change does two things. First it fixes a sparse error for the fact that the __be16 degrades to an integer. Since that is actually what I am kind of doing I am simply working around that by forcing both sides of the comparison to u16. Also I realized on some compilers I was generating another instruction for big endian systems such as PowerPC since it was masking the value before doing the comparison. So to resolve that I have simply pulled the mask out and wrapped it in an #ifndef __BIG_ENDIAN. Lastly I pulled this all out into its own function. I notices there are similar checks in a number of other places so this function can be reused there to help reduce overhead in these paths as well. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 18 ++++++++++++++++++ net/ethernet/eth.c | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index c4a10f991fe0..9012f8775208 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -190,6 +190,24 @@ static inline bool is_valid_ether_addr(const u8 *addr) return !is_multicast_ether_addr(addr) && !is_zero_ether_addr(addr); } +/** + * eth_proto_is_802_3 - Determine if a given Ethertype/length is a protocol + * @proto: Ethertype/length value to be tested + * + * Check that the value from the Ethertype/length field is a valid Ethertype. + * + * Return true if the valid is an 802.3 supported Ethertype. + */ +static inline bool eth_proto_is_802_3(__be16 proto) +{ +#ifndef __BIG_ENDIAN + /* if CPU is little endian mask off bits representing LSB */ + proto &= htons(0xFF00); +#endif + /* cast both to u16 and compare since LSB can be ignored */ + return (__force u16)proto >= (__force u16)htons(ETH_P_802_3_MIN); +} + /** * eth_random_addr - Generate software assigned random Ethernet address * @addr: Pointer to a six-byte array containing the Ethernet address diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 314e4c5a5a5e..9045e2a1108f 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -179,7 +179,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) if (unlikely(netdev_uses_dsa(dev))) return htons(ETH_P_XDSA); - if (likely((eth->h_proto & htons(0xFF00)) >= htons(ETH_P_802_3_MIN))) + if (likely(eth_proto_is_802_3(eth->h_proto))) return eth->h_proto; /* -- cgit v1.2.3 From 9545b22da647cf6fbbac9c5a48c50fd72d892b11 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 4 May 2015 14:34:10 -0700 Subject: vlan: Use eth_proto_is_802_3 Replace "ntohs(proto) >= ETH_P_802_3_MIN" w/ eth_proto_is_802_3(proto). Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 920e4457ce6e..b9ab677c0c0a 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -539,7 +539,7 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb, */ proto = vhdr->h_vlan_encapsulated_proto; - if (ntohs(proto) >= ETH_P_802_3_MIN) { + if (eth_proto_is_802_3(proto)) { skb->protocol = proto; return; } -- cgit v1.2.3 From 2893c379461a208b3059f55dfe4dafa06b4aa46a Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Tue, 31 Mar 2015 20:16:52 +0200 Subject: clk: make strings in parent name arrays const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The clk functions and structs declare the parent_name arrays as 'const char **parent_names' which means the parent name strings are const, but the array itself is not. Use 'const char * const * parent_names' instead which also makes the array const. This allows us to put the parent_name arrays into the __initconst section. Signed-off-by: Sascha Hauer Reviewed-by: Krzysztof Kozlowski Tested-by: Krzysztof Kozlowski Acked-by: Uwe Kleine-König [sboyd@codeaurora.org: Squelch 80-character checkpatch warnings] Signed-off-by: Stephen Boyd --- drivers/clk/clk-composite.c | 2 +- drivers/clk/clk-mux.c | 6 ++++-- include/linux/clk-provider.h | 10 ++++++---- 3 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk-composite.c b/drivers/clk/clk-composite.c index 956b7e54fa1c..077f4c7148f1 100644 --- a/drivers/clk/clk-composite.c +++ b/drivers/clk/clk-composite.c @@ -188,7 +188,7 @@ static void clk_composite_disable(struct clk_hw *hw) } struct clk *clk_register_composite(struct device *dev, const char *name, - const char **parent_names, int num_parents, + const char * const *parent_names, int num_parents, struct clk_hw *mux_hw, const struct clk_ops *mux_ops, struct clk_hw *rate_hw, const struct clk_ops *rate_ops, struct clk_hw *gate_hw, const struct clk_ops *gate_ops, diff --git a/drivers/clk/clk-mux.c b/drivers/clk/clk-mux.c index 69a094c3783d..6066a01b20ea 100644 --- a/drivers/clk/clk-mux.c +++ b/drivers/clk/clk-mux.c @@ -114,7 +114,8 @@ const struct clk_ops clk_mux_ro_ops = { EXPORT_SYMBOL_GPL(clk_mux_ro_ops); struct clk *clk_register_mux_table(struct device *dev, const char *name, - const char **parent_names, u8 num_parents, unsigned long flags, + const char * const *parent_names, u8 num_parents, + unsigned long flags, void __iomem *reg, u8 shift, u32 mask, u8 clk_mux_flags, u32 *table, spinlock_t *lock) { @@ -166,7 +167,8 @@ struct clk *clk_register_mux_table(struct device *dev, const char *name, EXPORT_SYMBOL_GPL(clk_register_mux_table); struct clk *clk_register_mux(struct device *dev, const char *name, - const char **parent_names, u8 num_parents, unsigned long flags, + const char * const *parent_names, u8 num_parents, + unsigned long flags, void __iomem *reg, u8 shift, u8 width, u8 clk_mux_flags, spinlock_t *lock) { diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index df695313f975..5378c2aba4d2 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -209,7 +209,7 @@ struct clk_ops { struct clk_init_data { const char *name; const struct clk_ops *ops; - const char **parent_names; + const char * const *parent_names; u8 num_parents; unsigned long flags; }; @@ -426,12 +426,14 @@ extern const struct clk_ops clk_mux_ops; extern const struct clk_ops clk_mux_ro_ops; struct clk *clk_register_mux(struct device *dev, const char *name, - const char **parent_names, u8 num_parents, unsigned long flags, + const char * const *parent_names, u8 num_parents, + unsigned long flags, void __iomem *reg, u8 shift, u8 width, u8 clk_mux_flags, spinlock_t *lock); struct clk *clk_register_mux_table(struct device *dev, const char *name, - const char **parent_names, u8 num_parents, unsigned long flags, + const char * const *parent_names, u8 num_parents, + unsigned long flags, void __iomem *reg, u8 shift, u32 mask, u8 clk_mux_flags, u32 *table, spinlock_t *lock); @@ -518,7 +520,7 @@ struct clk_composite { }; struct clk *clk_register_composite(struct device *dev, const char *name, - const char **parent_names, int num_parents, + const char * const *parent_names, int num_parents, struct clk_hw *mux_hw, const struct clk_ops *mux_ops, struct clk_hw *rate_hw, const struct clk_ops *rate_ops, struct clk_hw *gate_hw, const struct clk_ops *gate_ops, -- cgit v1.2.3 From d5622a9c13752be46e6fcde9d31391ce0bb0598b Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 2 Mar 2015 15:45:41 +0000 Subject: clkdev: use clk_hw internally clk_add_alias() calls clk_get() followed by clk_put() but in between those two calls it saves away the struct clk pointer to a clk_lookup structure. This leaves the 'clk' member of the clk_lookup pointing at freed memory on configurations where CONFIG_COMMON_CLK=y. This is a problem because clk_get_sys() will eventually try to dereference the freed pointer by calling __clk_get_hw() on it. Fix this by saving away the struct clk_hw pointer instead of the struct clk pointer so that when we try to create a per-user struct clk in clk_get_sys() we don't dereference a junk pointer. Signed-off-by: Russell King --- drivers/clk/clkdev.c | 24 ++++++++++++++++-------- include/linux/clkdev.h | 1 + 2 files changed, 17 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clkdev.c b/drivers/clk/clkdev.c index 1fcb6ef2cdac..1bb120a3855f 100644 --- a/drivers/clk/clkdev.c +++ b/drivers/clk/clkdev.c @@ -177,7 +177,7 @@ struct clk *clk_get_sys(const char *dev_id, const char *con_id) if (!cl) goto out; - clk = __clk_create_clk(__clk_get_hw(cl->clk), dev_id, con_id); + clk = __clk_create_clk(cl->clk_hw, dev_id, con_id); if (IS_ERR(clk)) goto out; @@ -215,18 +215,26 @@ void clk_put(struct clk *clk) } EXPORT_SYMBOL(clk_put); -void clkdev_add(struct clk_lookup *cl) +static void __clkdev_add(struct clk_lookup *cl) { mutex_lock(&clocks_mutex); list_add_tail(&cl->node, &clocks); mutex_unlock(&clocks_mutex); } + +void clkdev_add(struct clk_lookup *cl) +{ + if (!cl->clk_hw) + cl->clk_hw = __clk_get_hw(cl->clk); + __clkdev_add(cl); +} EXPORT_SYMBOL(clkdev_add); void __init clkdev_add_table(struct clk_lookup *cl, size_t num) { mutex_lock(&clocks_mutex); while (num--) { + cl->clk_hw = __clk_get_hw(cl->clk); list_add_tail(&cl->node, &clocks); cl++; } @@ -243,7 +251,7 @@ struct clk_lookup_alloc { }; static struct clk_lookup * __init_refok -vclkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt, +vclkdev_alloc(struct clk_hw *hw, const char *con_id, const char *dev_fmt, va_list ap) { struct clk_lookup_alloc *cla; @@ -252,7 +260,7 @@ vclkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt, if (!cla) return NULL; - cla->cl.clk = clk; + cla->cl.clk_hw = hw; if (con_id) { strlcpy(cla->con_id, con_id, sizeof(cla->con_id)); cla->cl.con_id = cla->con_id; @@ -273,7 +281,7 @@ clkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt, ...) va_list ap; va_start(ap, dev_fmt); - cl = vclkdev_alloc(clk, con_id, dev_fmt, ap); + cl = vclkdev_alloc(__clk_get_hw(clk), con_id, dev_fmt, ap); va_end(ap); return cl; @@ -334,7 +342,7 @@ int clk_register_clkdev(struct clk *clk, const char *con_id, return PTR_ERR(clk); va_start(ap, dev_fmt); - cl = vclkdev_alloc(clk, con_id, dev_fmt, ap); + cl = vclkdev_alloc(__clk_get_hw(clk), con_id, dev_fmt, ap); va_end(ap); if (!cl) @@ -365,8 +373,8 @@ int clk_register_clkdevs(struct clk *clk, struct clk_lookup *cl, size_t num) return PTR_ERR(clk); for (i = 0; i < num; i++, cl++) { - cl->clk = clk; - clkdev_add(cl); + cl->clk_hw = __clk_get_hw(clk); + __clkdev_add(cl); } return 0; diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h index 94bad77eeb4a..3003afad46c9 100644 --- a/include/linux/clkdev.h +++ b/include/linux/clkdev.h @@ -22,6 +22,7 @@ struct clk_lookup { const char *dev_id; const char *con_id; struct clk *clk; + struct clk_hw *clk_hw; }; #define CLKDEV_INIT(d, n, c) \ -- cgit v1.2.3 From d2d14a77886485310ec66e575f00ea5232ac7a14 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 14 Mar 2015 15:12:35 +0000 Subject: clk: update clk API documentation to clarify clk_round_rate() The idea is that rate = clk_round_rate(clk, r) is equivalent to: clk_set_rate(clk, r); rate = clk_get_rate(clk); except that clk_round_rate() does not change the hardware in any way. Signed-off-by: Russell King --- include/linux/clk.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk.h b/include/linux/clk.h index 68c16a6bedb3..cafb22df8d00 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -306,6 +306,20 @@ void devm_clk_put(struct device *dev, struct clk *clk); * @clk: clock source * @rate: desired clock rate in Hz * + * This answers the question "if I were to pass @rate to clk_set_rate(), + * what clock rate would I end up with?" without changing the hardware + * in any way. In other words: + * + * rate = clk_round_rate(clk, r); + * + * and: + * + * clk_set_rate(clk, r); + * rate = clk_get_rate(clk); + * + * are equivalent except the former does not modify the clock hardware + * in any way. + * * Returns rounded clock rate in Hz, or negative errno. */ long clk_round_rate(struct clk *clk, unsigned long rate); -- cgit v1.2.3 From 2d34e507293102f29ee94d9a9c5b890696d42452 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 9 Mar 2015 11:03:00 +0000 Subject: clkdev: get rid of redundant clk_add_alias() prototype in linux/clk.h clk_add_alias() is provided by clkdev, and is not part of the clk API. Howver, it is prototyped in two locations: linux/clkdev.h and linux/clk.h. This is a mess. Get rid of the redundant and unnecessary version in linux/clk.h. Acked-by: Tony Lindgren Tested-by: Robert Jarzmik Acked-by: Sekhar Nori Signed-off-by: Russell King --- arch/arm/mach-davinci/da850.c | 1 + arch/arm/mach-omap1/board-nokia770.c | 2 +- arch/arm/mach-pxa/eseries.c | 1 + arch/arm/mach-pxa/lubbock.c | 1 + arch/arm/mach-pxa/tosa.c | 1 + include/linux/clk.h | 13 ------------- 6 files changed, 5 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-davinci/da850.c b/arch/arm/mach-davinci/da850.c index 45ce065e7170..3b8740c083c4 100644 --- a/arch/arm/mach-davinci/da850.c +++ b/arch/arm/mach-davinci/da850.c @@ -11,6 +11,7 @@ * is licensed "as is" without any warranty of any kind, whether express * or implied. */ +#include #include #include #include diff --git a/arch/arm/mach-omap1/board-nokia770.c b/arch/arm/mach-omap1/board-nokia770.c index 85089d821982..3bc59390a943 100644 --- a/arch/arm/mach-omap1/board-nokia770.c +++ b/arch/arm/mach-omap1/board-nokia770.c @@ -7,6 +7,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ +#include #include #include #include @@ -14,7 +15,6 @@ #include #include #include -#include #include #include diff --git a/arch/arm/mach-pxa/eseries.c b/arch/arm/mach-pxa/eseries.c index cfb864173ce3..4427bf26ea47 100644 --- a/arch/arm/mach-pxa/eseries.c +++ b/arch/arm/mach-pxa/eseries.c @@ -10,6 +10,7 @@ * */ +#include #include #include #include diff --git a/arch/arm/mach-pxa/lubbock.c b/arch/arm/mach-pxa/lubbock.c index d8a1be619f21..235f2d9318c1 100644 --- a/arch/arm/mach-pxa/lubbock.c +++ b/arch/arm/mach-pxa/lubbock.c @@ -11,6 +11,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ +#include #include #include #include diff --git a/arch/arm/mach-pxa/tosa.c b/arch/arm/mach-pxa/tosa.c index 7780d1faa06f..92e56d8a24d8 100644 --- a/arch/arm/mach-pxa/tosa.c +++ b/arch/arm/mach-pxa/tosa.c @@ -12,6 +12,7 @@ * */ +#include #include #include #include diff --git a/include/linux/clk.h b/include/linux/clk.h index cafb22df8d00..0df4a51e1a78 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -485,19 +485,6 @@ static inline void clk_disable_unprepare(struct clk *clk) clk_unprepare(clk); } -/** - * clk_add_alias - add a new clock alias - * @alias: name for clock alias - * @alias_dev_name: device name - * @id: platform specific clock name - * @dev: device - * - * Allows using generic clock names for drivers by adding a new alias. - * Assumes clkdev, see clkdev.h for more info. - */ -int clk_add_alias(const char *alias, const char *alias_dev_name, char *id, - struct device *dev); - struct device_node; struct of_phandle_args; -- cgit v1.2.3 From b3d8d7e89fab374d731dfb46fe048f09766ca9c8 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 9 Mar 2015 10:43:04 +0000 Subject: clkdev: const-ify connection id to clk_add_alias() The connection id is only passed to clk_get() which is already const. Const-ify this argument too. Signed-off-by: Russell King --- drivers/clk/clkdev.c | 6 +++--- include/linux/clkdev.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clkdev.c b/drivers/clk/clkdev.c index 04b59ad6d852..e112e13af760 100644 --- a/drivers/clk/clkdev.c +++ b/drivers/clk/clkdev.c @@ -288,10 +288,10 @@ clkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt, ...) } EXPORT_SYMBOL(clkdev_alloc); -int clk_add_alias(const char *alias, const char *alias_dev_name, char *id, - struct device *dev) +int clk_add_alias(const char *alias, const char *alias_dev_name, + const char *con_id, struct device *dev) { - struct clk *r = clk_get(dev, id); + struct clk *r = clk_get(dev, con_id); struct clk_lookup *l; if (IS_ERR(r)) diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h index 3003afad46c9..cd93b215e3af 100644 --- a/include/linux/clkdev.h +++ b/include/linux/clkdev.h @@ -39,7 +39,7 @@ void clkdev_add(struct clk_lookup *cl); void clkdev_drop(struct clk_lookup *cl); void clkdev_add_table(struct clk_lookup *, size_t); -int clk_add_alias(const char *, const char *, char *, struct device *); +int clk_add_alias(const char *, const char *, const char *, struct device *); int clk_register_clkdev(struct clk *, const char *, const char *, ...); int clk_register_clkdevs(struct clk *, struct clk_lookup *, size_t); -- cgit v1.2.3 From 2568999835d7797afce3dcc3a3f368051ffcaf1f Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 2 Mar 2015 15:40:29 +0000 Subject: clkdev: add clkdev_create() helper Add a helper to allocate and add a clk_lookup structure. This can not only be used in several places in clkdev.c to simplify the code, but more importantly, can be used by callers of the clkdev code to simplify their clkdev creation and registration. Signed-off-by: Russell King --- drivers/clk/clkdev.c | 53 ++++++++++++++++++++++++++++++++++++++------------ include/linux/clkdev.h | 3 +++ 2 files changed, 44 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clkdev.c b/drivers/clk/clkdev.c index e112e13af760..c0eaf0973bd2 100644 --- a/drivers/clk/clkdev.c +++ b/drivers/clk/clkdev.c @@ -274,6 +274,19 @@ vclkdev_alloc(struct clk_hw *hw, const char *con_id, const char *dev_fmt, return &cla->cl; } +static struct clk_lookup * +vclkdev_create(struct clk_hw *hw, const char *con_id, const char *dev_fmt, + va_list ap) +{ + struct clk_lookup *cl; + + cl = vclkdev_alloc(hw, con_id, dev_fmt, ap); + if (cl) + __clkdev_add(cl); + + return cl; +} + struct clk_lookup * __init_refok clkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt, ...) { @@ -288,6 +301,29 @@ clkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt, ...) } EXPORT_SYMBOL(clkdev_alloc); +/** + * clkdev_create - allocate and add a clkdev lookup structure + * @clk: struct clk to associate with all clk_lookups + * @con_id: connection ID string on device + * @dev_fmt: format string describing device name + * + * Returns a clk_lookup structure, which can be later unregistered and + * freed. + */ +struct clk_lookup *clkdev_create(struct clk *clk, const char *con_id, + const char *dev_fmt, ...) +{ + struct clk_lookup *cl; + va_list ap; + + va_start(ap, dev_fmt); + cl = vclkdev_create(__clk_get_hw(clk), con_id, dev_fmt, ap); + va_end(ap); + + return cl; +} +EXPORT_SYMBOL_GPL(clkdev_create); + int clk_add_alias(const char *alias, const char *alias_dev_name, const char *con_id, struct device *dev) { @@ -297,12 +333,10 @@ int clk_add_alias(const char *alias, const char *alias_dev_name, if (IS_ERR(r)) return PTR_ERR(r); - l = clkdev_alloc(r, alias, alias_dev_name); + l = clkdev_create(r, alias, "%s", alias_dev_name); clk_put(r); - if (!l) - return -ENODEV; - clkdev_add(l); - return 0; + + return l ? 0 : -ENODEV; } EXPORT_SYMBOL(clk_add_alias); @@ -342,15 +376,10 @@ int clk_register_clkdev(struct clk *clk, const char *con_id, return PTR_ERR(clk); va_start(ap, dev_fmt); - cl = vclkdev_alloc(__clk_get_hw(clk), con_id, dev_fmt, ap); + cl = vclkdev_create(__clk_get_hw(clk), con_id, dev_fmt, ap); va_end(ap); - if (!cl) - return -ENOMEM; - - clkdev_add(cl); - - return 0; + return cl ? 0 : -ENOMEM; } EXPORT_SYMBOL(clk_register_clkdev); diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h index cd93b215e3af..a240b18e86fa 100644 --- a/include/linux/clkdev.h +++ b/include/linux/clkdev.h @@ -38,6 +38,9 @@ struct clk_lookup *clkdev_alloc(struct clk *clk, const char *con_id, void clkdev_add(struct clk_lookup *cl); void clkdev_drop(struct clk_lookup *cl); +struct clk_lookup *clkdev_create(struct clk *clk, const char *con_id, + const char *dev_fmt, ...); + void clkdev_add_table(struct clk_lookup *, size_t); int clk_add_alias(const char *, const char *, const char *, struct device *); -- cgit v1.2.3 From efb0de55b6a2ec15fc424e660601f22ae2fa487a Mon Sep 17 00:00:00 2001 From: Shobhit Kumar Date: Tue, 5 May 2015 15:04:18 +0530 Subject: pwm: Add support to remove registered consumer lookup tables In case some drivers are unloading, they can remove lookup tables which they had registered during their load time to avoid redundant entries if loaded again. CC: Samuel Ortiz Cc: Linus Walleij Cc: Alexandre Courbot Cc: Thierry Reding Signed-off-by: Shobhit Kumar Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 17 +++++++++++++++++ include/linux/pwm.h | 5 +++++ 2 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index ba34c7d89042..27cd58d16881 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -585,6 +585,23 @@ void pwm_add_table(struct pwm_lookup *table, size_t num) mutex_unlock(&pwm_lookup_lock); } +/** + * pwm_remove_table() - unregister PWM device consumers + * @table: array of consumers to unregister + * @num: number of consumers in table + */ +void pwm_remove_table(struct pwm_lookup *table, size_t num) +{ + mutex_lock(&pwm_lookup_lock); + + while (num--) { + list_del(&table->list); + table++; + } + + mutex_unlock(&pwm_lookup_lock); +} + /** * pwm_get() - look up and request a PWM device * @dev: device for PWM consumer diff --git a/include/linux/pwm.h b/include/linux/pwm.h index e90628cac8fa..cfe2d8df5be0 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -290,10 +290,15 @@ struct pwm_lookup { #if IS_ENABLED(CONFIG_PWM) void pwm_add_table(struct pwm_lookup *table, size_t num); +void pwm_remove_table(struct pwm_lookup *table, size_t num); #else static inline void pwm_add_table(struct pwm_lookup *table, size_t num) { } + +static inline void pwm_remove_table(struct pwm_lookup *table, size_t num) +{ +} #endif #ifdef CONFIG_PWM_SYSFS -- cgit v1.2.3 From fa76a3db7093a527333c380df82a0f158d9b8299 Mon Sep 17 00:00:00 2001 From: Sonic Zhang Date: Thu, 9 Apr 2015 11:13:07 +0800 Subject: pinctrl: allow exlusive GPIO/mux pin allocation Disallow simultaneous use of the the GPIO and peripheral mux functions by setting a flag "strict" in struct pinctrl_desc. The blackfin pinmux and gpio controller doesn't allow user to set up a pin for both GPIO and peripheral function. So, add flag strict in struct pinctrl_desc to check both gpio_owner and mux_owner before approving the pin request. v2-changes: - if strict flag is set, check gpio_owner and mux_onwer in if and else clause v3-changes: - add kerneldoc for this struct - augment Documentation/pinctrl.txt Signed-off-by: Sonic Zhang Signed-off-by: Linus Walleij --- Documentation/pinctrl.txt | 11 +++++++++++ drivers/pinctrl/pinctrl-adi2.c | 1 + drivers/pinctrl/pinmux.c | 13 +++++++++++++ include/linux/pinctrl/pinctrl.h | 3 +++ 4 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/Documentation/pinctrl.txt b/Documentation/pinctrl.txt index a9b47163bb5d..d6b2bed94c43 100644 --- a/Documentation/pinctrl.txt +++ b/Documentation/pinctrl.txt @@ -73,6 +73,7 @@ static struct pinctrl_desc foo_desc = { .pins = foo_pins, .npins = ARRAY_SIZE(foo_pins), .owner = THIS_MODULE, + .strict = true, }; int __init foo_probe(void) @@ -830,6 +831,11 @@ separate memory range only intended for GPIO driving, and the register range dealing with pin config and pin multiplexing get placed into a different memory range and a separate section of the data sheet. +A flag "strict" in struct pinctrl_desc is available to check and deny +simultaneous access to the same pin from GPIO and pin multiplexing +consumers on hardware of this type. The pinctrl driver should set this flag +accordingly. + (B) pin config @@ -850,6 +856,11 @@ possible that the GPIO, pin config and pin multiplex registers are placed into the same memory range and the same section of the data sheet, although that need not be the case. +In some pin controllers, although the physical pins are designed in the same +way as (B), the GPIO function still can't be enabled at the same time as the +peripheral functions. So again the "strict" flag should be set, denying +simultaneous activation by GPIO and other muxed in devices. + From a kernel point of view, however, these are different aspects of the hardware and shall be put into different subsystems: diff --git a/drivers/pinctrl/pinctrl-adi2.c b/drivers/pinctrl/pinctrl-adi2.c index 8434439c5017..fbd492668da1 100644 --- a/drivers/pinctrl/pinctrl-adi2.c +++ b/drivers/pinctrl/pinctrl-adi2.c @@ -710,6 +710,7 @@ static struct pinctrl_desc adi_pinmux_desc = { .name = DRIVER_NAME, .pctlops = &adi_pctrl_ops, .pmxops = &adi_pinmux_ops, + .strict = true, .owner = THIS_MODULE, }; diff --git a/drivers/pinctrl/pinmux.c b/drivers/pinctrl/pinmux.c index b874458dcb88..2546fa783464 100644 --- a/drivers/pinctrl/pinmux.c +++ b/drivers/pinctrl/pinmux.c @@ -107,6 +107,13 @@ static int pin_request(struct pinctrl_dev *pctldev, desc->name, desc->gpio_owner, owner); goto out; } + if (pctldev->desc->strict && desc->mux_usecount && + strcmp(desc->mux_owner, owner)) { + dev_err(pctldev->dev, + "pin %s already requested by %s; cannot claim for %s\n", + desc->name, desc->mux_owner, owner); + goto out; + } desc->gpio_owner = owner; } else { @@ -116,6 +123,12 @@ static int pin_request(struct pinctrl_dev *pctldev, desc->name, desc->mux_owner, owner); goto out; } + if (pctldev->desc->strict && desc->gpio_owner) { + dev_err(pctldev->dev, + "pin %s already requested by %s; cannot claim for %s\n", + desc->name, desc->gpio_owner, owner); + goto out; + } desc->mux_usecount++; if (desc->mux_usecount > 1) diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index 66e4697516de..fc6b0348c375 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -114,6 +114,8 @@ struct pinctrl_ops { * of the pins field above * @pctlops: pin control operation vtable, to support global concepts like * grouping of pins, this is optional. + * @strict: check both gpio_owner and mux_owner strictly before approving + the pin request * @pmxops: pinmux operations vtable, if you support pinmuxing in your driver * @confops: pin config operations vtable, if you support pin configuration in * your driver @@ -132,6 +134,7 @@ struct pinctrl_desc { const struct pinctrl_ops *pctlops; const struct pinmux_ops *pmxops; const struct pinconf_ops *confops; + bool strict; struct module *owner; #ifdef CONFIG_GENERIC_PINCONF unsigned int num_custom_params; -- cgit v1.2.3 From 8c4c2016345feefcd289ce2479eb70286d30825a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 6 May 2015 14:19:13 +0200 Subject: pinctrl: move strict option to pinmux_ops While the pinmux_ops are ideally just a vtable for pin mux calls, the "strict" setting belongs so intuitively with the pin multiplexing that we should move it here anyway. Putting it in the top pinctrl_desc makes no sense. Cc: Sonic Zhang Signed-off-by: Linus Walleij --- Documentation/pinctrl.txt | 2 +- drivers/pinctrl/pinctrl-adi2.c | 2 +- drivers/pinctrl/pinmux.c | 4 ++-- include/linux/pinctrl/pinctrl.h | 3 --- include/linux/pinctrl/pinmux.h | 4 ++++ 5 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/Documentation/pinctrl.txt b/Documentation/pinctrl.txt index d6b2bed94c43..4976389e432d 100644 --- a/Documentation/pinctrl.txt +++ b/Documentation/pinctrl.txt @@ -73,7 +73,6 @@ static struct pinctrl_desc foo_desc = { .pins = foo_pins, .npins = ARRAY_SIZE(foo_pins), .owner = THIS_MODULE, - .strict = true, }; int __init foo_probe(void) @@ -715,6 +714,7 @@ static struct pinmux_ops foo_pmxops = { .get_function_name = foo_get_fname, .get_function_groups = foo_get_groups, .set_mux = foo_set_mux, + .strict = true, }; /* Pinmux operations are handled by some pin controller */ diff --git a/drivers/pinctrl/pinctrl-adi2.c b/drivers/pinctrl/pinctrl-adi2.c index fbd492668da1..49df9037b41e 100644 --- a/drivers/pinctrl/pinctrl-adi2.c +++ b/drivers/pinctrl/pinctrl-adi2.c @@ -703,6 +703,7 @@ static struct pinmux_ops adi_pinmux_ops = { .get_function_name = adi_pinmux_get_func_name, .get_function_groups = adi_pinmux_get_groups, .gpio_request_enable = adi_pinmux_request_gpio, + .strict = true, }; @@ -710,7 +711,6 @@ static struct pinctrl_desc adi_pinmux_desc = { .name = DRIVER_NAME, .pctlops = &adi_pctrl_ops, .pmxops = &adi_pinmux_ops, - .strict = true, .owner = THIS_MODULE, }; diff --git a/drivers/pinctrl/pinmux.c b/drivers/pinctrl/pinmux.c index 2546fa783464..c58c168b06c2 100644 --- a/drivers/pinctrl/pinmux.c +++ b/drivers/pinctrl/pinmux.c @@ -107,7 +107,7 @@ static int pin_request(struct pinctrl_dev *pctldev, desc->name, desc->gpio_owner, owner); goto out; } - if (pctldev->desc->strict && desc->mux_usecount && + if (ops->strict && desc->mux_usecount && strcmp(desc->mux_owner, owner)) { dev_err(pctldev->dev, "pin %s already requested by %s; cannot claim for %s\n", @@ -123,7 +123,7 @@ static int pin_request(struct pinctrl_dev *pctldev, desc->name, desc->mux_owner, owner); goto out; } - if (pctldev->desc->strict && desc->gpio_owner) { + if (ops->strict && desc->gpio_owner) { dev_err(pctldev->dev, "pin %s already requested by %s; cannot claim for %s\n", desc->name, desc->gpio_owner, owner); diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index fc6b0348c375..66e4697516de 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -114,8 +114,6 @@ struct pinctrl_ops { * of the pins field above * @pctlops: pin control operation vtable, to support global concepts like * grouping of pins, this is optional. - * @strict: check both gpio_owner and mux_owner strictly before approving - the pin request * @pmxops: pinmux operations vtable, if you support pinmuxing in your driver * @confops: pin config operations vtable, if you support pin configuration in * your driver @@ -134,7 +132,6 @@ struct pinctrl_desc { const struct pinctrl_ops *pctlops; const struct pinmux_ops *pmxops; const struct pinconf_ops *confops; - bool strict; struct module *owner; #ifdef CONFIG_GENERIC_PINCONF unsigned int num_custom_params; diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h index 511bda9ed4bf..d3740fa7073f 100644 --- a/include/linux/pinctrl/pinmux.h +++ b/include/linux/pinctrl/pinmux.h @@ -56,6 +56,9 @@ struct pinctrl_dev; * depending on whether the GPIO is configured as input or output, * a direction selector function may be implemented as a backing * to the GPIO controllers that need pin muxing. + * @strict: do not allow simultaneous use of the same pin for GPIO and another + * function. Check both gpio_owner and mux_owner strictly before approving + * the pin request. */ struct pinmux_ops { int (*request) (struct pinctrl_dev *pctldev, unsigned offset); @@ -79,6 +82,7 @@ struct pinmux_ops { struct pinctrl_gpio_range *range, unsigned offset, bool input); + bool strict; }; #endif /* CONFIG_PINMUX */ -- cgit v1.2.3 From cac089f9026e9ddb3481daf08f0fc4e5949fa1af Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 23 Apr 2015 16:56:22 -0700 Subject: gpio: omap: Allow building as a loadable module We currently get all kinds of errors building the omap gpio driver as a module starting with: undefined reference to `omap2_gpio_resume_after_idle' undefined reference to `omap2_gpio_prepare_for_idle' ... Let's fix the issue by adding inline functions to the header. Note that we can now also remove the two unused functions for omap_set_gpio_debounce and omap_set_gpio_debounce_time. Then doing rmmod on the module produces further warnings because of missing exit related functions. Let's add those. And finally, we can make the Kconfig entry just a tristate option that's selected for omaps. Cc: Javier Martinez Canillas Cc: Kevin Hilman Cc: Nishanth Menon Signed-off-by: Tony Lindgren Reviewed-by: Grygorii Strashko Acked-by: Santosh Shilimkar Reviewed-by: Felipe Balbi Signed-off-by: Linus Walleij --- drivers/gpio/Kconfig | 2 +- drivers/gpio/gpio-omap.c | 24 ++++++++++++++++++++++++ include/linux/platform_data/gpio-omap.h | 12 ++++++++++-- 3 files changed, 35 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index c2211258231b..4c4dba075277 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -316,7 +316,7 @@ config GPIO_OCTEON family of SOCs. config GPIO_OMAP - bool "TI OMAP GPIO support" if COMPILE_TEST && !ARCH_OMAP2PLUS + tristate "TI OMAP GPIO support" if ARCH_OMAP2PLUS || COMPILE_TEST default y if ARCH_OMAP depends on ARM select GENERIC_IRQ_CHIP diff --git a/drivers/gpio/gpio-omap.c b/drivers/gpio/gpio-omap.c index cd1d5bf48f36..38c268fb7ba8 100644 --- a/drivers/gpio/gpio-omap.c +++ b/drivers/gpio/gpio-omap.c @@ -1263,6 +1263,17 @@ static int omap_gpio_probe(struct platform_device *pdev) return 0; } +static int omap_gpio_remove(struct platform_device *pdev) +{ + struct gpio_bank *bank = platform_get_drvdata(pdev); + + list_del(&bank->node); + gpiochip_remove(&bank->chip); + pm_runtime_disable(bank->dev); + + return 0; +} + #ifdef CONFIG_ARCH_OMAP2PLUS #if defined(CONFIG_PM) @@ -1448,6 +1459,7 @@ static int omap_gpio_runtime_resume(struct device *dev) } #endif /* CONFIG_PM */ +#if IS_BUILTIN(CONFIG_GPIO_OMAP) void omap2_gpio_prepare_for_idle(int pwr_mode) { struct gpio_bank *bank; @@ -1473,6 +1485,7 @@ void omap2_gpio_resume_after_idle(void) pm_runtime_get_sync(bank->dev); } } +#endif #if defined(CONFIG_PM) static void omap_gpio_init_context(struct gpio_bank *p) @@ -1628,6 +1641,7 @@ MODULE_DEVICE_TABLE(of, omap_gpio_match); static struct platform_driver omap_gpio_driver = { .probe = omap_gpio_probe, + .remove = omap_gpio_remove, .driver = { .name = "omap_gpio", .pm = &gpio_pm_ops, @@ -1645,3 +1659,13 @@ static int __init omap_gpio_drv_reg(void) return platform_driver_register(&omap_gpio_driver); } postcore_initcall(omap_gpio_drv_reg); + +static void __exit omap_gpio_exit(void) +{ + platform_driver_unregister(&omap_gpio_driver); +} +module_exit(omap_gpio_exit); + +MODULE_DESCRIPTION("omap gpio driver"); +MODULE_ALIAS("platform:gpio-omap"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/platform_data/gpio-omap.h b/include/linux/platform_data/gpio-omap.h index 5d50b25a73d7..cb2618147c34 100644 --- a/include/linux/platform_data/gpio-omap.h +++ b/include/linux/platform_data/gpio-omap.h @@ -208,9 +208,17 @@ struct omap_gpio_platform_data { int (*get_context_loss_count)(struct device *dev); }; +#if IS_BUILTIN(CONFIG_GPIO_OMAP) extern void omap2_gpio_prepare_for_idle(int off_mode); extern void omap2_gpio_resume_after_idle(void); -extern void omap_set_gpio_debounce(int gpio, int enable); -extern void omap_set_gpio_debounce_time(int gpio, int enable); +#else +static inline void omap2_gpio_prepare_for_idle(int off_mode) +{ +} + +static inline void omap2_gpio_resume_after_idle(void) +{ +} +#endif #endif -- cgit v1.2.3 From 66eb3bd857f5311f72c7c371f78ddc9c472befba Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 27 Apr 2015 18:04:05 +0200 Subject: pinctrl: use ERR_CAST instead of ERR_PTR/PTR_ERR Inspired by scripts/coccinelle/api/err_cast.cocci Signed-off-by: Fabian Frederick Signed-off-by: Linus Walleij --- include/linux/pinctrl/consumer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h index 18eccefea06e..d7e5d608faa7 100644 --- a/include/linux/pinctrl/consumer.h +++ b/include/linux/pinctrl/consumer.h @@ -142,7 +142,7 @@ static inline struct pinctrl * __must_check pinctrl_get_select( s = pinctrl_lookup_state(p, name); if (IS_ERR(s)) { pinctrl_put(p); - return ERR_PTR(PTR_ERR(s)); + return ERR_CAST(s); } ret = pinctrl_select_state(p, s); -- cgit v1.2.3 From 1d6b98774cff82860a3f044610e956bcbff556c1 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 31 Mar 2015 15:55:57 +0200 Subject: tty: constify return type of tty_name All users of tty_name pass the result directly to a printf-like function. This means we can actually let tty_name return the literal "NULL tty" or tty->name directly, avoiding the strcpy and a lot of medium-sized stack buffers. In preparation for that, make the return type const char*. While at it, we can also constify the tty parameter. Signed-off-by: Rasmus Villemoes Reviewed-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_io.c | 2 +- include/linux/tty.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index e5695467598f..5a49a4a80ebd 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -243,7 +243,7 @@ static void tty_del_file(struct file *file) * Locking: none */ -char *tty_name(struct tty_struct *tty, char *buf) +const char *tty_name(const struct tty_struct *tty, char *buf) { if (!tty) /* Hmm. NULL pointer. That's fun. */ strcpy(buf, "NULL tty"); diff --git a/include/linux/tty.h b/include/linux/tty.h index fe5623c9af71..4cbecfc7b3c9 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -421,7 +421,7 @@ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) extern int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, const char *routine); -extern char *tty_name(struct tty_struct *tty, char *buf); +extern const char *tty_name(const struct tty_struct *tty, char *buf); extern void tty_wait_until_sent(struct tty_struct *tty, long timeout); extern int tty_check_change(struct tty_struct *tty); extern void __stop_tty(struct tty_struct *tty); -- cgit v1.2.3 From 429b474990cb4e5e8cfe2352daf649d0599cccb6 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 31 Mar 2015 15:55:59 +0200 Subject: tty: remove buf parameter from tty_name() tty_name no longer uses the buf parameter, so remove it along with all the 64 byte stack buffers that used to be passed in. Mostly generated by the coccinelle script @depends on patch@ identifier buf; constant C; expression tty; @@ - char buf[C]; <+... - tty_name(tty, buf) + tty_name(tty) ...+> allmodconfig compiles, so I'm fairly confident the stack buffers weren't used for other purposes as well. Signed-off-by: Rasmus Villemoes Reviewed-by: Peter Hurley Acked-by: Jesper Nilsson Acked-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/serio/serport.c | 5 ++--- drivers/tty/amiserial.c | 8 ++------ drivers/tty/cyclades.c | 8 ++------ drivers/tty/n_gsm.c | 3 +-- drivers/tty/n_tty.c | 7 ++----- drivers/tty/serial/crisv10.c | 8 ++------ drivers/tty/serial/serial_core.c | 4 +--- drivers/tty/tty_io.c | 28 +++++++++++----------------- drivers/tty/tty_ioctl.c | 4 +--- drivers/tty/tty_ldisc.c | 8 +++----- include/linux/tty.h | 2 +- 11 files changed, 28 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/serio/serport.c b/drivers/input/serio/serport.c index 69175b825346..9c927d35c1f5 100644 --- a/drivers/input/serio/serport.c +++ b/drivers/input/serio/serport.c @@ -167,7 +167,6 @@ static ssize_t serport_ldisc_read(struct tty_struct * tty, struct file * file, u { struct serport *serport = (struct serport*) tty->disc_data; struct serio *serio; - char name[64]; if (test_and_set_bit(SERPORT_BUSY, &serport->flags)) return -EBUSY; @@ -177,7 +176,7 @@ static ssize_t serport_ldisc_read(struct tty_struct * tty, struct file * file, u return -ENOMEM; strlcpy(serio->name, "Serial port", sizeof(serio->name)); - snprintf(serio->phys, sizeof(serio->phys), "%s/serio0", tty_name(tty, name)); + snprintf(serio->phys, sizeof(serio->phys), "%s/serio0", tty_name(tty)); serio->id = serport->id; serio->id.type = SERIO_RS232; serio->write = serport_serio_write; @@ -187,7 +186,7 @@ static ssize_t serport_ldisc_read(struct tty_struct * tty, struct file * file, u serio->dev.parent = tty->dev; serio_register_port(serport->serio); - printk(KERN_INFO "serio: Serial port %s\n", tty_name(tty, name)); + printk(KERN_INFO "serio: Serial port %s\n", tty_name(tty)); wait_event_interruptible(serport->wait, test_bit(SERPORT_DEAD, &serport->flags)); serio_unregister_port(serport->serio); diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c index b2d760055952..894d3a84e285 100644 --- a/drivers/tty/amiserial.c +++ b/drivers/tty/amiserial.c @@ -966,9 +966,7 @@ static void rs_throttle(struct tty_struct * tty) struct serial_state *info = tty->driver_data; unsigned long flags; #ifdef SERIAL_DEBUG_THROTTLE - char buf[64]; - - printk("throttle %s: %d....\n", tty_name(tty, buf), + printk("throttle %s: %d....\n", tty_name(tty), tty->ldisc.chars_in_buffer(tty)); #endif @@ -991,9 +989,7 @@ static void rs_unthrottle(struct tty_struct * tty) struct serial_state *info = tty->driver_data; unsigned long flags; #ifdef SERIAL_DEBUG_THROTTLE - char buf[64]; - - printk("unthrottle %s: %d....\n", tty_name(tty, buf), + printk("unthrottle %s: %d....\n", tty_name(tty), tty->ldisc.chars_in_buffer(tty)); #endif diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c index fd66f57390d0..87f6578c6f4a 100644 --- a/drivers/tty/cyclades.c +++ b/drivers/tty/cyclades.c @@ -2861,9 +2861,7 @@ static void cy_throttle(struct tty_struct *tty) unsigned long flags; #ifdef CY_DEBUG_THROTTLE - char buf[64]; - - printk(KERN_DEBUG "cyc:throttle %s: %ld...ttyC%d\n", tty_name(tty, buf), + printk(KERN_DEBUG "cyc:throttle %s: %ld...ttyC%d\n", tty_name(tty), tty->ldisc.chars_in_buffer(tty), info->line); #endif @@ -2902,10 +2900,8 @@ static void cy_unthrottle(struct tty_struct *tty) unsigned long flags; #ifdef CY_DEBUG_THROTTLE - char buf[64]; - printk(KERN_DEBUG "cyc:unthrottle %s: %ld...ttyC%d\n", - tty_name(tty, buf), tty_chars_in_buffer(tty), info->line); + tty_name(tty), tty_chars_in_buffer(tty), info->line); #endif if (serial_paranoia_check(info, tty->name, "cy_unthrottle")) diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index 91abc00aa833..7e039669fc82 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -2274,7 +2274,6 @@ static void gsmld_receive_buf(struct tty_struct *tty, const unsigned char *cp, const unsigned char *dp; char *f; int i; - char buf[64]; char flags = TTY_NORMAL; if (debug & 4) @@ -2296,7 +2295,7 @@ static void gsmld_receive_buf(struct tty_struct *tty, const unsigned char *cp, break; default: WARN_ONCE(1, "%s: unknown flag %d\n", - tty_name(tty, buf), flags); + tty_name(tty), flags); break; } } diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index cf6e0f2e1331..54da8f49394d 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -1179,13 +1179,12 @@ static void n_tty_receive_break(struct tty_struct *tty) static void n_tty_receive_overrun(struct tty_struct *tty) { struct n_tty_data *ldata = tty->disc_data; - char buf[64]; ldata->num_overrun++; if (time_after(jiffies, ldata->overrun_time + HZ) || time_after(ldata->overrun_time, jiffies)) { printk(KERN_WARNING "%s: %d input overrun(s)\n", - tty_name(tty, buf), + tty_name(tty), ldata->num_overrun); ldata->overrun_time = jiffies; ldata->num_overrun = 0; @@ -1460,8 +1459,6 @@ static void n_tty_receive_char_closing(struct tty_struct *tty, unsigned char c) static void n_tty_receive_char_flagged(struct tty_struct *tty, unsigned char c, char flag) { - char buf[64]; - switch (flag) { case TTY_BREAK: n_tty_receive_break(tty); @@ -1475,7 +1472,7 @@ n_tty_receive_char_flagged(struct tty_struct *tty, unsigned char c, char flag) break; default: printk(KERN_ERR "%s: unknown flag %d\n", - tty_name(tty, buf), flag); + tty_name(tty), flag); break; } } diff --git a/drivers/tty/serial/crisv10.c b/drivers/tty/serial/crisv10.c index 0c1825b0b41d..568ea0d2d699 100644 --- a/drivers/tty/serial/crisv10.c +++ b/drivers/tty/serial/crisv10.c @@ -3216,9 +3216,7 @@ rs_throttle(struct tty_struct * tty) { struct e100_serial *info = (struct e100_serial *)tty->driver_data; #ifdef SERIAL_DEBUG_THROTTLE - char buf[64]; - - printk("throttle %s: %lu....\n", tty_name(tty, buf), + printk("throttle %s: %lu....\n", tty_name(tty), (unsigned long)tty->ldisc.chars_in_buffer(tty)); #endif DFLOW(DEBUG_LOG(info->line,"rs_throttle %lu\n", tty->ldisc.chars_in_buffer(tty))); @@ -3238,9 +3236,7 @@ rs_unthrottle(struct tty_struct * tty) { struct e100_serial *info = (struct e100_serial *)tty->driver_data; #ifdef SERIAL_DEBUG_THROTTLE - char buf[64]; - - printk("unthrottle %s: %lu....\n", tty_name(tty, buf), + printk("unthrottle %s: %lu....\n", tty_name(tty), (unsigned long)tty->ldisc.chars_in_buffer(tty)); #endif DFLOW(DEBUG_LOG(info->line,"rs_unthrottle ldisc %d\n", tty->ldisc.chars_in_buffer(tty))); diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 0b7bb12dfc68..eec067d8eedb 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -894,12 +894,10 @@ static int uart_set_info(struct tty_struct *tty, struct tty_port *port, * need to rate-limit; it's CAP_SYS_ADMIN only. */ if (uport->flags & UPF_SPD_MASK) { - char buf[64]; - dev_notice(uport->dev, "%s sets custom speed on %s. This is deprecated.\n", current->comm, - tty_name(port->tty, buf)); + tty_name(port->tty)); } uart_change_speed(tty, state, NULL); } diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 1eaf0fbd99e4..57fc6ee12332 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -235,7 +235,6 @@ static void tty_del_file(struct file *file) /** * tty_name - return tty naming * @tty: tty structure - * @buf: unused * * Convert a tty structure into a name. The name reflects the kernel * naming policy and if udev is in use may not reflect user space @@ -243,7 +242,7 @@ static void tty_del_file(struct file *file) * Locking: none */ -const char *tty_name(const struct tty_struct *tty, char *buf) +const char *tty_name(const struct tty_struct *tty) { if (!tty) /* Hmm. NULL pointer. That's fun. */ return "NULL tty"; @@ -768,8 +767,7 @@ static void do_tty_hangup(struct work_struct *work) void tty_hangup(struct tty_struct *tty) { #ifdef TTY_DEBUG_HANGUP - char buf[64]; - printk(KERN_DEBUG "%s hangup...\n", tty_name(tty, buf)); + printk(KERN_DEBUG "%s hangup...\n", tty_name(tty)); #endif schedule_work(&tty->hangup_work); } @@ -788,9 +786,7 @@ EXPORT_SYMBOL(tty_hangup); void tty_vhangup(struct tty_struct *tty) { #ifdef TTY_DEBUG_HANGUP - char buf[64]; - - printk(KERN_DEBUG "%s vhangup...\n", tty_name(tty, buf)); + printk(KERN_DEBUG "%s vhangup...\n", tty_name(tty)); #endif __tty_hangup(tty, 0); } @@ -829,9 +825,7 @@ void tty_vhangup_self(void) static void tty_vhangup_session(struct tty_struct *tty) { #ifdef TTY_DEBUG_HANGUP - char buf[64]; - - printk(KERN_DEBUG "%s vhangup session...\n", tty_name(tty, buf)); + printk(KERN_DEBUG "%s vhangup session...\n", tty_name(tty)); #endif __tty_hangup(tty, 1); } @@ -1767,7 +1761,6 @@ int tty_release(struct inode *inode, struct file *filp) struct tty_struct *o_tty = NULL; int do_sleep, final; int idx; - char buf[64]; long timeout = 0; int once = 1; @@ -1791,7 +1784,7 @@ int tty_release(struct inode *inode, struct file *filp) #ifdef TTY_DEBUG_HANGUP printk(KERN_DEBUG "%s: %s (tty count=%d)...\n", __func__, - tty_name(tty, buf), tty->count); + tty_name(tty), tty->count); #endif if (tty->ops->close) @@ -1842,7 +1835,7 @@ int tty_release(struct inode *inode, struct file *filp) if (once) { once = 0; printk(KERN_WARNING "%s: %s: read/write wait queue active!\n", - __func__, tty_name(tty, buf)); + __func__, tty_name(tty)); } schedule_timeout_killable(timeout); if (timeout < 120 * HZ) @@ -1854,13 +1847,13 @@ int tty_release(struct inode *inode, struct file *filp) if (o_tty) { if (--o_tty->count < 0) { printk(KERN_WARNING "%s: bad pty slave count (%d) for %s\n", - __func__, o_tty->count, tty_name(o_tty, buf)); + __func__, o_tty->count, tty_name(o_tty)); o_tty->count = 0; } } if (--tty->count < 0) { printk(KERN_WARNING "%s: bad tty->count (%d) for %s\n", - __func__, tty->count, tty_name(tty, buf)); + __func__, tty->count, tty_name(tty)); tty->count = 0; } @@ -1903,7 +1896,7 @@ int tty_release(struct inode *inode, struct file *filp) return 0; #ifdef TTY_DEBUG_HANGUP - printk(KERN_DEBUG "%s: %s: final close\n", __func__, tty_name(tty, buf)); + printk(KERN_DEBUG "%s: %s: final close\n", __func__, tty_name(tty)); #endif /* * Ask the line discipline code to release its structures @@ -1914,7 +1907,8 @@ int tty_release(struct inode *inode, struct file *filp) tty_flush_works(tty); #ifdef TTY_DEBUG_HANGUP - printk(KERN_DEBUG "%s: %s: freeing structure...\n", __func__, tty_name(tty, buf)); + printk(KERN_DEBUG "%s: %s: freeing structure...\n", __func__, + tty_name(tty)); #endif /* * The release_tty function takes care of the details of clearing diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c index 8e53fe469664..5232fb60b0b1 100644 --- a/drivers/tty/tty_ioctl.c +++ b/drivers/tty/tty_ioctl.c @@ -211,9 +211,7 @@ int tty_unthrottle_safe(struct tty_struct *tty) void tty_wait_until_sent(struct tty_struct *tty, long timeout) { #ifdef TTY_DEBUG_WAIT_UNTIL_SENT - char buf[64]; - - printk(KERN_DEBUG "%s wait until sent...\n", tty_name(tty, buf)); + printk(KERN_DEBUG "%s wait until sent...\n", tty_name(tty)); #endif if (!timeout) timeout = MAX_SCHEDULE_TIMEOUT; diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index 3737f55272d2..c07fb5d9bcf9 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -22,9 +22,8 @@ #undef LDISC_DEBUG_HANGUP #ifdef LDISC_DEBUG_HANGUP -#define tty_ldisc_debug(tty, f, args...) ({ \ - char __b[64]; \ - printk(KERN_DEBUG "%s: %s: " f, __func__, tty_name(tty, __b), ##args); \ +#define tty_ldisc_debug(tty, f, args...) ({ \ + printk(KERN_DEBUG "%s: %s: " f, __func__, tty_name(tty), ##args); \ }) #else #define tty_ldisc_debug(tty, f, args...) @@ -483,7 +482,6 @@ static void tty_ldisc_close(struct tty_struct *tty, struct tty_ldisc *ld) static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old) { - char buf[64]; struct tty_ldisc *new_ldisc; int r; @@ -504,7 +502,7 @@ static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old) if (r < 0) panic("Couldn't open N_TTY ldisc for " "%s --- error %d.", - tty_name(tty, buf), r); + tty_name(tty), r); } } diff --git a/include/linux/tty.h b/include/linux/tty.h index 4cbecfc7b3c9..9a72c9144d8a 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -421,7 +421,7 @@ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) extern int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, const char *routine); -extern const char *tty_name(const struct tty_struct *tty, char *buf); +extern const char *tty_name(const struct tty_struct *tty); extern void tty_wait_until_sent(struct tty_struct *tty, long timeout); extern int tty_check_change(struct tty_struct *tty); extern void __stop_tty(struct tty_struct *tty); -- cgit v1.2.3 From 6b3cddccf4eec0883feb065aea28dd9770bb17d0 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 11 Apr 2015 11:02:36 -0400 Subject: serial: core: Fix unused variable warnings from uart_console() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_SERIAL_CORE_CONSOLE=n, build warnings are generated by uart_console() macro expansion: drivers/tty/serial/of_serial.c: In function ‘of_serial_suspend_8250’: drivers/tty/serial/of_serial.c:262:20: warning: unused variable ‘port’ [-Wunused-variable] struct uart_port *port = &port8250->port; ^ drivers/tty/serial/of_serial.c: In function ‘of_serial_resume_8250’: drivers/tty/serial/of_serial.c:272:20: warning: unused variable ‘port’ [-Wunused-variable] struct uart_port *port = &port8250->port; Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 025dad9dcde4..297d4fa1cfe5 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -35,7 +35,7 @@ #define uart_console(port) \ ((port)->cons && (port)->cons->index == (port)->line) #else -#define uart_console(port) (0) +#define uart_console(port) ({ (void)port; 0; }) #endif struct uart_port; -- cgit v1.2.3 From 17799359e7b3fa6ef4f2bf926cd6821cf7903ecf Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Sat, 28 Feb 2015 02:13:11 -0800 Subject: mtd: nand_bbt: make nand_scan_bbt() static This implementation detail is no longer needed outside of nand_bbt.c. Signed-off-by: Brian Norris --- drivers/mtd/nand/nand_bbt.c | 2 +- include/linux/mtd/nand.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c index 307a285afb78..53e17586fbed 100644 --- a/drivers/mtd/nand/nand_bbt.c +++ b/drivers/mtd/nand/nand_bbt.c @@ -1074,7 +1074,7 @@ static void verify_bbt_descr(struct mtd_info *mtd, struct nand_bbt_descr *bd) * The bad block table memory is allocated here. It must be freed by calling * the nand_free_bbt function. */ -int nand_scan_bbt(struct mtd_info *mtd, struct nand_bbt_descr *bd) +static int nand_scan_bbt(struct mtd_info *mtd, struct nand_bbt_descr *bd) { struct nand_chip *this = mtd->priv; int len, res = 0; diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 3d4ea7eb2b68..6c51876941f3 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -833,7 +833,6 @@ struct nand_manufacturers { extern struct nand_flash_dev nand_flash_ids[]; extern struct nand_manufacturers nand_manuf_ids[]; -extern int nand_scan_bbt(struct mtd_info *mtd, struct nand_bbt_descr *bd); extern int nand_default_bbt(struct mtd_info *mtd); extern int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); extern int nand_isreserved_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3 From 0097d12e504b3ce57b68810737ad6a5a64a98c68 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 30 Apr 2015 13:43:30 +0200 Subject: KVM: provide irq_unsafe kvm_guest_{enter|exit} Several kvm architectures disable interrupts before kvm_guest_enter. kvm_guest_enter then uses local_irq_save/restore to disable interrupts again or for the first time. Lets provide underscore versions of kvm_guest_{enter|exit} that assume being called locked. kvm_guest_enter now disables interrupts for the full function and thus we can remove the check for preemptible. This patch then adopts s390/kvm to use local_irq_disable/enable calls which are slighty cheaper that local_irq_save/restore and call these new functions. Signed-off-by: Christian Borntraeger Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 10 ++++++---- include/linux/kvm_host.h | 27 ++++++++++++++++++--------- 2 files changed, 24 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 8cd8e7b288c5..2be391bb8557 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1993,12 +1993,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) * As PF_VCPU will be used in fault handler, between * guest_enter and guest_exit should be no uaccess. */ - preempt_disable(); - kvm_guest_enter(); - preempt_enable(); + local_irq_disable(); + __kvm_guest_enter(); + local_irq_enable(); exit_reason = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); - kvm_guest_exit(); + local_irq_disable(); + __kvm_guest_exit(); + local_irq_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); rc = vcpu_post_run(vcpu, exit_reason); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ad45054309a0..efc16df1fc5d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -762,16 +762,10 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm, } #endif -static inline void kvm_guest_enter(void) +/* must be called with irqs disabled */ +static inline void __kvm_guest_enter(void) { - unsigned long flags; - - BUG_ON(preemptible()); - - local_irq_save(flags); guest_enter(); - local_irq_restore(flags); - /* KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode * is very similar to exiting to userspace from rcu point of view. In @@ -783,12 +777,27 @@ static inline void kvm_guest_enter(void) rcu_virt_note_context_switch(smp_processor_id()); } +/* must be called with irqs disabled */ +static inline void __kvm_guest_exit(void) +{ + guest_exit(); +} + +static inline void kvm_guest_enter(void) +{ + unsigned long flags; + + local_irq_save(flags); + __kvm_guest_enter(); + local_irq_restore(flags); +} + static inline void kvm_guest_exit(void) { unsigned long flags; local_irq_save(flags); - guest_exit(); + __kvm_guest_exit(); local_irq_restore(flags); } -- cgit v1.2.3 From 653f52c316a49c5ee2701bc13b15879f20790662 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 23 Apr 2015 11:52:37 -0400 Subject: kvm,x86: load guest FPU context more eagerly Currently KVM will clear the FPU bits in CR0.TS in the VMCS, and trap to re-load them every time the guest accesses the FPU after a switch back into the guest from the host. This patch copies the x86 task switch semantics for FPU loading, with the FPU loaded eagerly after first use if the system uses eager fpu mode, or if the guest uses the FPU frequently. In the latter case, after loading the FPU for 255 times, the fpu_counter will roll over, and we will revert to loading the FPU on demand, until it has been established that the guest is still actively using the FPU. This mirrors the x86 task switch policy, which seems to work. Signed-off-by: Rik van Riel Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 15 +++++++++++++-- include/linux/kvm_host.h | 1 + 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bf80ce7656b7..c42134e98e31 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7086,14 +7086,25 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { kvm_put_guest_xcr0(vcpu); - if (!vcpu->guest_fpu_loaded) + if (!vcpu->guest_fpu_loaded) { + vcpu->fpu_counter = 0; return; + } vcpu->guest_fpu_loaded = 0; fpu_save_init(&vcpu->arch.guest_fpu); __kernel_fpu_end(); ++vcpu->stat.fpu_reload; - kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); + /* + * If using eager FPU mode, or if the guest is a frequent user + * of the FPU, just leave the FPU active for next time. + * Every 255 times fpu_counter rolls over to 0; a guest that uses + * the FPU in bursts will revert to loading it on demand. + */ + if (!use_eager_fpu()) { + if (++vcpu->fpu_counter < 5) + kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); + } trace_kvm_fpu(0); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index efc16df1fc5d..b7a08cd6f4a8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -230,6 +230,7 @@ struct kvm_vcpu { int fpu_active; int guest_fpu_loaded, guest_xcr0_loaded; + unsigned char fpu_counter; wait_queue_head_t wq; struct pid *pid; int sigset_active; -- cgit v1.2.3 From aed5ed47724f6a7453fa62e3c90f3cee93edbfe3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 6 May 2015 18:04:23 +0200 Subject: context_tracking: Protect against recursion Context tracking recursion can happen when an exception triggers in the middle of a call to a context tracking probe. This special case can be caused by vmalloc faults. If an access to a memory area allocated by vmalloc happens in the middle of context_tracking_enter(), we may run into an endless fault loop because the exception in turn calls context_tracking_enter() which faults on the same vmalloc'ed memory, triggering an exception again, etc... Some rare crashes have been reported so lets protect against this with a recursion counter. Reported-by: Dave Jones Signed-off-by: Frederic Weisbecker Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Chris Metcalf Cc: H. Peter Anvin Cc: Martin Schwidefsky Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Rafael J . Wysocki Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1430928266-24888-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/context_tracking_state.h | 1 + kernel/context_tracking.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 6b7b96a32b75..678ecdf90cf6 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -12,6 +12,7 @@ struct context_tracking { * may be further optimized using static keys. */ bool active; + int recursion; enum ctx_state { CONTEXT_KERNEL = 0, CONTEXT_USER, diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 72d59a1a6eb6..5b11a10e196a 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -38,6 +38,25 @@ void context_tracking_cpu_set(int cpu) } } +static bool context_tracking_recursion_enter(void) +{ + int recursion; + + recursion = __this_cpu_inc_return(context_tracking.recursion); + if (recursion == 1) + return true; + + WARN_ONCE((recursion < 1), "Invalid context tracking recursion value %d\n", recursion); + __this_cpu_dec(context_tracking.recursion); + + return false; +} + +static void context_tracking_recursion_exit(void) +{ + __this_cpu_dec(context_tracking.recursion); +} + /** * context_tracking_enter - Inform the context tracking that the CPU is going * enter user or guest space mode. @@ -75,6 +94,9 @@ void context_tracking_enter(enum ctx_state state) WARN_ON_ONCE(!current->mm); local_irq_save(flags); + if (!context_tracking_recursion_enter()) + goto out_irq_restore; + if ( __this_cpu_read(context_tracking.state) != state) { if (__this_cpu_read(context_tracking.active)) { /* @@ -105,6 +127,8 @@ void context_tracking_enter(enum ctx_state state) */ __this_cpu_write(context_tracking.state, state); } + context_tracking_recursion_exit(); +out_irq_restore: local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_enter); @@ -139,6 +163,9 @@ void context_tracking_exit(enum ctx_state state) return; local_irq_save(flags); + if (!context_tracking_recursion_enter()) + goto out_irq_restore; + if (__this_cpu_read(context_tracking.state) == state) { if (__this_cpu_read(context_tracking.active)) { /* @@ -153,6 +180,8 @@ void context_tracking_exit(enum ctx_state state) } __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } + context_tracking_recursion_exit(); +out_irq_restore: local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_exit); -- cgit v1.2.3 From fafe870f31212a72f3c2d74e7b90e4ef39e83ee1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 6 May 2015 18:04:24 +0200 Subject: context_tracking: Inherit TIF_NOHZ through forks instead of context switches TIF_NOHZ is used by context_tracking to force syscall slow-path on every task in order to track userspace roundtrips. As such, it must be set on all running tasks. It's currently explicitly inherited through context switches. There is no need to do it in this fast-path though. The flag could simply be set once for all on all tasks, whether they are running or not. Lets do this by setting the flag for the init task on early boot, and let it propagate through fork inheritance. While at it, mark context_tracking_cpu_set() as init code, we only need it at early boot time. Suggested-by: Oleg Nesterov Signed-off-by: Frederic Weisbecker Reviewed-by: Rik van Riel Cc: Borislav Petkov Cc: Chris Metcalf Cc: Dave Jones Cc: H. Peter Anvin Cc: Martin Schwidefsky Cc: Mike Galbraith Cc: Paul E . McKenney Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1430928266-24888-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/context_tracking.h | 10 --------- include/linux/sched.h | 3 +++ kernel/context_tracking.c | 44 +++++++++++++++++----------------------- kernel/sched/core.c | 1 - 4 files changed, 22 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 2821838256b4..b96bd299966f 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -14,8 +14,6 @@ extern void context_tracking_enter(enum ctx_state state); extern void context_tracking_exit(enum ctx_state state); extern void context_tracking_user_enter(void); extern void context_tracking_user_exit(void); -extern void __context_tracking_task_switch(struct task_struct *prev, - struct task_struct *next); static inline void user_enter(void) { @@ -51,19 +49,11 @@ static inline void exception_exit(enum ctx_state prev_ctx) } } -static inline void context_tracking_task_switch(struct task_struct *prev, - struct task_struct *next) -{ - if (context_tracking_is_enabled()) - __context_tracking_task_switch(prev, next); -} #else static inline void user_enter(void) { } static inline void user_exit(void) { } static inline enum ctx_state exception_enter(void) { return 0; } static inline void exception_exit(enum ctx_state prev_ctx) { } -static inline void context_tracking_task_switch(struct task_struct *prev, - struct task_struct *next) { } #endif /* !CONFIG_CONTEXT_TRACKING */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e6122734..185a750e4ed4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2532,6 +2532,9 @@ static inline unsigned long wait_task_inactive(struct task_struct *p, } #endif +#define tasklist_empty() \ + list_empty(&init_task.tasks) + #define next_task(p) \ list_entry_rcu((p)->tasks.next, struct task_struct, tasks) diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 5b11a10e196a..0a495ab35bc7 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -30,14 +30,6 @@ EXPORT_SYMBOL_GPL(context_tracking_enabled); DEFINE_PER_CPU(struct context_tracking, context_tracking); EXPORT_SYMBOL_GPL(context_tracking); -void context_tracking_cpu_set(int cpu) -{ - if (!per_cpu(context_tracking.active, cpu)) { - per_cpu(context_tracking.active, cpu) = true; - static_key_slow_inc(&context_tracking_enabled); - } -} - static bool context_tracking_recursion_enter(void) { int recursion; @@ -193,24 +185,26 @@ void context_tracking_user_exit(void) } NOKPROBE_SYMBOL(context_tracking_user_exit); -/** - * __context_tracking_task_switch - context switch the syscall callbacks - * @prev: the task that is being switched out - * @next: the task that is being switched in - * - * The context tracking uses the syscall slow path to implement its user-kernel - * boundaries probes on syscalls. This way it doesn't impact the syscall fast - * path on CPUs that don't do context tracking. - * - * But we need to clear the flag on the previous task because it may later - * migrate to some CPU that doesn't do the context tracking. As such the TIF - * flag may not be desired there. - */ -void __context_tracking_task_switch(struct task_struct *prev, - struct task_struct *next) +void __init context_tracking_cpu_set(int cpu) { - clear_tsk_thread_flag(prev, TIF_NOHZ); - set_tsk_thread_flag(next, TIF_NOHZ); + static __initdata bool initialized = false; + + if (!per_cpu(context_tracking.active, cpu)) { + per_cpu(context_tracking.active, cpu) = true; + static_key_slow_inc(&context_tracking_enabled); + } + + if (initialized) + return; + + /* + * Set TIF_NOHZ to init/0 and let it propagate to all tasks through fork + * This assumes that init is the only task at this early boot stage. + */ + set_tsk_thread_flag(&init_task, TIF_NOHZ); + WARN_ON_ONCE(!tasklist_empty()); + + initialized = true; } #ifdef CONFIG_CONTEXT_TRACKING_FORCE diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe22f7510bce..54f032cea040 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2332,7 +2332,6 @@ context_switch(struct rq *rq, struct task_struct *prev, */ spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - context_tracking_task_switch(prev, next); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); barrier(); -- cgit v1.2.3 From 83dedea8a07fb4bf91863764b15c1c4ec00330f9 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 6 May 2015 18:04:25 +0200 Subject: nohz: Add tick_nohz_full_add_cpus_to() API This API is useful to modify a cpumask indicating some special nohz-type functionality so that the nohz cores are automatically added to that set. Signed-off-by: Chris Metcalf Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Dave Jones Cc: H. Peter Anvin Cc: Martin Schwidefsky Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Rafael J. Wysocki Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1429024675-18938-1-git-send-email-cmetcalf@ezchip.com Link: http://lkml.kernel.org/r/1430928266-24888-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/tick.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index f8492da57ad3..4191b5623a28 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -134,6 +134,12 @@ static inline bool tick_nohz_full_cpu(int cpu) return cpumask_test_cpu(cpu, tick_nohz_full_mask); } +static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) +{ + if (tick_nohz_full_enabled()) + cpumask_or(mask, mask, tick_nohz_full_mask); +} + extern void __tick_nohz_full_check(void); extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_cpu(int cpu); @@ -142,6 +148,7 @@ extern void __tick_nohz_task_switch(struct task_struct *tsk); #else static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } +static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { } static inline void __tick_nohz_full_check(void) { } static inline void tick_nohz_full_kick_cpu(int cpu) { } static inline void tick_nohz_full_kick(void) { } -- cgit v1.2.3 From c6201cd8513db2db54b248a862672849ed9ccb82 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 7 May 2015 09:52:22 -0500 Subject: PCI/MSI: Remove unused pci_msi_off() pci_msi_off() is unused, so remove it. Removes the exported symbol pci_msi_off(). Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 33 --------------------------------- include/linux/pci.h | 1 - 2 files changed, 34 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index acc4b6ef78c4..687af7264406 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3101,39 +3101,6 @@ bool pci_check_and_unmask_intx(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_check_and_unmask_intx); -/** - * pci_msi_off - disables any MSI or MSI-X capabilities - * @dev: the PCI device to operate on - * - * If you want to use MSI, see pci_enable_msi() and friends. - * This is a lower-level primitive that allows us to disable - * MSI operation at the device level. - */ -void pci_msi_off(struct pci_dev *dev) -{ - int pos; - u16 control; - - /* - * This looks like it could go in msi.c, but we need it even when - * CONFIG_PCI_MSI=n. For the same reason, we can't use - * dev->msi_cap or dev->msix_cap here. - */ - pos = pci_find_capability(dev, PCI_CAP_ID_MSI); - if (pos) { - pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control); - control &= ~PCI_MSI_FLAGS_ENABLE; - pci_write_config_word(dev, pos + PCI_MSI_FLAGS, control); - } - pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); - if (pos) { - pci_read_config_word(dev, pos + PCI_MSIX_FLAGS, &control); - control &= ~PCI_MSIX_FLAGS_ENABLE; - pci_write_config_word(dev, pos + PCI_MSIX_FLAGS, control); - } -} -EXPORT_SYMBOL_GPL(pci_msi_off); - int pci_set_dma_max_seg_size(struct pci_dev *dev, unsigned int size) { return dma_set_max_seg_size(&dev->dev, size); diff --git a/include/linux/pci.h b/include/linux/pci.h index 353db8dc4c6e..50b7c7d0206f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -974,7 +974,6 @@ void pci_intx(struct pci_dev *dev, int enable); bool pci_intx_mask_supported(struct pci_dev *dev); bool pci_check_and_mask_intx(struct pci_dev *dev); bool pci_check_and_unmask_intx(struct pci_dev *dev); -void pci_msi_off(struct pci_dev *dev); int pci_set_dma_max_seg_size(struct pci_dev *dev, unsigned int size); int pci_set_dma_seg_boundary(struct pci_dev *dev, unsigned long mask); int pci_wait_for_pending(struct pci_dev *dev, int pos, u16 mask); -- cgit v1.2.3 From 385f83f85cd9428db82cae5e6f6f786be113b24c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 28 Apr 2015 12:21:40 +0200 Subject: dmaengine: Remove Renesas Audio DMAC peri peri platform data Commit 3cd44dcd35a6 ("dmaengine: remove Renesas Audio DMAC peri peri") forgot to remove the header file with the platform data definitions. Signed-off-by: Geert Uytterhoeven Acked-by: Simon Horman Signed-off-by: Vinod Koul --- include/linux/platform_data/dma-rcar-audmapp.h | 34 -------------------------- 1 file changed, 34 deletions(-) delete mode 100644 include/linux/platform_data/dma-rcar-audmapp.h (limited to 'include/linux') diff --git a/include/linux/platform_data/dma-rcar-audmapp.h b/include/linux/platform_data/dma-rcar-audmapp.h deleted file mode 100644 index 471fffebbeb4..000000000000 --- a/include/linux/platform_data/dma-rcar-audmapp.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * This is for Renesas R-Car Audio-DMAC-peri-peri. - * - * Copyright (C) 2014 Renesas Electronics Corporation - * Copyright (C) 2014 Kuninori Morimoto - * - * This file is based on the include/linux/sh_dma.h - * - * Header for the new SH dmaengine driver - * - * Copyright (C) 2010 Guennadi Liakhovetski - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#ifndef SH_AUDMAPP_H -#define SH_AUDMAPP_H - -#include - -struct audmapp_slave_config { - int slave_id; - dma_addr_t src; - dma_addr_t dst; - u32 chcr; -}; - -struct audmapp_pdata { - struct audmapp_slave_config *slave; - int slave_num; -}; - -#endif /* SH_AUDMAPP_H */ -- cgit v1.2.3 From 3289bdb429884c0279bf9ab72dff7b934f19dfc6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 14 Apr 2015 13:19:42 +0200 Subject: sched: Move the loadavg code to a more obvious location I could not find the loadavg code.. turns out it was hidden in a file called proc.c. It further got mingled up with the cruft per rq load indexes (which we really want to get rid of). Move the per rq load indexes into the fair.c load-balance code (that's the only thing that uses them) and rename proc.c to loadavg.c so we can find it again. Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Paul Gortmaker Cc: Thomas Gleixner [ Did minor cleanups to the code. ] Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 + kernel/sched/Makefile | 2 +- kernel/sched/core.c | 7 +- kernel/sched/fair.c | 183 ++++++++++++++++ kernel/sched/loadavg.c | 394 +++++++++++++++++++++++++++++++++ kernel/sched/proc.c | 584 ------------------------------------------------- kernel/sched/sched.h | 8 +- 7 files changed, 593 insertions(+), 590 deletions(-) create mode 100644 kernel/sched/loadavg.c delete mode 100644 kernel/sched/proc.c (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e6122734..85cf253bc366 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -173,7 +173,12 @@ extern unsigned long nr_iowait_cpu(int cpu); extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); extern void calc_global_load(unsigned long ticks); + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void update_cpu_load_nohz(void); +#else +static inline void update_cpu_load_nohz(void) { } +#endif extern unsigned long get_parent_ip(unsigned long addr); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 46be87024875..67687973ce80 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o proc.o clock.o cputime.o +obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fdf972d56f65..527fc28a737a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2397,9 +2397,9 @@ unsigned long nr_iowait_cpu(int cpu) void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) { - struct rq *this = this_rq(); - *nr_waiters = atomic_read(&this->nr_iowait); - *load = this->cpu_load[0]; + struct rq *rq = this_rq(); + *nr_waiters = atomic_read(&rq->nr_iowait); + *load = rq->load.weight; } #ifdef CONFIG_SMP @@ -2497,6 +2497,7 @@ void scheduler_tick(void) update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); + calc_global_load_tick(rq); raw_spin_unlock(&rq->lock); perf_event_task_tick(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ffeaa4105e48..4bc6013886ec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4323,6 +4323,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP + +/* + * per rq 'load' arrray crap; XXX kill this. + */ + +/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. + */ +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) +{ + int i, scale; + + this_rq->nr_load_updates++; + + /* Update our load: */ + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; + } + + sched_avg_update(this_rq); +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + +/* + * Called from nohz_idle_balance() to update the load ratings before doing the + * idle balance. + */ +static void update_idle_cpu_load(struct rq *this_rq) +{ + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long load = this_rq->cfs.runnable_load_avg; + unsigned long pending_updates; + + /* + * bail if there's load or we're actually up-to-date. + */ + if (load || curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + + __update_cpu_load(this_rq, load, pending_updates); +} + +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + +/* + * Called from scheduler_tick() + */ +void update_cpu_load_active(struct rq *this_rq) +{ + unsigned long load = this_rq->cfs.runnable_load_avg; + /* + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + */ + this_rq->last_load_update_tick = jiffies; + __update_cpu_load(this_rq, load, 1); +} + /* Used instead of source_load when we know the type == 0 */ static unsigned long weighted_cpuload(const int cpu) { diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c new file mode 100644 index 000000000000..ef7159012cf3 --- /dev/null +++ b/kernel/sched/loadavg.c @@ -0,0 +1,394 @@ +/* + * kernel/sched/loadavg.c + * + * This file contains the magic bits required to compute the global loadavg + * figure. Its a silly number but people think its important. We go through + * great pains to make it work on big machines and tickless kernels. + */ + +#include + +#include "sched.h" + +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + * nr_active = 0; + * for_each_possible_cpu(cpu) + * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + * - for_each_possible_cpu() is prohibitively expensive on machines with + * serious number of cpus, therefore we need to take a distributed approach + * to calculating nr_active. + * + * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + * So assuming nr_active := 0 when we start out -- true per definition, we + * can simply take per-cpu deltas and fold those into a global accumulate + * to obtain the same result. See calc_load_fold_active(). + * + * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * across the machine, we assume 10 ticks is sufficient time for every + * cpu to have completed this task. + * + * This places an upper-bound on the IRQ-off latency of the machine. Then + * again, being late doesn't loose the delta, just wrecks the sample. + * + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + * this would add another cross-cpu cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever cpu the task ran + * when it went into uninterruptible state and decrement on whatever cpu + * did the wakeup. This means that only the sum of nr_uninterruptible over + * all cpus yields the correct result. + * + * This covers the NO_HZ=n code, for extra head-aches, see the comment below. + */ + +/* Variables and functions for calc_load */ +atomic_long_t calc_load_tasks; +unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} + +long calc_load_fold_active(struct rq *this_rq) +{ + long nr_active, delta = 0; + + nr_active = this_rq->nr_running; + nr_active += (long)this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + } + + return delta; +} + +/* + * a1 = a0 * e + a * (1 - e) + */ +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + load *= exp; + load += active * (FIXED_1 - exp); + load += 1UL << (FSHIFT - 1); + return load >> FSHIFT; +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + * - When we go NO_HZ idle during the window, we can negate our sample + * contribution, causing under-accounting. + * + * We avoid this by keeping two idle-delta counters and flipping them + * when the window starts, thus separating old and new NO_HZ load. + * + * The only trick is the slight shift in index flip for read vs write. + * + * 0s 5s 10s 15s + * +10 +10 +10 +10 + * |-|-----------|-|-----------|-|-----------|-| + * r:0 0 1 1 0 0 1 1 0 + * w:0 1 1 0 0 1 1 0 0 + * + * This ensures we'll fold the old idle contribution in this window while + * accumlating the new one. + * + * - When we wake up from NO_HZ idle during the window, we push up our + * contribution, since we effectively move our sample point to a known + * busy state. + * + * This is solved by pushing the window forward, and thus skipping the + * sample, for this cpu (effectively using the idle-delta for this cpu which + * was in effect at the time the window opened). This also solves the issue + * of having to deal with a cpu having been in NOHZ idle for multiple + * LOAD_FREQ intervals. + * + * When making the ILB scale, we should try to pull this in as well. + */ +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; + +static inline int calc_load_write_idx(void) +{ + int idx = calc_load_idx; + + /* + * See calc_global_nohz(), if we observe the new index, we also + * need to observe the new update time. + */ + smp_rmb(); + + /* + * If the folding window started, make sure we start writing in the + * next idle-delta. + */ + if (!time_before(jiffies, calc_load_update)) + idx++; + + return idx & 1; +} + +static inline int calc_load_read_idx(void) +{ + return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ + struct rq *this_rq = this_rq(); + long delta; + + /* + * We're going into NOHZ mode, if there's any pending delta, fold it + * into the pending idle delta. + */ + delta = calc_load_fold_active(this_rq); + if (delta) { + int idx = calc_load_write_idx(); + + atomic_long_add(delta, &calc_load_idle[idx]); + } +} + +void calc_load_exit_idle(void) +{ + struct rq *this_rq = this_rq(); + + /* + * If we're still before the sample window, we're done. + */ + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + /* + * We woke inside or after the sample window, this means we're already + * accounted through the nohz accounting, so skip the entire deal and + * sync up for the next window. + */ + this_rq->calc_load_update = calc_load_update; + if (time_before(jiffies, this_rq->calc_load_update + 10)) + this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ + int idx = calc_load_read_idx(); + long delta = 0; + + if (atomic_long_read(&calc_load_idle[idx])) + delta = atomic_long_xchg(&calc_load_idle[idx], 0); + + return delta; +} + +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. + */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) { + for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + } + + return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +static unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + +/* + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary. + * + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed. + */ +static void calc_global_nohz(void) +{ + long delta, active, n; + + if (!time_before(jiffies, calc_load_update + 10)) { + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + + calc_load_update += n * LOAD_FREQ; + } + + /* + * Flip the idle index... + * + * Make sure we first write the new time then flip the index, so that + * calc_load_write_idx() will see the new time when it reads the new + * index, this avoids a double flip messing things up. + */ + smp_wmb(); + calc_load_idx++; +} +#else /* !CONFIG_NO_HZ_COMMON */ + +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } + +#endif /* CONFIG_NO_HZ_COMMON */ + +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + * + * Called from the global timer code. + */ +void calc_global_load(unsigned long ticks) +{ + long active, delta; + + if (time_before(jiffies, calc_load_update + 10)) + return; + + /* + * Fold the 'old' idle-delta to include all NO_HZ cpus. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; + + /* + * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. + */ + calc_global_nohz(); +} + +/* + * Called from scheduler_tick() to periodically update this CPU's + * active count. + */ +void calc_global_load_tick(struct rq *this_rq) +{ + long delta; + + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + delta = calc_load_fold_active(this_rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + this_rq->calc_load_update += LOAD_FREQ; +} diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c deleted file mode 100644 index 8ecd552fe4f2..000000000000 --- a/kernel/sched/proc.c +++ /dev/null @@ -1,584 +0,0 @@ -/* - * kernel/sched/proc.c - * - * Kernel load calculations, forked from sched/core.c - */ - -#include - -#include "sched.h" - -/* - * Global load-average calculations - * - * We take a distributed and async approach to calculating the global load-avg - * in order to minimize overhead. - * - * The global load average is an exponentially decaying average of nr_running + - * nr_uninterruptible. - * - * Once every LOAD_FREQ: - * - * nr_active = 0; - * for_each_possible_cpu(cpu) - * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; - * - * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) - * - * Due to a number of reasons the above turns in the mess below: - * - * - for_each_possible_cpu() is prohibitively expensive on machines with - * serious number of cpus, therefore we need to take a distributed approach - * to calculating nr_active. - * - * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 - * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } - * - * So assuming nr_active := 0 when we start out -- true per definition, we - * can simply take per-cpu deltas and fold those into a global accumulate - * to obtain the same result. See calc_load_fold_active(). - * - * Furthermore, in order to avoid synchronizing all per-cpu delta folding - * across the machine, we assume 10 ticks is sufficient time for every - * cpu to have completed this task. - * - * This places an upper-bound on the IRQ-off latency of the machine. Then - * again, being late doesn't loose the delta, just wrecks the sample. - * - * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because - * this would add another cross-cpu cacheline miss and atomic operation - * to the wakeup path. Instead we increment on whatever cpu the task ran - * when it went into uninterruptible state and decrement on whatever cpu - * did the wakeup. This means that only the sum of nr_uninterruptible over - * all cpus yields the correct result. - * - * This covers the NO_HZ=n code, for extra head-aches, see the comment below. - */ - -/* Variables and functions for calc_load */ -atomic_long_t calc_load_tasks; -unsigned long calc_load_update; -unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); /* should be removed */ - -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} - -long calc_load_fold_active(struct rq *this_rq) -{ - long nr_active, delta = 0; - - nr_active = this_rq->nr_running; - nr_active += (long) this_rq->nr_uninterruptible; - - if (nr_active != this_rq->calc_load_active) { - delta = nr_active - this_rq->calc_load_active; - this_rq->calc_load_active = nr_active; - } - - return delta; -} - -/* - * a1 = a0 * e + a * (1 - e) - */ -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - load *= exp; - load += active * (FIXED_1 - exp); - load += 1UL << (FSHIFT - 1); - return load >> FSHIFT; -} - -#ifdef CONFIG_NO_HZ_COMMON -/* - * Handle NO_HZ for the global load-average. - * - * Since the above described distributed algorithm to compute the global - * load-average relies on per-cpu sampling from the tick, it is affected by - * NO_HZ. - * - * The basic idea is to fold the nr_active delta into a global idle-delta upon - * entering NO_HZ state such that we can include this as an 'extra' cpu delta - * when we read the global state. - * - * Obviously reality has to ruin such a delightfully simple scheme: - * - * - When we go NO_HZ idle during the window, we can negate our sample - * contribution, causing under-accounting. - * - * We avoid this by keeping two idle-delta counters and flipping them - * when the window starts, thus separating old and new NO_HZ load. - * - * The only trick is the slight shift in index flip for read vs write. - * - * 0s 5s 10s 15s - * +10 +10 +10 +10 - * |-|-----------|-|-----------|-|-----------|-| - * r:0 0 1 1 0 0 1 1 0 - * w:0 1 1 0 0 1 1 0 0 - * - * This ensures we'll fold the old idle contribution in this window while - * accumlating the new one. - * - * - When we wake up from NO_HZ idle during the window, we push up our - * contribution, since we effectively move our sample point to a known - * busy state. - * - * This is solved by pushing the window forward, and thus skipping the - * sample, for this cpu (effectively using the idle-delta for this cpu which - * was in effect at the time the window opened). This also solves the issue - * of having to deal with a cpu having been in NOHZ idle for multiple - * LOAD_FREQ intervals. - * - * When making the ILB scale, we should try to pull this in as well. - */ -static atomic_long_t calc_load_idle[2]; -static int calc_load_idx; - -static inline int calc_load_write_idx(void) -{ - int idx = calc_load_idx; - - /* - * See calc_global_nohz(), if we observe the new index, we also - * need to observe the new update time. - */ - smp_rmb(); - - /* - * If the folding window started, make sure we start writing in the - * next idle-delta. - */ - if (!time_before(jiffies, calc_load_update)) - idx++; - - return idx & 1; -} - -static inline int calc_load_read_idx(void) -{ - return calc_load_idx & 1; -} - -void calc_load_enter_idle(void) -{ - struct rq *this_rq = this_rq(); - long delta; - - /* - * We're going into NOHZ mode, if there's any pending delta, fold it - * into the pending idle delta. - */ - delta = calc_load_fold_active(this_rq); - if (delta) { - int idx = calc_load_write_idx(); - atomic_long_add(delta, &calc_load_idle[idx]); - } -} - -void calc_load_exit_idle(void) -{ - struct rq *this_rq = this_rq(); - - /* - * If we're still before the sample window, we're done. - */ - if (time_before(jiffies, this_rq->calc_load_update)) - return; - - /* - * We woke inside or after the sample window, this means we're already - * accounted through the nohz accounting, so skip the entire deal and - * sync up for the next window. - */ - this_rq->calc_load_update = calc_load_update; - if (time_before(jiffies, this_rq->calc_load_update + 10)) - this_rq->calc_load_update += LOAD_FREQ; -} - -static long calc_load_fold_idle(void) -{ - int idx = calc_load_read_idx(); - long delta = 0; - - if (atomic_long_read(&calc_load_idle[idx])) - delta = atomic_long_xchg(&calc_load_idle[idx], 0); - - return delta; -} - -/** - * fixed_power_int - compute: x^n, in O(log n) time - * - * @x: base of the power - * @frac_bits: fractional bits of @x - * @n: power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. - */ -static unsigned long -fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -{ - unsigned long result = 1UL << frac_bits; - - if (n) for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; - } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; - } - - return result; -} - -/* - * a1 = a0 * e + a * (1 - e) - * - * a2 = a1 * e + a * (1 - e) - * = (a0 * e + a * (1 - e)) * e + a * (1 - e) - * = a0 * e^2 + a * (1 - e) * (1 + e) - * - * a3 = a2 * e + a * (1 - e) - * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) - * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) - * - * ... - * - * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] - * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) - * = a0 * e^n + a * (1 - e^n) - * - * [1] application of the geometric series: - * - * n 1 - x^(n+1) - * S_n := \Sum x^i = ------------- - * i=0 1 - x - */ -static unsigned long -calc_load_n(unsigned long load, unsigned long exp, - unsigned long active, unsigned int n) -{ - - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -} - -/* - * NO_HZ can leave us missing all per-cpu ticks calling - * calc_load_account_active(), but since an idle CPU folds its delta into - * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold - * in the pending idle delta if our idle period crossed a load cycle boundary. - * - * Once we've updated the global active value, we need to apply the exponential - * weights adjusted to the number of cycles missed. - */ -static void calc_global_nohz(void) -{ - long delta, active, n; - - if (!time_before(jiffies, calc_load_update + 10)) { - /* - * Catch-up, fold however many we are behind still - */ - delta = jiffies - calc_load_update - 10; - n = 1 + (delta / LOAD_FREQ); - - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; - - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - - calc_load_update += n * LOAD_FREQ; - } - - /* - * Flip the idle index... - * - * Make sure we first write the new time then flip the index, so that - * calc_load_write_idx() will see the new time when it reads the new - * index, this avoids a double flip messing things up. - */ - smp_wmb(); - calc_load_idx++; -} -#else /* !CONFIG_NO_HZ_COMMON */ - -static inline long calc_load_fold_idle(void) { return 0; } -static inline void calc_global_nohz(void) { } - -#endif /* CONFIG_NO_HZ_COMMON */ - -/* - * calc_load - update the avenrun load estimates 10 ticks after the - * CPUs have updated calc_load_tasks. - */ -void calc_global_load(unsigned long ticks) -{ - long active, delta; - - if (time_before(jiffies, calc_load_update + 10)) - return; - - /* - * Fold the 'old' idle-delta to include all NO_HZ cpus. - */ - delta = calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; - - avenrun[0] = calc_load(avenrun[0], EXP_1, active); - avenrun[1] = calc_load(avenrun[1], EXP_5, active); - avenrun[2] = calc_load(avenrun[2], EXP_15, active); - - calc_load_update += LOAD_FREQ; - - /* - * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. - */ - calc_global_nohz(); -} - -/* - * Called from update_cpu_load() to periodically update this CPU's - * active count. - */ -static void calc_load_account_active(struct rq *this_rq) -{ - long delta; - - if (time_before(jiffies, this_rq->calc_load_update)) - return; - - delta = calc_load_fold_active(this_rq); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - this_rq->calc_load_update += LOAD_FREQ; -} - -/* - * End of global load-average stuff - */ - -/* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load - * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load - * - * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load - * - * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. - */ -#define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; - -/* - * Update cpu_load for any missed ticks, due to tickless idle. The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. - */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ - int j = 0; - - if (!missed_updates) - return load; - - if (missed_updates >= degrade_zero_ticks[idx]) - return 0; - - if (idx == 1) - return load >> missed_updates; - - while (missed_updates) { - if (missed_updates % 2) - load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - - missed_updates >>= 1; - j++; - } - return load; -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. - */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) -{ - int i, scale; - - this_rq->nr_load_updates++; - - /* Update our load: */ - this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ - for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - old_load = decay_load_missed(old_load, pending_updates - 1, i); - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale - 1; - - this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; - } - - sched_avg_update(this_rq); -} - -#ifdef CONFIG_SMP -static inline unsigned long get_rq_runnable_load(struct rq *rq) -{ - return rq->cfs.runnable_load_avg; -} -#else -static inline unsigned long get_rq_runnable_load(struct rq *rq) -{ - return rq->load.weight; -} -#endif - -#ifdef CONFIG_NO_HZ_COMMON -/* - * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we cannot use the delta approach from the regular tick since that - * would seriously skew the load calculation. However we'll make do for those - * updates happening while idle (nohz_idle_balance) or coming out of idle - * (tick_nohz_idle_exit). - * - * This means we might still be one tick off for nohz periods. - */ - -/* - * Called from nohz_idle_balance() to update the load ratings before doing the - * idle balance. - */ -void update_idle_cpu_load(struct rq *this_rq) -{ - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long load = get_rq_runnable_load(this_rq); - unsigned long pending_updates; - - /* - * bail if there's load or we're actually up-to-date. - */ - if (load || curr_jiffies == this_rq->last_load_update_tick) - return; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load, pending_updates); -} - -/* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. - */ -void update_cpu_load_nohz(void) -{ - struct rq *this_rq = this_rq(); - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long pending_updates; - - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * We were idle, this means load 0, the current load might be - * !0 due to remote wakeups and the sort. - */ - __update_cpu_load(this_rq, 0, pending_updates); - } - raw_spin_unlock(&this_rq->lock); -} -#endif /* CONFIG_NO_HZ */ - -/* - * Called from scheduler_tick() - */ -void update_cpu_load_active(struct rq *this_rq) -{ - unsigned long load = get_rq_runnable_load(this_rq); - /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). - */ - this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1); - - calc_load_account_active(this_rq); -} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e0e129993958..09ed26a89f31 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running; extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; +extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq); + +#ifdef CONFIG_SMP extern void update_cpu_load_active(struct rq *this_rq); +#else +static inline void update_cpu_load_active(struct rq *this_rq) { } +#endif /* * Helpers for converting nanosecond timing to jiffy resolution @@ -1298,8 +1304,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); -extern void update_idle_cpu_load(struct rq *this_rq); - extern void init_task_runnable_average(struct task_struct *p); static inline void add_nr_running(struct rq *rq, unsigned count) -- cgit v1.2.3 From b76808e6808e34e7e78131d2b8cb0535622b8e9f Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 30 Apr 2015 21:19:57 -0700 Subject: signals, sched: Change all uses of JOBCTL_* from 'int' to 'long' c56fb6564dcd ("Fix a misaligned load inside ptrace_attach()") makes jobctl an "unsigned long". It makes sense to have the masks applied to it match that type. This is currently just a cosmetic change, but it will prevent the mask from being unexpectedly truncated if we ever end up with masks with more bits. One instance of "signr" is an int, but I left this alone because the mask ensures that it will never overflow. Signed-off-by: Palmer Dabbelt Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chris Metcalf Cc: Andrew Morton Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: richard@nod.at Cc: vdavydov@parallels.com Link: http://lkml.kernel.org/r/1430453997-32459-4-git-send-email-palmer@dabbelt.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 18 +++++++++--------- kernel/signal.c | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 85cf253bc366..4f066cb625ad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2082,22 +2082,22 @@ TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ -#define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT) -#define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT) -#define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT) -#define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT) -#define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT) -#define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT) -#define JOBCTL_LISTENING (1 << JOBCTL_LISTENING_BIT) +#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) +#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) +#define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT) +#define JOBCTL_TRAP_STOP (1UL << JOBCTL_TRAP_STOP_BIT) +#define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT) +#define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) +#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) extern bool task_set_jobctl_pending(struct task_struct *task, - unsigned int mask); + unsigned long mask); extern void task_clear_jobctl_trapping(struct task_struct *task); extern void task_clear_jobctl_pending(struct task_struct *task, - unsigned int mask); + unsigned long mask); static inline void rcu_copy_process(struct task_struct *p) { diff --git a/kernel/signal.c b/kernel/signal.c index d51c5ddd855c..f19833b5db3c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -245,7 +245,7 @@ static inline void print_dropped_signal(int sig) * RETURNS: * %true if @mask is set, %false if made noop because @task was dying. */ -bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask) +bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask) { BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); @@ -297,7 +297,7 @@ void task_clear_jobctl_trapping(struct task_struct *task) * CONTEXT: * Must be called with @task->sighand->siglock held. */ -void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask) +void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask) { BUG_ON(mask & ~JOBCTL_PENDING_MASK); @@ -2000,7 +2000,7 @@ static bool do_signal_stop(int signr) struct signal_struct *sig = current->signal; if (!(current->jobctl & JOBCTL_STOP_PENDING)) { - unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; + unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; struct task_struct *t; /* signr will be recorded in task->jobctl for retries */ -- cgit v1.2.3 From 7e60598785f30cf3dc9e476cc0fc3feeb37a0c63 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 30 Apr 2015 21:19:56 -0700 Subject: sched/wait: Change wait_on_bit*() to take an unsigned long *, not a void * The implementations of wait_on_bit*() will only work with long-aligned memory on systems that don't support misaligned loads and stores. This patch changes the function prototypes to ensure that the compiler will enforce alignment. Running make defconfig make KFLAGS="-Werror" seems to indicate that, as of c56fb6564dcd ("Fix a misaligned load inside ptrace_attach()"), there are now no users of non-long-aligned calls to wait_on_bit*(). I additionally tried a few "make randconfig" attempts, none of which failed to compile for this reason. Signed-off-by: Palmer Dabbelt Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chris Metcalf Cc: Andrew Morton Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: richard@nod.at Cc: vdavydov@parallels.com Link: http://lkml.kernel.org/r/1430453997-32459-3-git-send-email-palmer@dabbelt.com Signed-off-by: Ingo Molnar --- include/linux/wait.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 2db83349865b..d69ac4ecc88b 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -969,7 +969,7 @@ extern int bit_wait_io_timeout(struct wait_bit_key *); * on that signal. */ static inline int -wait_on_bit(void *word, int bit, unsigned mode) +wait_on_bit(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit(bit, word)) @@ -994,7 +994,7 @@ wait_on_bit(void *word, int bit, unsigned mode) * on that signal. */ static inline int -wait_on_bit_io(void *word, int bit, unsigned mode) +wait_on_bit_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit(bit, word)) @@ -1020,7 +1020,8 @@ wait_on_bit_io(void *word, int bit, unsigned mode) * received a signal and the mode permitted wakeup on that signal. */ static inline int -wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout) +wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, + unsigned long timeout) { might_sleep(); if (!test_bit(bit, word)) @@ -1047,7 +1048,8 @@ wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout) * on that signal. */ static inline int -wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) +wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, + unsigned mode) { might_sleep(); if (!test_bit(bit, word)) @@ -1075,7 +1077,7 @@ wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode * the @mode allows that signal to wake the process. */ static inline int -wait_on_bit_lock(void *word, int bit, unsigned mode) +wait_on_bit_lock(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) @@ -1099,7 +1101,7 @@ wait_on_bit_lock(void *word, int bit, unsigned mode) * the @mode allows that signal to wake the process. */ static inline int -wait_on_bit_lock_io(void *word, int bit, unsigned mode) +wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) @@ -1125,7 +1127,8 @@ wait_on_bit_lock_io(void *word, int bit, unsigned mode) * the @mode allows that signal to wake the process. */ static inline int -wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) +wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action, + unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) -- cgit v1.2.3 From e7cc4173115347bcdaa5de2824dd46ef2c58425f Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 30 Apr 2015 21:19:55 -0700 Subject: signals, ptrace, sched: Fix a misaligned load inside ptrace_attach() The misaligned load exception arises when running ptrace_attach() on the RISC-V (which hasn't been upstreamed yet). The problem is that wait_on_bit() takes a void* but then proceeds to call test_bit(), which takes a long*. This allows an int-aligned pointer to be passed to test_bit(), which promptly fails. This will manifest on any other asm-generic port where unaligned loads trap, where sizeof(long) > sizeof(int), and where task_struct.jobctl ends up not being long-aligned. This patch changes task_struct.jobctl to be a long, which ensures it has the correct alignment. Signed-off-by: Palmer Dabbelt Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chris Metcalf Cc: Andrew Morton Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: richard@nod.at Cc: vdavydov@parallels.com Link: http://lkml.kernel.org/r/1430453997-32459-2-git-send-email-palmer@dabbelt.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4f066cb625ad..fb650a2f4a73 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1374,7 +1374,7 @@ struct task_struct { int exit_state; int exit_code, exit_signal; int pdeath_signal; /* The signal sent when the parent dies */ - unsigned int jobctl; /* JOBCTL_*, siglock protected */ + unsigned long jobctl; /* JOBCTL_*, siglock protected */ /* Used for emulating ABI behavior of previous Linux versions */ unsigned int personality; -- cgit v1.2.3 From 316c1608d15c736439d4065ed12f306db554b3da Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Apr 2015 13:00:20 -0700 Subject: sched, timer: Convert usages of ACCESS_ONCE() in the scheduler to READ_ONCE()/WRITE_ONCE() ACCESS_ONCE doesn't work reliably on non-scalar types. This patch removes the rest of the existing usages of ACCESS_ONCE() in the scheduler, and use the new READ_ONCE() and WRITE_ONCE() APIs as appropriate. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Acked-by: Rik van Riel Acked-by: Waiman Long Cc: Andrew Morton Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Preeti U Murthy Cc: Scott J Norton Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1430251224-5764-2-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++-- kernel/fork.c | 2 +- kernel/sched/auto_group.c | 2 +- kernel/sched/auto_group.h | 2 +- kernel/sched/core.c | 4 ++-- kernel/sched/cputime.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 18 +++++++++--------- kernel/sched/rt.c | 2 +- kernel/sched/sched.h | 2 +- kernel/sched/wait.c | 4 ++-- kernel/time/posix-cpu-timers.c | 8 ++++---- 12 files changed, 26 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index fb650a2f4a73..d70910355b20 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3085,13 +3085,13 @@ static inline void mm_update_next_owner(struct mm_struct *mm) static inline unsigned long task_rlimit(const struct task_struct *tsk, unsigned int limit) { - return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur); + return READ_ONCE(tsk->signal->rlim[limit].rlim_cur); } static inline unsigned long task_rlimit_max(const struct task_struct *tsk, unsigned int limit) { - return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max); + return READ_ONCE(tsk->signal->rlim[limit].rlim_max); } static inline unsigned long rlimit(unsigned int limit) diff --git a/kernel/fork.c b/kernel/fork.c index 03c1eaaa6ef5..47c37a411a62 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1094,7 +1094,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) /* Thread group counters. */ thread_group_cputime_init(sig); - cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (cpu_limit != RLIM_INFINITY) { sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); sig->cputimer.running = 1; diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 1a3b58d531b2..750ed601ddf7 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -139,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); - if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) + if (!READ_ONCE(sysctl_sched_autogroup_enabled)) goto out; for_each_thread(p, t) diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h index 8bd047142816..890c95f2587a 100644 --- a/kernel/sched/auto_group.h +++ b/kernel/sched/auto_group.h @@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { - int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + int enabled = READ_ONCE(sysctl_sched_autogroup_enabled); if (enabled && task_wants_autogroup(p, tg)) return p->signal->autogroup->tg; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 46a5d6f05208..22b53c863ef3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -511,7 +511,7 @@ static bool set_nr_and_not_polling(struct task_struct *p) static bool set_nr_if_polling(struct task_struct *p) { struct thread_info *ti = task_thread_info(p); - typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); + typeof(ti->flags) old, val = READ_ONCE(ti->flags); for (;;) { if (!(val & _TIF_POLLING_NRFLAG)) @@ -2526,7 +2526,7 @@ void scheduler_tick(void) u64 scheduler_tick_max_deferment(void) { struct rq *rq = this_rq(); - unsigned long next, now = ACCESS_ONCE(jiffies); + unsigned long next, now = READ_ONCE(jiffies); next = rq->last_sched_tick + HZ; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8394b1ee600c..f5a64ffad176 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new) { cputime_t old; - while (new > (old = ACCESS_ONCE(*counter))) + while (new > (old = READ_ONCE(*counter))) cmpxchg_cputime(counter, old, new); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5e95145088fd..890ce951c717 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -995,7 +995,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) rq = cpu_rq(cpu); rcu_read_lock(); - curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + curr = READ_ONCE(rq->curr); /* unlocked access */ /* * If we are dealing with a -deadline task, we must diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4bc6013886ec..d6915a038d8a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -834,7 +834,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) static unsigned int task_scan_min(struct task_struct *p) { - unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); + unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size); unsigned int scan, floor; unsigned int windows = 1; @@ -1794,7 +1794,7 @@ static void task_numa_placement(struct task_struct *p) u64 runtime, period; spinlock_t *group_lock = NULL; - seq = ACCESS_ONCE(p->mm->numa_scan_seq); + seq = READ_ONCE(p->mm->numa_scan_seq); if (p->numa_scan_seq == seq) return; p->numa_scan_seq = seq; @@ -1938,7 +1938,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, } rcu_read_lock(); - tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); + tsk = READ_ONCE(cpu_rq(cpu)->curr); if (!cpupid_match_pid(tsk, cpupid)) goto no_join; @@ -2107,7 +2107,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) static void reset_ptenuma_scan(struct task_struct *p) { - ACCESS_ONCE(p->mm->numa_scan_seq)++; + WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1); p->mm->numa_scan_offset = 0; } @@ -4451,7 +4451,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, */ static void update_idle_cpu_load(struct rq *this_rq) { - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long curr_jiffies = READ_ONCE(jiffies); unsigned long load = this_rq->cfs.runnable_load_avg; unsigned long pending_updates; @@ -4473,7 +4473,7 @@ static void update_idle_cpu_load(struct rq *this_rq) void update_cpu_load_nohz(void) { struct rq *this_rq = this_rq(); - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long curr_jiffies = READ_ONCE(jiffies); unsigned long pending_updates; if (curr_jiffies == this_rq->last_load_update_tick) @@ -4558,7 +4558,7 @@ static unsigned long capacity_orig_of(int cpu) static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); + unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); unsigned long load_avg = rq->cfs.runnable_load_avg; if (nr_running) @@ -6220,8 +6220,8 @@ static unsigned long scale_rt_capacity(int cpu) * Since we're reading these variables without serialization make sure * we read them once before doing sanity checks on them. */ - age_stamp = ACCESS_ONCE(rq->age_stamp); - avg = ACCESS_ONCE(rq->rt_avg); + age_stamp = READ_ONCE(rq->age_stamp); + avg = READ_ONCE(rq->rt_avg); delta = __rq_clock_broken(rq) - age_stamp; if (unlikely(delta < 0)) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 575da76a3874..560d2fa623c3 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1323,7 +1323,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) rq = cpu_rq(cpu); rcu_read_lock(); - curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + curr = READ_ONCE(rq->curr); /* unlocked access */ /* * If the current task on @p's runqueue is an RT task, then diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 09ed26a89f31..d85455539d5c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -713,7 +713,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); static inline u64 __rq_clock_broken(struct rq *rq) { - return ACCESS_ONCE(rq->clock); + return READ_ONCE(rq->clock); } static inline u64 rq_clock(struct rq *rq) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 852143a79f36..2ccec988d6b7 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -601,7 +601,7 @@ EXPORT_SYMBOL(bit_wait_io); __sched int bit_wait_timeout(struct wait_bit_key *word) { - unsigned long now = ACCESS_ONCE(jiffies); + unsigned long now = READ_ONCE(jiffies); if (signal_pending_state(current->state, current)) return 1; if (time_after_eq(now, word->timeout)) @@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout); __sched int bit_wait_io_timeout(struct wait_bit_key *word) { - unsigned long now = ACCESS_ONCE(jiffies); + unsigned long now = READ_ONCE(jiffies); if (signal_pending_state(current->state, current)) return 1; if (time_after_eq(now, word->timeout)) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0075da74abf0..e072d982f64c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -852,10 +852,10 @@ static void check_thread_timers(struct task_struct *tsk, /* * Check for the special case thread timers. */ - soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); + soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); if (soft != RLIM_INFINITY) { unsigned long hard = - ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); + READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); if (hard != RLIM_INFINITY && tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { @@ -958,11 +958,11 @@ static void check_process_timers(struct task_struct *tsk, SIGPROF); check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, SIGVTALRM); - soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (soft != RLIM_INFINITY) { unsigned long psecs = cputime_to_secs(ptime); unsigned long hard = - ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); + READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); cputime_t x; if (psecs >= hard) { /* -- cgit v1.2.3 From 1018016c706f7ff9f56fde3a649789c47085a293 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Apr 2015 13:00:22 -0700 Subject: sched, timer: Replace spinlocks with atomics in thread_group_cputimer(), to improve scalability While running a database workload, we found a scalability issue with itimers. Much of the problem was caused by the thread_group_cputimer spinlock. Each time we account for group system/user time, we need to obtain a thread_group_cputimer's spinlock to update the timers. On larger systems (such as a 16 socket machine), this caused more than 30% of total time spent trying to obtain this kernel lock to update these group timer stats. This patch converts the timers to 64-bit atomic variables and use atomic add to update them without a lock. With this patch, the percent of total time spent updating thread group cputimer timers was reduced from 30% down to less than 1%. Note: On 32-bit systems using the generic 64-bit atomics, this causes sample_group_cputimer() to take locks 3 times instead of just 1 time. However, we tested this patch on a 32-bit system ARM system using the generic atomics and did not find the overhead to be much of an issue. An explanation for why this isn't an issue is that 32-bit systems usually have small numbers of CPUs, and cacheline contention from extra spinlocks called periodically is not really apparent on smaller systems. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Andrew Morton Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Scott J Norton Cc: Steven Rostedt Cc: Waiman Long Link: http://lkml.kernel.org/r/1430251224-5764-4-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 7 ++-- include/linux/sched.h | 10 ++---- kernel/fork.c | 3 -- kernel/sched/stats.h | 15 +++----- kernel/time/posix-cpu-timers.c | 79 ++++++++++++++++++++++++++---------------- 5 files changed, 62 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 696d22312b31..7b9d8b59e7bf 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -50,9 +50,10 @@ extern struct fs_struct init_fs; .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ .rlim = INIT_RLIMITS, \ .cputimer = { \ - .cputime = INIT_CPUTIME, \ - .running = 0, \ - .lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ + .utime = ATOMIC64_INIT(0), \ + .stime = ATOMIC64_INIT(0), \ + .sum_exec_runtime = ATOMIC64_INIT(0), \ + .running = 0 \ }, \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index d70910355b20..a45874c3fab6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -598,9 +598,10 @@ struct task_cputime { * used for thread group CPU timer calculations. */ struct thread_group_cputimer { - struct task_cputime cputime; + atomic64_t utime; + atomic64_t stime; + atomic64_t sum_exec_runtime; int running; - raw_spinlock_t lock; }; #include @@ -2967,11 +2968,6 @@ static __always_inline bool need_resched(void) void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); -static inline void thread_group_cputime_init(struct signal_struct *sig) -{ - raw_spin_lock_init(&sig->cputimer.lock); -} - /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. diff --git a/kernel/fork.c b/kernel/fork.c index 47c37a411a62..2e670864174f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1091,9 +1091,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) { unsigned long cpu_limit; - /* Thread group counters. */ - thread_group_cputime_init(sig); - cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (cpu_limit != RLIM_INFINITY) { sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 4ab704339656..c6d1c7da3ea5 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + /* Check if cputimer isn't running. This is accessed without locking. */ + if (!READ_ONCE(cputimer->running)) return false; /* @@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk, if (!cputimer_running(tsk)) return; - raw_spin_lock(&cputimer->lock); - cputimer->cputime.utime += cputime; - raw_spin_unlock(&cputimer->lock); + atomic64_add(cputime, &cputimer->utime); } /** @@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk, if (!cputimer_running(tsk)) return; - raw_spin_lock(&cputimer->lock); - cputimer->cputime.stime += cputime; - raw_spin_unlock(&cputimer->lock); + atomic64_add(cputime, &cputimer->stime); } /** @@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, if (!cputimer_running(tsk)) return; - raw_spin_lock(&cputimer->lock); - cputimer->cputime.sum_exec_runtime += ns; - raw_spin_unlock(&cputimer->lock); + atomic64_add(ns, &cputimer->sum_exec_runtime); } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index e072d982f64c..d85730669410 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, return 0; } -static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) +/* + * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg + * to avoid race conditions with concurrent updates to cputime. + */ +static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime) { - if (b->utime > a->utime) - a->utime = b->utime; + u64 curr_cputime; +retry: + curr_cputime = atomic64_read(cputime); + if (sum_cputime > curr_cputime) { + if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime) + goto retry; + } +} - if (b->stime > a->stime) - a->stime = b->stime; +static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum) +{ + __update_gt_cputime(&cputimer->utime, sum->utime); + __update_gt_cputime(&cputimer->stime, sum->stime); + __update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime); +} - if (b->sum_exec_runtime > a->sum_exec_runtime) - a->sum_exec_runtime = b->sum_exec_runtime; +/* Sample thread_group_cputimer values in "cputimer", store results in "times". */ +static inline void sample_group_cputimer(struct task_cputime *times, + struct thread_group_cputimer *cputimer) +{ + times->utime = atomic64_read(&cputimer->utime); + times->stime = atomic64_read(&cputimer->stime); + times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime); } void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; struct task_cputime sum; - unsigned long flags; - if (!cputimer->running) { + /* Check if cputimer isn't running. This is accessed without locking. */ + if (!READ_ONCE(cputimer->running)) { /* * The POSIX timer interface allows for absolute time expiry * values through the TIMER_ABSTIME flag, therefore we have - * to synchronize the timer to the clock every time we start - * it. + * to synchronize the timer to the clock every time we start it. */ thread_group_cputime(tsk, &sum); - raw_spin_lock_irqsave(&cputimer->lock, flags); - cputimer->running = 1; - update_gt_cputime(&cputimer->cputime, &sum); - } else - raw_spin_lock_irqsave(&cputimer->lock, flags); - *times = cputimer->cputime; - raw_spin_unlock_irqrestore(&cputimer->lock, flags); + update_gt_cputime(cputimer, &sum); + + /* + * We're setting cputimer->running without a lock. Ensure + * this only gets written to in one operation. We set + * running after update_gt_cputime() as a small optimization, + * but barriers are not required because update_gt_cputime() + * can handle concurrent updates. + */ + WRITE_ONCE(cputimer->running, 1); + } + sample_group_cputimer(times, cputimer); } /* @@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) if (!task_cputime_zero(&tsk->cputime_expires)) return false; - if (tsk->signal->cputimer.running) + /* Check if cputimer is running. This is accessed without locking. */ + if (READ_ONCE(tsk->signal->cputimer.running)) return false; return true; @@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk, } } -static void stop_process_timers(struct signal_struct *sig) +static inline void stop_process_timers(struct signal_struct *sig) { struct thread_group_cputimer *cputimer = &sig->cputimer; - unsigned long flags; - raw_spin_lock_irqsave(&cputimer->lock, flags); - cputimer->running = 0; - raw_spin_unlock_irqrestore(&cputimer->lock, flags); + /* Turn off cputimer->running. This is done without locking. */ + WRITE_ONCE(cputimer->running, 0); } static u32 onecputick; @@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk) } sig = tsk->signal; - if (sig->cputimer.running) { + /* Check if cputimer is running. This is accessed without locking. */ + if (READ_ONCE(sig->cputimer.running)) { struct task_cputime group_sample; - raw_spin_lock(&sig->cputimer.lock); - group_sample = sig->cputimer.cputime; - raw_spin_unlock(&sig->cputimer.lock); + sample_group_cputimer(&group_sample, &sig->cputimer); if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; @@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) * If there are any active process wide timers (POSIX 1.b, itimers, * RLIMIT_CPU) cputimer must be running. */ - if (tsk->signal->cputimer.running) + if (READ_ONCE(tsk->signal->cputimer.running)) check_process_timers(tsk, &firing); /* -- cgit v1.2.3 From 971e8a985482c76487edb5a49811e99b96e846e1 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Apr 2015 13:00:23 -0700 Subject: sched, timer: Provide an atomic 'struct task_cputime' data structure This patch adds an atomic variant of the 'struct task_cputime' data structure, which can be used to store and update task_cputime statistics without needing to do locking. Suggested-by: Ingo Molnar Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Andrew Morton Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Scott J Norton Cc: Steven Rostedt Cc: Waiman Long Link: http://lkml.kernel.org/r/1430251224-5764-5-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index a45874c3fab6..6eb78cd45da7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -572,6 +572,23 @@ struct task_cputime { .sum_exec_runtime = 0, \ } +/* + * This is the atomic variant of task_cputime, which can be used for + * storing and updating task_cputime statistics without locking. + */ +struct task_cputime_atomic { + atomic64_t utime; + atomic64_t stime; + atomic64_t sum_exec_runtime; +}; + +#define INIT_CPUTIME_ATOMIC \ + (struct task_cputime_atomic) { \ + .utime = ATOMIC64_INIT(0), \ + .stime = ATOMIC64_INIT(0), \ + .sum_exec_runtime = ATOMIC64_INIT(0), \ + } + #ifdef CONFIG_PREEMPT_COUNT #define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) #else -- cgit v1.2.3 From 7110744516276e906f9197e2857d026eb2343393 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Apr 2015 13:00:24 -0700 Subject: sched, timer: Use the atomic task_cputime in thread_group_cputimer Recent optimizations were made to thread_group_cputimer to improve its scalability by keeping track of cputime stats without a lock. However, the values were open coded to the structure, causing them to be at a different abstraction level from the regular task_cputime structure. Furthermore, any subsequent similar optimizations would not be able to share the new code, since they are specific to thread_group_cputimer. This patch adds the new task_cputime_atomic data structure (introduced in the previous patch in the series) to thread_group_cputimer for keeping track of the cputime atomically, which also helps generalize the code. Suggested-by: Ingo Molnar Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Andrew Morton Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Scott J Norton Cc: Steven Rostedt Cc: Waiman Long Link: http://lkml.kernel.org/r/1430251224-5764-6-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 6 ++---- include/linux/sched.h | 4 +--- kernel/sched/stats.h | 6 +++--- kernel/time/posix-cpu-timers.c | 26 +++++++++++++------------- 4 files changed, 19 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 7b9d8b59e7bf..bb9b075f0eb0 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -50,10 +50,8 @@ extern struct fs_struct init_fs; .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ .rlim = INIT_RLIMITS, \ .cputimer = { \ - .utime = ATOMIC64_INIT(0), \ - .stime = ATOMIC64_INIT(0), \ - .sum_exec_runtime = ATOMIC64_INIT(0), \ - .running = 0 \ + .cputime_atomic = INIT_CPUTIME_ATOMIC, \ + .running = 0, \ }, \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 6eb78cd45da7..4adc536a3b03 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -615,9 +615,7 @@ struct task_cputime_atomic { * used for thread group CPU timer calculations. */ struct thread_group_cputimer { - atomic64_t utime; - atomic64_t stime; - atomic64_t sum_exec_runtime; + struct task_cputime_atomic cputime_atomic; int running; }; diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index c6d1c7da3ea5..077ebbd5e10f 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -216,7 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk, if (!cputimer_running(tsk)) return; - atomic64_add(cputime, &cputimer->utime); + atomic64_add(cputime, &cputimer->cputime_atomic.utime); } /** @@ -237,7 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk, if (!cputimer_running(tsk)) return; - atomic64_add(cputime, &cputimer->stime); + atomic64_add(cputime, &cputimer->cputime_atomic.stime); } /** @@ -258,5 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, if (!cputimer_running(tsk)) return; - atomic64_add(ns, &cputimer->sum_exec_runtime); + atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime); } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index d85730669410..892e3dae0aac 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -211,20 +211,20 @@ retry: } } -static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum) +static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum) { - __update_gt_cputime(&cputimer->utime, sum->utime); - __update_gt_cputime(&cputimer->stime, sum->stime); - __update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime); + __update_gt_cputime(&cputime_atomic->utime, sum->utime); + __update_gt_cputime(&cputime_atomic->stime, sum->stime); + __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime); } -/* Sample thread_group_cputimer values in "cputimer", store results in "times". */ -static inline void sample_group_cputimer(struct task_cputime *times, - struct thread_group_cputimer *cputimer) +/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */ +static inline void sample_cputime_atomic(struct task_cputime *times, + struct task_cputime_atomic *atomic_times) { - times->utime = atomic64_read(&cputimer->utime); - times->stime = atomic64_read(&cputimer->stime); - times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime); + times->utime = atomic64_read(&atomic_times->utime); + times->stime = atomic64_read(&atomic_times->stime); + times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime); } void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) @@ -240,7 +240,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) * to synchronize the timer to the clock every time we start it. */ thread_group_cputime(tsk, &sum); - update_gt_cputime(cputimer, &sum); + update_gt_cputime(&cputimer->cputime_atomic, &sum); /* * We're setting cputimer->running without a lock. Ensure @@ -251,7 +251,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) */ WRITE_ONCE(cputimer->running, 1); } - sample_group_cputimer(times, cputimer); + sample_cputime_atomic(times, &cputimer->cputime_atomic); } /* @@ -1137,7 +1137,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (READ_ONCE(sig->cputimer.running)) { struct task_cputime group_sample; - sample_group_cputimer(&group_sample, &sig->cputimer); + sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; -- cgit v1.2.3 From 7675104990ed255b9315a82ae827ff312a2a88a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 May 2015 08:27:50 -0700 Subject: sched: Implement lockless wake-queues This is useful for locking primitives that can effect multiple wakeups per operation and want to avoid lock internal lock contention by delaying the wakeups until we've released the lock internal locks. Alternatively it can be used to avoid issuing multiple wakeups, and thus save a few cycles, in packet processing. Queue all target tasks and wakeup once you've processed all packets. That way you avoid waking the target task multiple times if there were multiple packets for the same task. Properties of a wake_q are: - Lockless, as queue head must reside on the stack. - Being a queue, maintains wakeup order passed by the callers. This can be important for otherwise, in scenarios where highly contended locks could affect any reliance on lock fairness. - A queued task cannot be added again until it is woken up. This patch adds the needed infrastructure into the scheduler code and uses the new wake_list to delay the futex wakeups until after we've released the hash bucket locks. Signed-off-by: Peter Zijlstra (Intel) [tweaks, adjustments, comments, etc.] Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Borislav Petkov Cc: Chris Mason Cc: Davidlohr Bueso Cc: George Spelvin Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Manfred Spraul Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1430494072-30283-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- include/linux/sched.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4adc536a3b03..254d88e80f65 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -920,6 +920,50 @@ enum cpu_idle_type { #define SCHED_CAPACITY_SHIFT 10 #define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) +/* + * Wake-queues are lists of tasks with a pending wakeup, whose + * callers have already marked the task as woken internally, + * and can thus carry on. A common use case is being able to + * do the wakeups once the corresponding user lock as been + * released. + * + * We hold reference to each task in the list across the wakeup, + * thus guaranteeing that the memory is still valid by the time + * the actual wakeups are performed in wake_up_q(). + * + * One per task suffices, because there's never a need for a task to be + * in two wake queues simultaneously; it is forbidden to abandon a task + * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is + * already in a wake queue, the wakeup will happen soon and the second + * waker can just skip it. + * + * The WAKE_Q macro declares and initializes the list head. + * wake_up_q() does NOT reinitialize the list; it's expected to be + * called near the end of a function, where the fact that the queue is + * not used again will be easy to see by inspection. + * + * Note that this can cause spurious wakeups. schedule() callers + * must ensure the call is done inside a loop, confirming that the + * wakeup condition has in fact occurred. + */ +struct wake_q_node { + struct wake_q_node *next; +}; + +struct wake_q_head { + struct wake_q_node *first; + struct wake_q_node **lastp; +}; + +#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01) + +#define WAKE_Q(name) \ + struct wake_q_head name = { WAKE_Q_TAIL, &name.first } + +extern void wake_q_add(struct wake_q_head *head, + struct task_struct *task); +extern void wake_up_q(struct wake_q_head *head); + /* * sched-domains (multiprocessor balancing) declarations: */ @@ -1532,6 +1576,8 @@ struct task_struct { /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; + struct wake_q_node wake_q; + #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task */ struct rb_root pi_waiters; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 22b53c863ef3..355f9538ca33 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -541,6 +541,52 @@ static bool set_nr_if_polling(struct task_struct *p) #endif #endif +void wake_q_add(struct wake_q_head *head, struct task_struct *task) +{ + struct wake_q_node *node = &task->wake_q; + + /* + * Atomically grab the task, if ->wake_q is !nil already it means + * its already queued (either by us or someone else) and will get the + * wakeup due to that. + * + * This cmpxchg() implies a full barrier, which pairs with the write + * barrier implied by the wakeup in wake_up_list(). + */ + if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) + return; + + get_task_struct(task); + + /* + * The head is context local, there can be no concurrency. + */ + *head->lastp = node; + head->lastp = &node->next; +} + +void wake_up_q(struct wake_q_head *head) +{ + struct wake_q_node *node = head->first; + + while (node != WAKE_Q_TAIL) { + struct task_struct *task; + + task = container_of(node, struct task_struct, wake_q); + BUG_ON(!task); + /* task can safely be re-inserted now */ + node = node->next; + task->wake_q.next = NULL; + + /* + * wake_up_process() implies a wmb() to pair with the queueing + * in wake_q_add() so as not to miss wakeups. + */ + wake_up_process(task); + put_task_struct(task); + } +} + /* * resched_curr - mark rq's current task 'to be rescheduled now'. * -- cgit v1.2.3 From ff303e66c240ba6269e31817a386995440a18c99 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Apr 2015 20:05:30 +0200 Subject: perf: Fix software migrate events Stephane asked about PERF_COUNT_SW_CPU_MIGRATIONS and I realized it was borken: > The problem is that the task isn't actually scheduled while its being > migrated (obviously), and if its not scheduled, the counters aren't > scheduled either, so there's no observing of the fact. > > A further problem with migrations is that many migrations happen from > softirq context, which is nested inside the 'random' task context of > whoemever happens to run at that time, similarly for the wakeup > migrations triggered from (soft)irq context. All those end up being > accounted in the task that's currently running, eg. your 'ls'. The below cures this by marking a task as migrated and accounting it on the subsequent sched_in(). Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 24 ++++++++++++++++++++++++ include/linux/sched.h | 7 ++++--- kernel/sched/core.c | 2 +- 3 files changed, 29 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 61992cf2e977..e86f85abeda7 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -798,11 +798,33 @@ perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) extern struct static_key_deferred perf_sched_events; +static __always_inline bool +perf_sw_migrate_enabled(void) +{ + if (static_key_false(&perf_swevent_enabled[PERF_COUNT_SW_CPU_MIGRATIONS])) + return true; + return false; +} + +static inline void perf_event_task_migrate(struct task_struct *task) +{ + if (perf_sw_migrate_enabled()) + task->sched_migrated = 1; +} + static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_in(prev, task); + + if (perf_sw_migrate_enabled() && task->sched_migrated) { + struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); + + perf_fetch_caller_regs(regs); + ___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0); + task->sched_migrated = 0; + } } static inline void perf_event_task_sched_out(struct task_struct *prev, @@ -925,6 +947,8 @@ perf_aux_output_skip(struct perf_output_handle *handle, static inline void * perf_get_aux(struct perf_output_handle *handle) { return NULL; } static inline void +perf_event_task_migrate(struct task_struct *task) { } +static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { } static inline void diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e6122734..2c5e6c3db654 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1356,9 +1356,6 @@ struct task_struct { #endif struct mm_struct *mm, *active_mm; -#ifdef CONFIG_COMPAT_BRK - unsigned brk_randomized:1; -#endif /* per-thread vma caching */ u32 vmacache_seqnum; struct vm_area_struct *vmacache[VMACACHE_SIZE]; @@ -1381,10 +1378,14 @@ struct task_struct { /* Revert to default priority/policy when forking */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; + unsigned sched_migrated:1; #ifdef CONFIG_MEMCG_KMEM unsigned memcg_kmem_skip_account:1; #endif +#ifdef CONFIG_COMPAT_BRK + unsigned brk_randomized:1; +#endif unsigned long atomic_flags; /* Flags needing atomic access. */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe22f7510bce..8652fd540780 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1049,7 +1049,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; - perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); + perf_event_task_migrate(p); } __set_task_cpu(p, new_cpu); -- cgit v1.2.3 From 59aabfc7e959f5f213e4e5cc7567ab4934da2adf Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 30 Apr 2015 17:12:16 -0400 Subject: locking/rwsem: Reduce spinlock contention in wakeup after up_read()/up_write() In up_write()/up_read(), rwsem_wake() will be called whenever it detects that some writers/readers are waiting. The rwsem_wake() function will take the wait_lock and call __rwsem_do_wake() to do the real wakeup. For a heavily contended rwsem, doing a spin_lock() on wait_lock will cause further contention on the heavily contended rwsem cacheline resulting in delay in the completion of the up_read/up_write operations. This patch makes the wait_lock taking and the call to __rwsem_do_wake() optional if at least one spinning writer is present. The spinning writer will be able to take the rwsem and call rwsem_wake() later when it calls up_write(). With the presence of a spinning writer, rwsem_wake() will now try to acquire the lock using trylock. If that fails, it will just quit. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Davidlohr Bueso Acked-by: Jason Low Cc: Andrew Morton Cc: Borislav Petkov Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1430428337-16802-2-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar --- include/linux/osq_lock.h | 5 +++++ kernel/locking/rwsem-xadd.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) (limited to 'include/linux') diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h index 3a6490e81b28..703ea5c30a33 100644 --- a/include/linux/osq_lock.h +++ b/include/linux/osq_lock.h @@ -32,4 +32,9 @@ static inline void osq_lock_init(struct optimistic_spin_queue *lock) extern bool osq_lock(struct optimistic_spin_queue *lock); extern void osq_unlock(struct optimistic_spin_queue *lock); +static inline bool osq_is_locked(struct optimistic_spin_queue *lock) +{ + return atomic_read(&lock->tail) != OSQ_UNLOCKED_VAL; +} + #endif diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 3417d0172a5d..0f189714e457 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -409,11 +409,24 @@ done: return taken; } +/* + * Return true if the rwsem has active spinner + */ +static inline bool rwsem_has_spinner(struct rw_semaphore *sem) +{ + return osq_is_locked(&sem->osq); +} + #else static bool rwsem_optimistic_spin(struct rw_semaphore *sem) { return false; } + +static inline bool rwsem_has_spinner(struct rw_semaphore *sem) +{ + return false; +} #endif /* @@ -496,7 +509,38 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; + /* + * If a spinner is present, it is not necessary to do the wakeup. + * Try to do wakeup only if the trylock succeeds to minimize + * spinlock contention which may introduce too much delay in the + * unlock operation. + * + * spinning writer up_write/up_read caller + * --------------- ----------------------- + * [S] osq_unlock() [L] osq + * MB RMB + * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock) + * + * Here, it is important to make sure that there won't be a missed + * wakeup while the rwsem is free and the only spinning writer goes + * to sleep without taking the rwsem. Even when the spinning writer + * is just going to break out of the waiting loop, it will still do + * a trylock in rwsem_down_write_failed() before sleeping. IOW, if + * rwsem_has_spinner() is true, it will guarantee at least one + * trylock attempt on the rwsem later on. + */ + if (rwsem_has_spinner(sem)) { + /* + * The smp_rmb() here is to make sure that the spinner + * state is consulted before reading the wait_lock. + */ + smp_rmb(); + if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags)) + return sem; + goto locked; + } raw_spin_lock_irqsave(&sem->wait_lock, flags); +locked: /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) -- cgit v1.2.3 From 663fdcbee0a656cdaef934e7f50e6c2670373bc9 Mon Sep 17 00:00:00 2001 From: Preeti U Murthy Date: Thu, 30 Apr 2015 17:27:21 +0530 Subject: kernel: Replace reference to ASSIGN_ONCE() with WRITE_ONCE() in comment Looks like commit : 43239cbe79fc ("kernel: Change ASSIGN_ONCE(val, x) to WRITE_ONCE(x, val)") left behind a reference to ASSIGN_ONCE(). Update this to WRITE_ONCE(). Signed-off-by: Preeti U Murthy Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: borntraeger@de.ibm.com Cc: dave@stgolabs.net Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/20150430115721.22278.94082.stgit@preeti.in.ibm.com Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 867722591be2..a7c0941d10da 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -450,7 +450,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s * with an explicit memory barrier or atomic instruction that provides the * required ordering. * - * If possible use READ_ONCE/ASSIGN_ONCE instead. + * If possible use READ_ONCE()/WRITE_ONCE() instead. */ #define __ACCESS_ONCE(x) ({ \ __maybe_unused typeof(x) __var = (__force typeof(x)) 0; \ -- cgit v1.2.3 From 56f13c0d9524c5816f5dc9c91b9d766d6b1064ca Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 9 Apr 2015 12:35:47 +0300 Subject: dmaengine: of_dma: Support for DMA routers DMA routers are transparent devices used to mux DMA requests from peripherals to DMA controllers. They are used when the SoC integrates more devices with DMA requests then their controller can handle. DRA7x is one example of such SoC, where the sDMA can hanlde 128 DMA request lines, but in SoC level it has 205 DMA requests. The of_dma_router will be registered as of_dma_controller with special xlate function and additional parameters. The driver for the router is responsible to craft the dma_spec (in the of_dma_route_allocate callback) which can be used to requests a DMA channel from the real DMA controller. This way the router can be transparent for the system while remaining generic enough to be used in different environments. Signed-off-by: Peter Ujfalusi Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/dma.txt | 28 +++++++++ drivers/dma/dmaengine.c | 7 +++ drivers/dma/of-dma.c | 89 +++++++++++++++++++++++++++ include/linux/dmaengine.h | 17 +++++ include/linux/of_dma.h | 21 +++++++ 5 files changed, 162 insertions(+) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/dma/dma.txt b/Documentation/devicetree/bindings/dma/dma.txt index 82104271e754..6312fb00ce8d 100644 --- a/Documentation/devicetree/bindings/dma/dma.txt +++ b/Documentation/devicetree/bindings/dma/dma.txt @@ -31,6 +31,34 @@ Example: dma-requests = <127>; }; +* DMA router + +DMA routers are transparent IP blocks used to route DMA request lines from +devices to the DMA controller. Some SoCs (like TI DRA7x) have more peripherals +integrated with DMA requests than what the DMA controller can handle directly. + +Required property: +- dma-masters: phandle of the DMA controller or list of phandles for + the DMA controllers the router can direct the signal to. +- #dma-cells: Must be at least 1. Used to provide DMA router specific + information. See DMA client binding below for more + details. + +Optional properties: +- dma-requests: Number of incoming request lines the router can handle. +- In the node pointed by the dma-masters: + - dma-requests: The router driver might need to look for this in order + to configure the routing. + +Example: + sdma_xbar: dma-router@4a002b78 { + compatible = "ti,dra7-dma-crossbar"; + reg = <0x4a002b78 0xfc>; + #dma-cells = <1>; + dma-requests = <205>; + ti,dma-safe-map = <0>; + dma-masters = <&sdma>; + }; * DMA client diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 0e035a8cf401..9e5949696b1b 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -267,6 +267,13 @@ static void dma_chan_put(struct dma_chan *chan) /* This channel is not in use anymore, free it */ if (!chan->client_count && chan->device->device_free_chan_resources) chan->device->device_free_chan_resources(chan); + + /* If the channel is used via a DMA request router, free the mapping */ + if (chan->router && chan->router->route_free) { + chan->router->route_free(chan->router->dev, chan->route_data); + chan->router = NULL; + chan->route_data = NULL; + } } enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie) diff --git a/drivers/dma/of-dma.c b/drivers/dma/of-dma.c index cbd4a8aff120..1e1f2986eba8 100644 --- a/drivers/dma/of-dma.c +++ b/drivers/dma/of-dma.c @@ -44,6 +44,50 @@ static struct of_dma *of_dma_find_controller(struct of_phandle_args *dma_spec) return NULL; } +/** + * of_dma_router_xlate - translation function for router devices + * @dma_spec: pointer to DMA specifier as found in the device tree + * @of_dma: pointer to DMA controller data (router information) + * + * The function creates new dma_spec to be passed to the router driver's + * of_dma_route_allocate() function to prepare a dma_spec which will be used + * to request channel from the real DMA controller. + */ +static struct dma_chan *of_dma_router_xlate(struct of_phandle_args *dma_spec, + struct of_dma *ofdma) +{ + struct dma_chan *chan; + struct of_dma *ofdma_target; + struct of_phandle_args dma_spec_target; + void *route_data; + + /* translate the request for the real DMA controller */ + memcpy(&dma_spec_target, dma_spec, sizeof(dma_spec_target)); + route_data = ofdma->of_dma_route_allocate(&dma_spec_target, ofdma); + if (IS_ERR(route_data)) + return NULL; + + ofdma_target = of_dma_find_controller(&dma_spec_target); + if (!ofdma_target) + return NULL; + + chan = ofdma_target->of_dma_xlate(&dma_spec_target, ofdma_target); + if (chan) { + chan->router = ofdma->dma_router; + chan->route_data = route_data; + } else { + ofdma->dma_router->route_free(ofdma->dma_router->dev, + route_data); + } + + /* + * Need to put the node back since the ofdma->of_dma_route_allocate + * has taken it for generating the new, translated dma_spec + */ + of_node_put(dma_spec_target.np); + return chan; +} + /** * of_dma_controller_register - Register a DMA controller to DT DMA helpers * @np: device node of DMA controller @@ -109,6 +153,51 @@ void of_dma_controller_free(struct device_node *np) } EXPORT_SYMBOL_GPL(of_dma_controller_free); +/** + * of_dma_router_register - Register a DMA router to DT DMA helpers as a + * controller + * @np: device node of DMA router + * @of_dma_route_allocate: setup function for the router which need to + * modify the dma_spec for the DMA controller to + * use and to set up the requested route. + * @dma_router: pointer to dma_router structure to be used when + * the route need to be free up. + * + * Returns 0 on success or appropriate errno value on error. + * + * Allocated memory should be freed with appropriate of_dma_controller_free() + * call. + */ +int of_dma_router_register(struct device_node *np, + void *(*of_dma_route_allocate) + (struct of_phandle_args *, struct of_dma *), + struct dma_router *dma_router) +{ + struct of_dma *ofdma; + + if (!np || !of_dma_route_allocate || !dma_router) { + pr_err("%s: not enough information provided\n", __func__); + return -EINVAL; + } + + ofdma = kzalloc(sizeof(*ofdma), GFP_KERNEL); + if (!ofdma) + return -ENOMEM; + + ofdma->of_node = np; + ofdma->of_dma_xlate = of_dma_router_xlate; + ofdma->of_dma_route_allocate = of_dma_route_allocate; + ofdma->dma_router = dma_router; + + /* Now queue of_dma controller structure in list */ + mutex_lock(&of_dma_lock); + list_add_tail(&ofdma->of_dma_controllers, &of_dma_list); + mutex_unlock(&of_dma_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(of_dma_router_register); + /** * of_dma_match_channel - Check if a DMA specifier matches name * @np: device node to look for DMA channels diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index ad419757241f..abf63ceabef9 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -221,6 +221,16 @@ struct dma_chan_percpu { unsigned long bytes_transferred; }; +/** + * struct dma_router - DMA router structure + * @dev: pointer to the DMA router device + * @route_free: function to be called when the route can be disconnected + */ +struct dma_router { + struct device *dev; + void (*route_free)(struct device *dev, void *route_data); +}; + /** * struct dma_chan - devices supply DMA channels, clients use them * @device: ptr to the dma device who supplies this channel, always !%NULL @@ -232,6 +242,8 @@ struct dma_chan_percpu { * @local: per-cpu pointer to a struct dma_chan_percpu * @client_count: how many clients are using this channel * @table_count: number of appearances in the mem-to-mem allocation table + * @router: pointer to the DMA router structure + * @route_data: channel specific data for the router * @private: private data for certain client-channel associations */ struct dma_chan { @@ -247,6 +259,11 @@ struct dma_chan { struct dma_chan_percpu __percpu *local; int client_count; int table_count; + + /* DMA router */ + struct dma_router *router; + void *route_data; + void *private; }; diff --git a/include/linux/of_dma.h b/include/linux/of_dma.h index 56bc026c143f..98ba7525929e 100644 --- a/include/linux/of_dma.h +++ b/include/linux/of_dma.h @@ -23,6 +23,9 @@ struct of_dma { struct device_node *of_node; struct dma_chan *(*of_dma_xlate) (struct of_phandle_args *, struct of_dma *); + void *(*of_dma_route_allocate) + (struct of_phandle_args *, struct of_dma *); + struct dma_router *dma_router; void *of_dma_data; }; @@ -37,12 +40,20 @@ extern int of_dma_controller_register(struct device_node *np, (struct of_phandle_args *, struct of_dma *), void *data); extern void of_dma_controller_free(struct device_node *np); + +extern int of_dma_router_register(struct device_node *np, + void *(*of_dma_route_allocate) + (struct of_phandle_args *, struct of_dma *), + struct dma_router *dma_router); +#define of_dma_router_free of_dma_controller_free + extern struct dma_chan *of_dma_request_slave_channel(struct device_node *np, const char *name); extern struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec, struct of_dma *ofdma); extern struct dma_chan *of_dma_xlate_by_chan_id(struct of_phandle_args *dma_spec, struct of_dma *ofdma); + #else static inline int of_dma_controller_register(struct device_node *np, struct dma_chan *(*of_dma_xlate) @@ -56,6 +67,16 @@ static inline void of_dma_controller_free(struct device_node *np) { } +static inline int of_dma_router_register(struct device_node *np, + void *(*of_dma_route_allocate) + (struct of_phandle_args *, struct of_dma *), + struct dma_router *dma_router) +{ + return -ENODEV; +} + +#define of_dma_router_free of_dma_controller_free + static inline struct dma_chan *of_dma_request_slave_channel(struct device_node *np, const char *name) { -- cgit v1.2.3 From 4ae92bc77ac8e620f7c8d59b5882a4cb0d1c4ef1 Mon Sep 17 00:00:00 2001 From: Nicolas Schichan Date: Wed, 6 May 2015 16:12:27 +0200 Subject: net: filter: add a callback to allow classic post-verifier transformations This is in preparation for use by the seccomp code, the rationale is not to duplicate additional code within the seccomp layer, but instead, have it abstracted and hidden within the classic BPF API. As an interim step, this now also makes bpf_prepare_filter() visible (not as exported symbol though), so that seccomp can reuse that code path instead of reimplementing it. Joint work with Daniel Borkmann. Signed-off-by: Nicolas Schichan Signed-off-by: Daniel Borkmann Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 6 ++++++ net/core/filter.c | 18 +++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index fa11b3a367be..91996247cb55 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -384,7 +384,13 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_detach_filter(struct sock *sk); +typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter, + unsigned int flen); + int bpf_check_classic(const struct sock_filter *filter, unsigned int flen); +struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, + bpf_aux_classic_check_t trans); + int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); diff --git a/net/core/filter.c b/net/core/filter.c index bf831a85c315..e670494f1d83 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -988,7 +988,8 @@ out_err: return ERR_PTR(err); } -static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) +struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, + bpf_aux_classic_check_t trans) { int err; @@ -1001,6 +1002,17 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) return ERR_PTR(err); } + /* There might be additional checks and transformations + * needed on classic filters, f.e. in case of seccomp. + */ + if (trans) { + err = trans(fp->insns, fp->len); + if (err) { + __bpf_prog_release(fp); + return ERR_PTR(err); + } + } + /* Probe if we can JIT compile the filter and if so, do * the compilation of the filter. */ @@ -1050,7 +1062,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ - fp = bpf_prepare_filter(fp); + fp = bpf_prepare_filter(fp, NULL); if (IS_ERR(fp)) return PTR_ERR(fp); @@ -1135,7 +1147,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ - prog = bpf_prepare_filter(prog); + prog = bpf_prepare_filter(prog, NULL); if (IS_ERR(prog)) return PTR_ERR(prog); -- cgit v1.2.3 From d9e12f42e58da475379b9080708b94f2095904af Mon Sep 17 00:00:00 2001 From: Nicolas Schichan Date: Wed, 6 May 2015 16:12:28 +0200 Subject: seccomp: simplify seccomp_prepare_filter and reuse bpf_prepare_filter Remove the calls to bpf_check_classic(), bpf_convert_filter() and bpf_migrate_runtime() and let bpf_prepare_filter() take care of that instead. seccomp_check_filter() is passed to bpf_prepare_filter() so that it gets called from there, after bpf_check_classic(). We can now remove exposure of two internal classic BPF functions previously used by seccomp. The export of bpf_check_classic() symbol, previously known as sk_chk_filter(), was there since pre git times, and no in-tree module was using it, therefore remove it. Joint work with Daniel Borkmann. Signed-off-by: Nicolas Schichan Signed-off-by: Daniel Borkmann Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 4 --- kernel/seccomp.c | 68 ++++++++++++++++---------------------------------- net/core/filter.c | 8 +++--- 3 files changed, 26 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 91996247cb55..0dcb44bcfc5f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -363,9 +363,6 @@ int sk_filter(struct sock *sk, struct sk_buff *skb); void bpf_prog_select_runtime(struct bpf_prog *fp); void bpf_prog_free(struct bpf_prog *fp); -int bpf_convert_filter(struct sock_filter *prog, int len, - struct bpf_insn *new_prog, int *new_len); - struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); @@ -387,7 +384,6 @@ int sk_detach_filter(struct sock *sk); typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter, unsigned int flen); -int bpf_check_classic(const struct sock_filter *filter, unsigned int flen); struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, bpf_aux_classic_check_t trans); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 4f44028943e6..93d40f7f3683 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -347,15 +347,14 @@ static inline void seccomp_sync_threads(void) static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *filter; - unsigned long fp_size; - struct sock_filter *fp; - int new_len; - long ret; + struct bpf_prog *prog; + unsigned long fsize; if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); + BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); - fp_size = fprog->len * sizeof(struct sock_filter); + fsize = bpf_classic_proglen(fprog); /* * Installing a seccomp filter requires that the task has @@ -368,60 +367,37 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) CAP_SYS_ADMIN) != 0) return ERR_PTR(-EACCES); - fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); - if (!fp) + prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); + if (!prog) return ERR_PTR(-ENOMEM); /* Copy the instructions from fprog. */ - ret = -EFAULT; - if (copy_from_user(fp, fprog->filter, fp_size)) - goto free_prog; - - /* Check and rewrite the fprog via the skb checker */ - ret = bpf_check_classic(fp, fprog->len); - if (ret) - goto free_prog; + if (copy_from_user(prog->insns, fprog->filter, fsize)) { + __bpf_prog_free(prog); + return ERR_PTR(-EFAULT); + } - /* Check and rewrite the fprog for seccomp use */ - ret = seccomp_check_filter(fp, fprog->len); - if (ret) - goto free_prog; + prog->len = fprog->len; - /* Convert 'sock_filter' insns to 'bpf_insn' insns */ - ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len); - if (ret) - goto free_prog; + /* bpf_prepare_filter() already takes care of freeing + * memory in case something goes wrong. + */ + prog = bpf_prepare_filter(prog, seccomp_check_filter); + if (IS_ERR(prog)) + return ERR_CAST(prog); /* Allocate a new seccomp_filter */ - ret = -ENOMEM; filter = kzalloc(sizeof(struct seccomp_filter), GFP_KERNEL|__GFP_NOWARN); - if (!filter) - goto free_prog; - - filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN); - if (!filter->prog) - goto free_filter; - - ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); - if (ret) - goto free_filter_prog; + if (!filter) { + bpf_prog_destroy(prog); + return ERR_PTR(-ENOMEM); + } - kfree(fp); + filter->prog = prog; atomic_set(&filter->usage, 1); - filter->prog->len = new_len; - - bpf_prog_select_runtime(filter->prog); return filter; - -free_filter_prog: - __bpf_prog_free(filter->prog); -free_filter: - kfree(filter); -free_prog: - kfree(fp); - return ERR_PTR(ret); } /** diff --git a/net/core/filter.c b/net/core/filter.c index e670494f1d83..f887084740cd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -355,8 +355,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * for socket filters: ctx == 'struct sk_buff *', for seccomp: * ctx == 'struct seccomp_data *'. */ -int bpf_convert_filter(struct sock_filter *prog, int len, - struct bpf_insn *new_prog, int *new_len) +static int bpf_convert_filter(struct sock_filter *prog, int len, + struct bpf_insn *new_prog, int *new_len) { int new_flen = 0, pass = 0, target, i; struct bpf_insn *new_insn; @@ -751,7 +751,8 @@ static bool chk_code_allowed(u16 code_to_probe) * * Returns 0 if the rule set is legal or -EINVAL if not. */ -int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) +static int bpf_check_classic(const struct sock_filter *filter, + unsigned int flen) { bool anc_found; int pc; @@ -825,7 +826,6 @@ int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) return -EINVAL; } -EXPORT_SYMBOL(bpf_check_classic); static int bpf_prog_store_orig_filter(struct bpf_prog *fp, const struct sock_fprog *fprog) -- cgit v1.2.3 From ac67eb2c5347bd9976308c0e0cf1d9e7ca690342 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 6 May 2015 16:12:30 +0200 Subject: seccomp, filter: add and use bpf_prog_create_from_user from seccomp Seccomp has always been a special candidate when it comes to preparation of its filters in seccomp_prepare_filter(). Due to the extra checks and filter rewrite it partially duplicates code and has BPF internals exposed. This patch adds a generic API inside the BPF code code that seccomp can use and thus keep it's filter preparation code minimal and better maintainable. The other side-effect is that now classic JITs can add seccomp support as well by only providing a BPF_LDX | BPF_W | BPF_ABS translation. Tested with seccomp and BPF test suites. Signed-off-by: Daniel Borkmann Cc: Nicolas Schichan Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 12 +++++------- kernel/seccomp.c | 42 ++++++++++++----------------------------- net/core/filter.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 66 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 0dcb44bcfc5f..3c03a6085b82 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -374,19 +374,17 @@ static inline void bpf_prog_unlock_free(struct bpf_prog *fp) __bpf_prog_free(fp); } +typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter, + unsigned int flen); + int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); +int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, + bpf_aux_classic_check_t trans); void bpf_prog_destroy(struct bpf_prog *fp); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_detach_filter(struct sock *sk); - -typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter, - unsigned int flen); - -struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, - bpf_aux_classic_check_t trans); - int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 93d40f7f3683..245df6b32b81 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -346,15 +346,13 @@ static inline void seccomp_sync_threads(void) */ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { - struct seccomp_filter *filter; - struct bpf_prog *prog; - unsigned long fsize; + struct seccomp_filter *sfilter; + int ret; if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); - fsize = bpf_classic_proglen(fprog); /* * Installing a seccomp filter requires that the task has @@ -367,37 +365,21 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) CAP_SYS_ADMIN) != 0) return ERR_PTR(-EACCES); - prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); - if (!prog) - return ERR_PTR(-ENOMEM); - - /* Copy the instructions from fprog. */ - if (copy_from_user(prog->insns, fprog->filter, fsize)) { - __bpf_prog_free(prog); - return ERR_PTR(-EFAULT); - } - - prog->len = fprog->len; - - /* bpf_prepare_filter() already takes care of freeing - * memory in case something goes wrong. - */ - prog = bpf_prepare_filter(prog, seccomp_check_filter); - if (IS_ERR(prog)) - return ERR_CAST(prog); - /* Allocate a new seccomp_filter */ - filter = kzalloc(sizeof(struct seccomp_filter), - GFP_KERNEL|__GFP_NOWARN); - if (!filter) { - bpf_prog_destroy(prog); + sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN); + if (!sfilter) return ERR_PTR(-ENOMEM); + + ret = bpf_prog_create_from_user(&sfilter->prog, fprog, + seccomp_check_filter); + if (ret < 0) { + kfree(sfilter); + return ERR_PTR(ret); } - filter->prog = prog; - atomic_set(&filter->usage, 1); + atomic_set(&sfilter->usage, 1); - return filter; + return sfilter; } /** diff --git a/net/core/filter.c b/net/core/filter.c index 45c015d6b974..a831f193e2c7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -991,8 +991,8 @@ out_err: return ERR_PTR(err); } -struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, - bpf_aux_classic_check_t trans) +static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, + bpf_aux_classic_check_t trans) { int err; @@ -1074,6 +1074,53 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) } EXPORT_SYMBOL_GPL(bpf_prog_create); +/** + * bpf_prog_create_from_user - create an unattached filter from user buffer + * @pfp: the unattached filter that is created + * @fprog: the filter program + * @trans: post-classic verifier transformation handler + * + * This function effectively does the same as bpf_prog_create(), only + * that it builds up its insns buffer from user space provided buffer. + * It also allows for passing a bpf_aux_classic_check_t handler. + */ +int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, + bpf_aux_classic_check_t trans) +{ + unsigned int fsize = bpf_classic_proglen(fprog); + struct bpf_prog *fp; + + /* Make sure new filter is there and in the right amounts. */ + if (fprog->filter == NULL) + return -EINVAL; + + fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); + if (!fp) + return -ENOMEM; + + if (copy_from_user(fp->insns, fprog->filter, fsize)) { + __bpf_prog_free(fp); + return -EFAULT; + } + + fp->len = fprog->len; + /* Since unattached filters are not copied back to user + * space through sk_get_filter(), we do not need to hold + * a copy here, and can spare us the work. + */ + fp->orig_prog = NULL; + + /* bpf_prepare_filter() already takes care of freeing + * memory in case something goes wrong. + */ + fp = bpf_prepare_filter(fp, trans); + if (IS_ERR(fp)) + return PTR_ERR(fp); + + *pfp = fp; + return 0; +} + void bpf_prog_destroy(struct bpf_prog *fp) { __bpf_prog_release(fp); -- cgit v1.2.3 From 59324cf35aba5336b611074028777838a963d03b Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:53 +0200 Subject: netlink: allow to listen "all" netns More accurately, listen all netns that have a nsid assigned into the netns where the netlink socket is opened. For this purpose, a netlink socket option is added: NETLINK_LISTEN_ALL_NSID. When this option is set on a netlink socket, this socket will receive netlink notifications from all netns that have a nsid assigned into the netns where the socket has been opened. The nsid is sent to userland via an anscillary data. With this patch, a daemon needs only one socket to listen many netns. This is useful when the number of netns is high. Because 0 is a valid value for a nsid, the field nsid_is_set indicates if the field nsid is valid or not. skb->cb is initialized to 0 on skb allocation, thus we are sure that we will never send a nsid 0 by error to the userland. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/netlink.h | 2 ++ include/net/net_namespace.h | 2 ++ include/uapi/linux/netlink.h | 1 + net/core/net_namespace.c | 10 ++++++++- net/netlink/af_netlink.c | 52 +++++++++++++++++++++++++++++++++++++++----- 5 files changed, 61 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 6835c1279df7..9120edb650a0 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -28,6 +28,8 @@ struct netlink_skb_parms { __u32 dst_group; __u32 flags; struct sock *sk; + bool nsid_is_set; + int nsid; }; #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 6d1e2eae32fb..3f850acc844e 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -272,6 +272,8 @@ static inline struct net *read_pnet(const possible_net_t *pnet) #endif int peernet2id_alloc(struct net *net, struct net *peer); +int peernet2id(struct net *net, struct net *peer); +bool peernet_has_id(struct net *net, struct net *peer); struct net *get_net_ns_by_id(struct net *net, int id); struct pernet_operations { diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 1a85940f8ab7..3e34b7d702f8 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -108,6 +108,7 @@ struct nlmsgerr { #define NETLINK_NO_ENOBUFS 5 #define NETLINK_RX_RING 6 #define NETLINK_TX_RING 7 +#define NETLINK_LISTEN_ALL_NSID 8 struct nl_pktinfo { __u32 group; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index ae5008b097de..a665bf490c88 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -229,7 +229,7 @@ int peernet2id_alloc(struct net *net, struct net *peer) EXPORT_SYMBOL(peernet2id_alloc); /* This function returns, if assigned, the id of a peer netns. */ -static int peernet2id(struct net *net, struct net *peer) +int peernet2id(struct net *net, struct net *peer) { unsigned long flags; int id; @@ -240,6 +240,14 @@ static int peernet2id(struct net *net, struct net *peer) return id; } +/* This function returns true is the peer netns has an id assigned into the + * current netns. + */ +bool peernet_has_id(struct net *net, struct net *peer) +{ + return peernet2id(net, peer) >= 0; +} + struct net *get_net_ns_by_id(struct net *net, int id) { unsigned long flags; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index bf7f56d7a9aa..a5fff75accf8 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -83,6 +83,7 @@ struct listeners { #define NETLINK_F_RECV_PKTINFO 0x2 #define NETLINK_F_BROADCAST_SEND_ERROR 0x4 #define NETLINK_F_RECV_NO_ENOBUFS 0x8 +#define NETLINK_F_LISTEN_ALL_NSID 0x10 static inline int netlink_is_kernel(struct sock *sk) { @@ -1932,8 +1933,17 @@ static void do_one_broadcast(struct sock *sk, !test_bit(p->group - 1, nlk->groups)) return; - if (!net_eq(sock_net(sk), p->net)) - return; + if (!net_eq(sock_net(sk), p->net)) { + if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID)) + return; + + if (!peernet_has_id(sock_net(sk), p->net)) + return; + + if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns, + CAP_NET_BROADCAST)) + return; + } if (p->failure) { netlink_overrun(sk); @@ -1959,13 +1969,22 @@ static void do_one_broadcast(struct sock *sk, p->failure = 1; if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; - } else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { + goto out; + } + if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { kfree_skb(p->skb2); p->skb2 = NULL; - } else if (sk_filter(sk, p->skb2)) { + goto out; + } + if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; - } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { + goto out; + } + NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); + NETLINK_CB(p->skb2).nsid_is_set = true; + val = netlink_broadcast_deliver(sk, p->skb2); + if (val < 0) { netlink_overrun(sk); if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; @@ -1974,6 +1993,7 @@ static void do_one_broadcast(struct sock *sk, p->delivered = 1; p->skb2 = NULL; } +out: sock_put(sk); } @@ -2202,6 +2222,16 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, break; } #endif /* CONFIG_NETLINK_MMAP */ + case NETLINK_LISTEN_ALL_NSID: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) + return -EPERM; + + if (val) + nlk->flags |= NETLINK_F_LISTEN_ALL_NSID; + else + nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2268,6 +2298,16 @@ static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); } +static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) +{ + if (!NETLINK_CB(skb).nsid_is_set) + return; + + put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int), + &NETLINK_CB(skb).nsid); +} + static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; @@ -2421,6 +2461,8 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (nlk->flags & NETLINK_F_RECV_PKTINFO) netlink_cmsg_recv_pktinfo(msg, skb); + if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID) + netlink_cmsg_listen_all_nsid(sk, msg, skb); memset(&scm, 0, sizeof(scm)); scm.creds = *NETLINK_CREDS(skb); -- cgit v1.2.3 From 920ce39f6c204d4ce4d8acebe7522f0dfa95f662 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 8 May 2015 14:31:50 -0700 Subject: sched, timer: Fix documentation for 'struct thread_group_cputimer' Fix the docbook build bug reported by Fengguang Wu. Reported-by: Fengguang Wu Signed-off-by: Jason Low Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hp.com Cc: aswin@hp.com Cc: bp@alien8.de Cc: dave@stgolabs.net Cc: fweisbec@gmail.com Cc: mgorman@suse.de Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Cc: rostedt@goodmis.org Cc: scott.norton@hp.com Cc: torvalds@linux-foundation.org Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/1431120710.5136.12.camel@j-VirtualBox Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 254d88e80f65..0eceeec5a01a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -606,10 +606,9 @@ struct task_cputime_atomic { /** * struct thread_group_cputimer - thread group interval timer counts - * @cputime: thread group interval timers. + * @cputime_atomic: atomic thread group interval timers. * @running: non-zero when there are timers running and * @cputime receives updates. - * @lock: lock for fields in this struct. * * This structure contains the version of task_cputime, above, that is * used for thread group CPU timer calculations. -- cgit v1.2.3 From 34ef33f7da6b00900d3a896d33522a035a930245 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 28 Apr 2015 14:04:07 +0200 Subject: usb: phy: Remove the phy-rcar-gen2-usb driver The phy-rcar-gen2-usb driver, which supports legacy platform data only, is no longer used since commit a483dcbfa21f919c ("ARM: shmobile: lager: Remove legacy board support"). This driver was superseded by the DT-only phy-rcar-gen2 driver, which was introduced in commit 1233f59f745b237d ("phy: Renesas R-Car Gen2 PHY driver"). Signed-off-by: Geert Uytterhoeven Signed-off-by: Greg Kroah-Hartman --- drivers/usb/phy/Kconfig | 13 -- drivers/usb/phy/Makefile | 1 - drivers/usb/phy/phy-rcar-gen2-usb.c | 246 ------------------------ include/linux/platform_data/usb-rcar-gen2-phy.h | 22 --- 4 files changed, 282 deletions(-) delete mode 100644 drivers/usb/phy/phy-rcar-gen2-usb.c delete mode 100644 include/linux/platform_data/usb-rcar-gen2-phy.h (limited to 'include/linux') diff --git a/drivers/usb/phy/Kconfig b/drivers/usb/phy/Kconfig index 2175678e674e..3cd3bee54ca6 100644 --- a/drivers/usb/phy/Kconfig +++ b/drivers/usb/phy/Kconfig @@ -186,19 +186,6 @@ config USB_RCAR_PHY To compile this driver as a module, choose M here: the module will be called phy-rcar-usb. -config USB_RCAR_GEN2_PHY - tristate "Renesas R-Car Gen2 USB PHY support" - depends on ARCH_R8A7790 || ARCH_R8A7791 || COMPILE_TEST - select USB_PHY - help - Say Y here to add support for the Renesas R-Car Gen2 USB PHY driver. - It is typically used to control internal USB PHY for USBHS, - and to configure shared USB channels 0 and 2. - This driver supports R8A7790 and R8A7791. - - To compile this driver as a module, choose M here: the - module will be called phy-rcar-gen2-usb. - config USB_ULPI bool "Generic ULPI Transceiver Driver" depends on ARM || ARM64 diff --git a/drivers/usb/phy/Makefile b/drivers/usb/phy/Makefile index 75f2bba58c84..e36ab1d46d8b 100644 --- a/drivers/usb/phy/Makefile +++ b/drivers/usb/phy/Makefile @@ -23,7 +23,6 @@ obj-$(CONFIG_USB_MSM_OTG) += phy-msm-usb.o obj-$(CONFIG_USB_MV_OTG) += phy-mv-usb.o obj-$(CONFIG_USB_MXS_PHY) += phy-mxs-usb.o obj-$(CONFIG_USB_RCAR_PHY) += phy-rcar-usb.o -obj-$(CONFIG_USB_RCAR_GEN2_PHY) += phy-rcar-gen2-usb.o obj-$(CONFIG_USB_ULPI) += phy-ulpi.o obj-$(CONFIG_USB_ULPI_VIEWPORT) += phy-ulpi-viewport.o obj-$(CONFIG_KEYSTONE_USB_PHY) += phy-keystone.o diff --git a/drivers/usb/phy/phy-rcar-gen2-usb.c b/drivers/usb/phy/phy-rcar-gen2-usb.c deleted file mode 100644 index f81800b6562a..000000000000 --- a/drivers/usb/phy/phy-rcar-gen2-usb.c +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Renesas R-Car Gen2 USB phy driver - * - * Copyright (C) 2013 Renesas Solutions Corp. - * Copyright (C) 2013 Cogent Embedded, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -struct rcar_gen2_usb_phy_priv { - struct usb_phy phy; - void __iomem *base; - struct clk *clk; - spinlock_t lock; - int usecount; - u32 ugctrl2; -}; - -#define usb_phy_to_priv(p) container_of(p, struct rcar_gen2_usb_phy_priv, phy) - -/* Low Power Status register */ -#define USBHS_LPSTS_REG 0x02 -#define USBHS_LPSTS_SUSPM (1 << 14) - -/* USB General control register */ -#define USBHS_UGCTRL_REG 0x80 -#define USBHS_UGCTRL_CONNECT (1 << 2) -#define USBHS_UGCTRL_PLLRESET (1 << 0) - -/* USB General control register 2 */ -#define USBHS_UGCTRL2_REG 0x84 -#define USBHS_UGCTRL2_USB0_PCI (1 << 4) -#define USBHS_UGCTRL2_USB0_HS (3 << 4) -#define USBHS_UGCTRL2_USB2_PCI (0 << 31) -#define USBHS_UGCTRL2_USB2_SS (1 << 31) - -/* USB General status register */ -#define USBHS_UGSTS_REG 0x88 -#define USBHS_UGSTS_LOCK (1 << 8) - -/* Enable USBHS internal phy */ -static int __rcar_gen2_usbhs_phy_enable(void __iomem *base) -{ - u32 val; - int i; - - /* USBHS PHY power on */ - val = ioread32(base + USBHS_UGCTRL_REG); - val &= ~USBHS_UGCTRL_PLLRESET; - iowrite32(val, base + USBHS_UGCTRL_REG); - - val = ioread16(base + USBHS_LPSTS_REG); - val |= USBHS_LPSTS_SUSPM; - iowrite16(val, base + USBHS_LPSTS_REG); - - for (i = 0; i < 20; i++) { - val = ioread32(base + USBHS_UGSTS_REG); - if ((val & USBHS_UGSTS_LOCK) == USBHS_UGSTS_LOCK) { - val = ioread32(base + USBHS_UGCTRL_REG); - val |= USBHS_UGCTRL_CONNECT; - iowrite32(val, base + USBHS_UGCTRL_REG); - return 0; - } - udelay(1); - } - - /* Timed out waiting for the PLL lock */ - return -ETIMEDOUT; -} - -/* Disable USBHS internal phy */ -static int __rcar_gen2_usbhs_phy_disable(void __iomem *base) -{ - u32 val; - - /* USBHS PHY power off */ - val = ioread32(base + USBHS_UGCTRL_REG); - val &= ~USBHS_UGCTRL_CONNECT; - iowrite32(val, base + USBHS_UGCTRL_REG); - - val = ioread16(base + USBHS_LPSTS_REG); - val &= ~USBHS_LPSTS_SUSPM; - iowrite16(val, base + USBHS_LPSTS_REG); - - val = ioread32(base + USBHS_UGCTRL_REG); - val |= USBHS_UGCTRL_PLLRESET; - iowrite32(val, base + USBHS_UGCTRL_REG); - return 0; -} - -/* Setup USB channels */ -static void __rcar_gen2_usb_phy_init(struct rcar_gen2_usb_phy_priv *priv) -{ - u32 val; - - clk_prepare_enable(priv->clk); - - /* Set USB channels in the USBHS UGCTRL2 register */ - val = ioread32(priv->base + USBHS_UGCTRL2_REG); - val &= ~(USBHS_UGCTRL2_USB0_HS | USBHS_UGCTRL2_USB2_SS); - val |= priv->ugctrl2; - iowrite32(val, priv->base + USBHS_UGCTRL2_REG); -} - -/* Shutdown USB channels */ -static void __rcar_gen2_usb_phy_shutdown(struct rcar_gen2_usb_phy_priv *priv) -{ - __rcar_gen2_usbhs_phy_disable(priv->base); - clk_disable_unprepare(priv->clk); -} - -static int rcar_gen2_usb_phy_set_suspend(struct usb_phy *phy, int suspend) -{ - struct rcar_gen2_usb_phy_priv *priv = usb_phy_to_priv(phy); - unsigned long flags; - int retval; - - spin_lock_irqsave(&priv->lock, flags); - retval = suspend ? __rcar_gen2_usbhs_phy_disable(priv->base) : - __rcar_gen2_usbhs_phy_enable(priv->base); - spin_unlock_irqrestore(&priv->lock, flags); - return retval; -} - -static int rcar_gen2_usb_phy_init(struct usb_phy *phy) -{ - struct rcar_gen2_usb_phy_priv *priv = usb_phy_to_priv(phy); - unsigned long flags; - - spin_lock_irqsave(&priv->lock, flags); - /* - * Enable the clock and setup USB channels - * if it's the first user - */ - if (!priv->usecount++) - __rcar_gen2_usb_phy_init(priv); - spin_unlock_irqrestore(&priv->lock, flags); - return 0; -} - -static void rcar_gen2_usb_phy_shutdown(struct usb_phy *phy) -{ - struct rcar_gen2_usb_phy_priv *priv = usb_phy_to_priv(phy); - unsigned long flags; - - spin_lock_irqsave(&priv->lock, flags); - if (!priv->usecount) { - dev_warn(phy->dev, "Trying to disable phy with 0 usecount\n"); - goto out; - } - - /* Disable everything if it's the last user */ - if (!--priv->usecount) - __rcar_gen2_usb_phy_shutdown(priv); -out: - spin_unlock_irqrestore(&priv->lock, flags); -} - -static int rcar_gen2_usb_phy_probe(struct platform_device *pdev) -{ - struct device *dev = &pdev->dev; - struct rcar_gen2_phy_platform_data *pdata; - struct rcar_gen2_usb_phy_priv *priv; - struct resource *res; - void __iomem *base; - struct clk *clk; - int retval; - - pdata = dev_get_platdata(dev); - if (!pdata) { - dev_err(dev, "No platform data\n"); - return -EINVAL; - } - - clk = devm_clk_get(dev, "usbhs"); - if (IS_ERR(clk)) { - dev_err(dev, "Can't get the clock\n"); - return PTR_ERR(clk); - } - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - base = devm_ioremap_resource(dev, res); - if (IS_ERR(base)) - return PTR_ERR(base); - - priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); - if (!priv) - return -ENOMEM; - - spin_lock_init(&priv->lock); - priv->clk = clk; - priv->base = base; - priv->ugctrl2 = pdata->chan0_pci ? - USBHS_UGCTRL2_USB0_PCI : USBHS_UGCTRL2_USB0_HS; - priv->ugctrl2 |= pdata->chan2_pci ? - USBHS_UGCTRL2_USB2_PCI : USBHS_UGCTRL2_USB2_SS; - priv->phy.dev = dev; - priv->phy.label = dev_name(dev); - priv->phy.init = rcar_gen2_usb_phy_init; - priv->phy.shutdown = rcar_gen2_usb_phy_shutdown; - priv->phy.set_suspend = rcar_gen2_usb_phy_set_suspend; - - retval = usb_add_phy_dev(&priv->phy); - if (retval < 0) { - dev_err(dev, "Failed to add USB phy\n"); - return retval; - } - - platform_set_drvdata(pdev, priv); - - return retval; -} - -static int rcar_gen2_usb_phy_remove(struct platform_device *pdev) -{ - struct rcar_gen2_usb_phy_priv *priv = platform_get_drvdata(pdev); - - usb_remove_phy(&priv->phy); - - return 0; -} - -static struct platform_driver rcar_gen2_usb_phy_driver = { - .driver = { - .name = "usb_phy_rcar_gen2", - }, - .probe = rcar_gen2_usb_phy_probe, - .remove = rcar_gen2_usb_phy_remove, -}; - -module_platform_driver(rcar_gen2_usb_phy_driver); - -MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("Renesas R-Car Gen2 USB phy"); -MODULE_AUTHOR("Valentine Barshak "); diff --git a/include/linux/platform_data/usb-rcar-gen2-phy.h b/include/linux/platform_data/usb-rcar-gen2-phy.h deleted file mode 100644 index dd3ba46c0d90..000000000000 --- a/include/linux/platform_data/usb-rcar-gen2-phy.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (C) 2013 Renesas Solutions Corp. - * Copyright (C) 2013 Cogent Embedded, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef __USB_RCAR_GEN2_PHY_H -#define __USB_RCAR_GEN2_PHY_H - -#include - -struct rcar_gen2_phy_platform_data { - /* USB channel 0 configuration */ - bool chan0_pci:1; /* true: PCI USB host 0, false: USBHS */ - /* USB channel 2 configuration */ - bool chan2_pci:1; /* true: PCI USB host 2, false: USBSS */ -}; - -#endif -- cgit v1.2.3 From 1c5841e832e2d7563c31de4946118e78baf573a3 Mon Sep 17 00:00:00 2001 From: Eddie Huang Date: Tue, 28 Apr 2015 21:40:32 +0800 Subject: tty: serial: 8250: export early_serial8250_setup function 8250-like uart driver may call early_serial8250_setup to reuse 8250_early.c character output function. Signed-off-by: Eddie Huang Tested-by: Sascha Hauer Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_early.c | 2 +- include/linux/serial_8250.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/8250/8250_early.c b/drivers/tty/serial/8250/8250_early.c index 6c0fd8b9d1c3..771dda29a0f8 100644 --- a/drivers/tty/serial/8250/8250_early.c +++ b/drivers/tty/serial/8250/8250_early.c @@ -131,7 +131,7 @@ static void __init init_port(struct earlycon_device *device) serial8250_early_out(port, UART_LCR, c & ~UART_LCR_DLAB); } -static int __init early_serial8250_setup(struct earlycon_device *device, +int __init early_serial8250_setup(struct earlycon_device *device, const char *options) { if (!(device->port.membase || device->port.iobase)) diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 78097e7a330a..f0c68d88b6f4 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -137,6 +137,8 @@ extern int early_serial_setup(struct uart_port *port); extern unsigned int serial8250_early_in(struct uart_port *port, int offset); extern void serial8250_early_out(struct uart_port *port, int offset, int value); +extern int early_serial8250_setup(struct earlycon_device *device, + const char *options); extern void serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, struct ktermios *old); extern int serial8250_do_startup(struct uart_port *port); -- cgit v1.2.3 From c27ffc1080179c3f3b85e1e194fa61f1c9923b62 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 30 Apr 2015 18:21:25 +0200 Subject: serial: sh-sci: Move private definitions to private header file Move private register definitions and enums from the public header file to the driver private "sh-sci.h" header file. The common Serial Control Register definitions are left in the public header file, as they're needed to fill in plat_sci_port.scscr on legacy systems not using DT. Signed-off-by: Geert Uytterhoeven Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/sh-sci.h | 83 +++++++++++++++++++++++++++++++++++++++++---- include/linux/serial_sci.h | 67 +----------------------------------- 2 files changed, 77 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/sh-sci.h b/drivers/tty/serial/sh-sci.h index 350717404507..238ad0c5584b 100644 --- a/drivers/tty/serial/sh-sci.h +++ b/drivers/tty/serial/sh-sci.h @@ -2,6 +2,82 @@ #include #include +#define SCI_MAJOR 204 +#define SCI_MINOR_START 8 + + +/* + * SCI register subset common for all port types. + * Not all registers will exist on all parts. + */ +enum { + SCSMR, /* Serial Mode Register */ + SCBRR, /* Bit Rate Register */ + SCSCR, /* Serial Control Register */ + SCxSR, /* Serial Status Register */ + SCFCR, /* FIFO Control Register */ + SCFDR, /* FIFO Data Count Register */ + SCxTDR, /* Transmit (FIFO) Data Register */ + SCxRDR, /* Receive (FIFO) Data Register */ + SCLSR, /* Line Status Register */ + SCTFDR, /* Transmit FIFO Data Count Register */ + SCRFDR, /* Receive FIFO Data Count Register */ + SCSPTR, /* Serial Port Register */ + HSSRR, /* Sampling Rate Register */ + + SCIx_NR_REGS, +}; + + +/* SCSMR (Serial Mode Register) */ +#define SCSMR_CHR (1 << 6) /* 7-bit Character Length */ +#define SCSMR_PE (1 << 5) /* Parity Enable */ +#define SCSMR_ODD (1 << 4) /* Odd Parity */ +#define SCSMR_STOP (1 << 3) /* Stop Bit Length */ +#define SCSMR_CKS 0x0003 /* Clock Select */ + +/* Serial Control Register, SCIFA/SCIFB only bits */ +#define SCSCR_TDRQE (1 << 15) /* Tx Data Transfer Request Enable */ +#define SCSCR_RDRQE (1 << 14) /* Rx Data Transfer Request Enable */ + +/* SCxSR (Serial Status Register) on SCI */ +#define SCI_TDRE 0x80 /* Transmit Data Register Empty */ +#define SCI_RDRF 0x40 /* Receive Data Register Full */ +#define SCI_ORER 0x20 /* Overrun Error */ +#define SCI_FER 0x10 /* Framing Error */ +#define SCI_PER 0x08 /* Parity Error */ +#define SCI_TEND 0x04 /* Transmit End */ + +#define SCI_DEFAULT_ERROR_MASK (SCI_PER | SCI_FER) + +/* SCxSR (Serial Status Register) on SCIF, HSCIF */ +#define SCIF_ER 0x0080 /* Receive Error */ +#define SCIF_TEND 0x0040 /* Transmission End */ +#define SCIF_TDFE 0x0020 /* Transmit FIFO Data Empty */ +#define SCIF_BRK 0x0010 /* Break Detect */ +#define SCIF_FER 0x0008 /* Framing Error */ +#define SCIF_PER 0x0004 /* Parity Error */ +#define SCIF_RDF 0x0002 /* Receive FIFO Data Full */ +#define SCIF_DR 0x0001 /* Receive Data Ready */ + +#define SCIF_DEFAULT_ERROR_MASK (SCIF_PER | SCIF_FER | SCIF_ER | SCIF_BRK) + +/* SCFCR (FIFO Control Register) */ +#define SCFCR_MCE 0x0008 +#define SCFCR_TFRST 0x0004 +#define SCFCR_RFRST 0x0002 +#define SCFCR_LOOP (1 << 0) /* Loopback Test */ + +/* SCSPTR (Serial Port Register), optional */ +#define SCSPTR_RTSIO (1 << 7) /* Serial Port RTS Pin Input/Output */ +#define SCSPTR_CTSIO (1 << 5) /* Serial Port CTS Pin Input/Output */ +#define SCSPTR_SPB2IO (1 << 1) /* Serial Port Break Input/Output */ +#define SCSPTR_SPB2DT (1 << 0) /* Serial Port Break Data */ + +/* HSSRR HSCIF */ +#define HSCIF_SRE 0x8000 /* Sampling Rate Register Enable */ + + #define SCxSR_TEND(port) (((port)->type == PORT_SCI) ? SCI_TEND : SCIF_TEND) #define SCxSR_RDxF(port) (((port)->type == PORT_SCI) ? SCI_RDRF : SCIF_RDF) #define SCxSR_TDxE(port) (((port)->type == PORT_SCI) ? SCI_TDRE : SCIF_TDFE) @@ -28,10 +104,3 @@ # define SCxSR_BREAK_CLEAR(port) (((port)->type == PORT_SCI) ? 0xc4 : 0x00e3) #endif -/* SCFCR */ -#define SCFCR_RFRST 0x0002 -#define SCFCR_TFRST 0x0004 -#define SCFCR_MCE 0x0008 - -#define SCI_MAJOR 204 -#define SCI_MINOR_START 8 diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h index 6c5e3bb282b0..395fceb8c060 100644 --- a/include/linux/serial_sci.h +++ b/include/linux/serial_sci.h @@ -10,13 +10,6 @@ #define SCIx_NOT_SUPPORTED (-1) -/* SCSMR (Serial Mode Register) */ -#define SCSMR_CHR (1 << 6) /* 7-bit Character Length */ -#define SCSMR_PE (1 << 5) /* Parity Enable */ -#define SCSMR_ODD (1 << 4) /* Odd Parity */ -#define SCSMR_STOP (1 << 3) /* Stop Bit Length */ -#define SCSMR_CKS 0x0003 /* Clock Select */ - /* Serial Control Register (@ = not supported by all parts) */ #define SCSCR_TIE (1 << 7) /* Transmit Interrupt Enable */ #define SCSCR_RIE (1 << 6) /* Receive Interrupt Enable */ @@ -26,43 +19,7 @@ #define SCSCR_TOIE (1 << 2) /* Timeout Interrupt Enable @ */ #define SCSCR_CKE1 (1 << 1) /* Clock Enable 1 */ #define SCSCR_CKE0 (1 << 0) /* Clock Enable 0 */ -/* SCIFA/SCIFB only */ -#define SCSCR_TDRQE (1 << 15) /* Tx Data Transfer Request Enable */ -#define SCSCR_RDRQE (1 << 14) /* Rx Data Transfer Request Enable */ - -/* SCxSR (Serial Status Register) on SCI */ -#define SCI_TDRE 0x80 /* Transmit Data Register Empty */ -#define SCI_RDRF 0x40 /* Receive Data Register Full */ -#define SCI_ORER 0x20 /* Overrun Error */ -#define SCI_FER 0x10 /* Framing Error */ -#define SCI_PER 0x08 /* Parity Error */ -#define SCI_TEND 0x04 /* Transmit End */ - -#define SCI_DEFAULT_ERROR_MASK (SCI_PER | SCI_FER) - -/* SCxSR (Serial Status Register) on SCIF, HSCIF */ -#define SCIF_ER 0x0080 /* Receive Error */ -#define SCIF_TEND 0x0040 /* Transmission End */ -#define SCIF_TDFE 0x0020 /* Transmit FIFO Data Empty */ -#define SCIF_BRK 0x0010 /* Break Detect */ -#define SCIF_FER 0x0008 /* Framing Error */ -#define SCIF_PER 0x0004 /* Parity Error */ -#define SCIF_RDF 0x0002 /* Receive FIFO Data Full */ -#define SCIF_DR 0x0001 /* Receive Data Ready */ - -#define SCIF_DEFAULT_ERROR_MASK (SCIF_PER | SCIF_FER | SCIF_ER | SCIF_BRK) - -/* SCFCR (FIFO Control Register) */ -#define SCFCR_LOOP (1 << 0) /* Loopback Test */ - -/* SCSPTR (Serial Port Register), optional */ -#define SCSPTR_RTSIO (1 << 7) /* Serial Port RTS Pin Input/Output */ -#define SCSPTR_CTSIO (1 << 5) /* Serial Port CTS Pin Input/Output */ -#define SCSPTR_SPB2IO (1 << 1) /* Serial Port Break Input/Output */ -#define SCSPTR_SPB2DT (1 << 0) /* Serial Port Break Data */ - -/* HSSRR HSCIF */ -#define HSCIF_SRE 0x8000 /* Sampling Rate Register Enable */ + enum { SCIx_PROBE_REGTYPE, @@ -82,28 +39,6 @@ enum { SCIx_NR_REGTYPES, }; -/* - * SCI register subset common for all port types. - * Not all registers will exist on all parts. - */ -enum { - SCSMR, /* Serial Mode Register */ - SCBRR, /* Bit Rate Register */ - SCSCR, /* Serial Control Register */ - SCxSR, /* Serial Status Register */ - SCFCR, /* FIFO Control Register */ - SCFDR, /* FIFO Data Count Register */ - SCxTDR, /* Transmit (FIFO) Data Register */ - SCxRDR, /* Receive (FIFO) Data Register */ - SCLSR, /* Line Status Register */ - SCTFDR, /* Transmit FIFO Data Count Register */ - SCRFDR, /* Receive FIFO Data Count Register */ - SCSPTR, /* Serial Port Register */ - HSSRR, /* Sampling Rate Register */ - - SCIx_NR_REGS, -}; - struct device; struct plat_sci_port_ops { -- cgit v1.2.3 From d94a0a3857987c76c37a8095977fe554799ab69d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 30 Apr 2015 18:21:29 +0200 Subject: serial: sh-sci: Standardize on using the BIT() macro to define register bits Signed-off-by: Geert Uytterhoeven Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/sh-sci.h | 73 +++++++++++++++++++++++---------------------- include/linux/serial_sci.h | 19 ++++++------ 2 files changed, 47 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/sh-sci.h b/drivers/tty/serial/sh-sci.h index 1586d68470b6..5282738375ae 100644 --- a/drivers/tty/serial/sh-sci.h +++ b/drivers/tty/serial/sh-sci.h @@ -1,3 +1,4 @@ +#include #include #include #include @@ -32,62 +33,62 @@ enum { /* SCSMR (Serial Mode Register) */ -#define SCSMR_CHR (1 << 6) /* 7-bit Character Length */ -#define SCSMR_PE (1 << 5) /* Parity Enable */ -#define SCSMR_ODD (1 << 4) /* Odd Parity */ -#define SCSMR_STOP (1 << 3) /* Stop Bit Length */ -#define SCSMR_CKS 0x0003 /* Clock Select */ +#define SCSMR_CHR BIT(6) /* 7-bit Character Length */ +#define SCSMR_PE BIT(5) /* Parity Enable */ +#define SCSMR_ODD BIT(4) /* Odd Parity */ +#define SCSMR_STOP BIT(3) /* Stop Bit Length */ +#define SCSMR_CKS 0x0003 /* Clock Select */ /* Serial Control Register, SCIFA/SCIFB only bits */ -#define SCSCR_TDRQE (1 << 15) /* Tx Data Transfer Request Enable */ -#define SCSCR_RDRQE (1 << 14) /* Rx Data Transfer Request Enable */ +#define SCSCR_TDRQE BIT(15) /* Tx Data Transfer Request Enable */ +#define SCSCR_RDRQE BIT(14) /* Rx Data Transfer Request Enable */ /* SCxSR (Serial Status Register) on SCI */ -#define SCI_TDRE 0x80 /* Transmit Data Register Empty */ -#define SCI_RDRF 0x40 /* Receive Data Register Full */ -#define SCI_ORER 0x20 /* Overrun Error */ -#define SCI_FER 0x10 /* Framing Error */ -#define SCI_PER 0x08 /* Parity Error */ -#define SCI_TEND 0x04 /* Transmit End */ +#define SCI_TDRE BIT(7) /* Transmit Data Register Empty */ +#define SCI_RDRF BIT(6) /* Receive Data Register Full */ +#define SCI_ORER BIT(5) /* Overrun Error */ +#define SCI_FER BIT(4) /* Framing Error */ +#define SCI_PER BIT(3) /* Parity Error */ +#define SCI_TEND BIT(2) /* Transmit End */ #define SCI_DEFAULT_ERROR_MASK (SCI_PER | SCI_FER) /* SCxSR (Serial Status Register) on SCIF, HSCIF */ -#define SCIF_ER 0x0080 /* Receive Error */ -#define SCIF_TEND 0x0040 /* Transmission End */ -#define SCIF_TDFE 0x0020 /* Transmit FIFO Data Empty */ -#define SCIF_BRK 0x0010 /* Break Detect */ -#define SCIF_FER 0x0008 /* Framing Error */ -#define SCIF_PER 0x0004 /* Parity Error */ -#define SCIF_RDF 0x0002 /* Receive FIFO Data Full */ -#define SCIF_DR 0x0001 /* Receive Data Ready */ +#define SCIF_ER BIT(7) /* Receive Error */ +#define SCIF_TEND BIT(6) /* Transmission End */ +#define SCIF_TDFE BIT(5) /* Transmit FIFO Data Empty */ +#define SCIF_BRK BIT(4) /* Break Detect */ +#define SCIF_FER BIT(3) /* Framing Error */ +#define SCIF_PER BIT(2) /* Parity Error */ +#define SCIF_RDF BIT(1) /* Receive FIFO Data Full */ +#define SCIF_DR BIT(0) /* Receive Data Ready */ #define SCIF_DEFAULT_ERROR_MASK (SCIF_PER | SCIF_FER | SCIF_ER | SCIF_BRK) /* SCFCR (FIFO Control Register) */ -#define SCFCR_MCE 0x0008 /* Modem Control Enable */ -#define SCFCR_TFRST 0x0004 /* Transmit FIFO Data Register Reset */ -#define SCFCR_RFRST 0x0002 /* Receive FIFO Data Register Reset */ -#define SCFCR_LOOP (1 << 0) /* Loopback Test */ +#define SCFCR_MCE BIT(3) /* Modem Control Enable */ +#define SCFCR_TFRST BIT(2) /* Transmit FIFO Data Register Reset */ +#define SCFCR_RFRST BIT(1) /* Receive FIFO Data Register Reset */ +#define SCFCR_LOOP BIT(0) /* Loopback Test */ /* SCSPTR (Serial Port Register), optional */ -#define SCSPTR_RTSIO (1 << 7) /* Serial Port RTS Pin Input/Output */ -#define SCSPTR_RTSDT (1 << 6) /* Serial Port RTS Pin Data */ -#define SCSPTR_CTSIO (1 << 5) /* Serial Port CTS Pin Input/Output */ -#define SCSPTR_CTSDT (1 << 4) /* Serial Port CTS Pin Data */ -#define SCSPTR_SPB2IO (1 << 1) /* Serial Port Break Input/Output */ -#define SCSPTR_SPB2DT (1 << 0) /* Serial Port Break Data */ +#define SCSPTR_RTSIO BIT(7) /* Serial Port RTS Pin Input/Output */ +#define SCSPTR_RTSDT BIT(6) /* Serial Port RTS Pin Data */ +#define SCSPTR_CTSIO BIT(5) /* Serial Port CTS Pin Input/Output */ +#define SCSPTR_CTSDT BIT(4) /* Serial Port CTS Pin Data */ +#define SCSPTR_SPB2IO BIT(1) /* Serial Port Break Input/Output */ +#define SCSPTR_SPB2DT BIT(0) /* Serial Port Break Data */ /* HSSRR HSCIF */ -#define HSCIF_SRE 0x8000 /* Sampling Rate Register Enable */ +#define HSCIF_SRE BIT(15) /* Sampling Rate Register Enable */ /* SCPCR (Serial Port Control Register), SCIFA/SCIFB only */ -#define SCPCR_RTSC (1 << 4) /* Serial Port RTS Pin / Output Pin */ -#define SCPCR_CTSC (1 << 3) /* Serial Port CTS Pin / Input Pin */ +#define SCPCR_RTSC BIT(4) /* Serial Port RTS Pin / Output Pin */ +#define SCPCR_CTSC BIT(3) /* Serial Port CTS Pin / Input Pin */ /* SCPDR (Serial Port Data Register), SCIFA/SCIFB only */ -#define SCPDR_RTSD (1 << 4) /* Serial Port RTS Output Pin Data */ -#define SCPDR_CTSD (1 << 3) /* Serial Port CTS Input Pin Data */ +#define SCPDR_RTSD BIT(4) /* Serial Port RTS Output Pin Data */ +#define SCPDR_CTSD BIT(3) /* Serial Port CTS Input Pin Data */ #define SCxSR_TEND(port) (((port)->type == PORT_SCI) ? SCI_TEND : SCIF_TEND) diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h index 395fceb8c060..7c536ac5be05 100644 --- a/include/linux/serial_sci.h +++ b/include/linux/serial_sci.h @@ -1,6 +1,7 @@ #ifndef __LINUX_SERIAL_SCI_H #define __LINUX_SERIAL_SCI_H +#include #include #include @@ -11,14 +12,14 @@ #define SCIx_NOT_SUPPORTED (-1) /* Serial Control Register (@ = not supported by all parts) */ -#define SCSCR_TIE (1 << 7) /* Transmit Interrupt Enable */ -#define SCSCR_RIE (1 << 6) /* Receive Interrupt Enable */ -#define SCSCR_TE (1 << 5) /* Transmit Enable */ -#define SCSCR_RE (1 << 4) /* Receive Enable */ -#define SCSCR_REIE (1 << 3) /* Receive Error Interrupt Enable @ */ -#define SCSCR_TOIE (1 << 2) /* Timeout Interrupt Enable @ */ -#define SCSCR_CKE1 (1 << 1) /* Clock Enable 1 */ -#define SCSCR_CKE0 (1 << 0) /* Clock Enable 0 */ +#define SCSCR_TIE BIT(7) /* Transmit Interrupt Enable */ +#define SCSCR_RIE BIT(6) /* Receive Interrupt Enable */ +#define SCSCR_TE BIT(5) /* Transmit Enable */ +#define SCSCR_RE BIT(4) /* Receive Enable */ +#define SCSCR_REIE BIT(3) /* Receive Error Interrupt Enable @ */ +#define SCSCR_TOIE BIT(2) /* Timeout Interrupt Enable @ */ +#define SCSCR_CKE1 BIT(1) /* Clock Enable 1 */ +#define SCSCR_CKE0 BIT(0) /* Clock Enable 0 */ enum { @@ -48,7 +49,7 @@ struct plat_sci_port_ops { /* * Port-specific capabilities */ -#define SCIx_HAVE_RTSCTS (1 << 0) +#define SCIx_HAVE_RTSCTS BIT(0) /* * Platform device specific platform_data struct -- cgit v1.2.3 From bd63364caa8df38bad2b25b11b2a1b849475cce5 Mon Sep 17 00:00:00 2001 From: Scot Doyle Date: Thu, 26 Mar 2015 13:54:39 +0000 Subject: vt: add cursor blink interval escape sequence Add an escape sequence to specify the current console's cursor blink interval. The interval is specified as a number of milliseconds until the next cursor display state toggle, from 50 to 65535. /proc/loadavg did not show a difference with a one msec interval, but the lower bound is set to 50 msecs since slower hardware wasn't tested. Store the interval in the vc_data structure for later access by fbcon, initializing the value to fbcon's current hardcoded value of 200 msecs. Signed-off-by: Scot Doyle Acked-by: Pavel Machek Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt.c | 9 +++++++++ include/linux/console_struct.h | 1 + 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 4a24eb2b0ede..b075489f314e 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -135,6 +135,7 @@ const struct consw *conswitchp; */ #define DEFAULT_BELL_PITCH 750 #define DEFAULT_BELL_DURATION (HZ/8) +#define DEFAULT_CURSOR_BLINK_MS 200 struct vc vc_cons [MAX_NR_CONSOLES]; @@ -1590,6 +1591,13 @@ static void setterm_command(struct vc_data *vc) case 15: /* activate the previous console */ set_console(last_console); break; + case 16: /* set cursor blink duration in msec */ + if (vc->vc_npar >= 1 && vc->vc_par[1] >= 50 && + vc->vc_par[1] <= USHRT_MAX) + vc->vc_cur_blink_ms = vc->vc_par[1]; + else + vc->vc_cur_blink_ms = DEFAULT_CURSOR_BLINK_MS; + break; } } @@ -1717,6 +1725,7 @@ static void reset_terminal(struct vc_data *vc, int do_clear) vc->vc_bell_pitch = DEFAULT_BELL_PITCH; vc->vc_bell_duration = DEFAULT_BELL_DURATION; + vc->vc_cur_blink_ms = DEFAULT_CURSOR_BLINK_MS; gotoxy(vc, 0, 0); save_cur(vc); diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index e859c98d1767..e329ee2667e1 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -104,6 +104,7 @@ struct vc_data { unsigned int vc_resize_user; /* resize request from user */ unsigned int vc_bell_pitch; /* Console bell pitch */ unsigned int vc_bell_duration; /* Console bell duration */ + unsigned short vc_cur_blink_ms; /* Cursor blink duration */ struct vc_data **vc_display_fg; /* [!] Ptr to var holding fg console for this display */ struct uni_pagedir *vc_uni_pagedir; struct uni_pagedir **vc_uni_pagedir_loc; /* [!] Location of uni_pagedir variable for this console */ -- cgit v1.2.3 From faaa44955dedc661f083636d816af90975a359ee Mon Sep 17 00:00:00 2001 From: Irina Tirdea Date: Wed, 29 Apr 2015 21:16:39 +0300 Subject: iio: core: Introduce IIO_CHAN_INFO_OVERSAMPLING_RATIO Some magnetometers can perform a number of repetitions in HW for each measurement to increase accuracy. One example is Bosch BMC150: http://ae-bst.resource.bosch.com/media/products/dokumente/bmc150/BST-BMC150-DS000-04.pdf. Introduce an interface to set the oversampling ratio for these devices. Signed-off-by: Irina Tirdea Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 12 ++++++++++++ drivers/iio/industrialio-core.c | 1 + include/linux/iio/iio.h | 1 + 3 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index 866b4ec4aab6..e46c71fbd047 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -1375,3 +1375,15 @@ Description: The emissivity ratio of the surface in the field of view of the contactless temperature sensor. Emissivity varies from 0 to 1, with 1 being the emissivity of a black body. + +What: /sys/bus/iio/devices/iio:deviceX/in_magn_x_oversampling_ratio +What: /sys/bus/iio/devices/iio:deviceX/in_magn_y_oversampling_ratio +What: /sys/bus/iio/devices/iio:deviceX/in_magn_z_oversampling_ratio +KernelVersion: 4.2 +Contact: linux-iio@vger.kernel.org +Description: + Hardware applied number of measurements for acquiring one + data point. The HW will do [_name]_oversampling_ratio + measurements and return the average value as output data. Each + value resulted from [_name]_oversampling_ratio measurements + is considered as one sample for [_name]_sampling_frequency. diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 7c98bc1504e6..dfa81db3b910 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -129,6 +129,7 @@ static const char * const iio_chan_info_postfix[] = { [IIO_CHAN_INFO_DEBOUNCE_COUNT] = "debounce_count", [IIO_CHAN_INFO_DEBOUNCE_TIME] = "debounce_time", [IIO_CHAN_INFO_CALIBEMISSIVITY] = "calibemissivity", + [IIO_CHAN_INFO_OVERSAMPLING_RATIO] = "oversampling_ratio", }; /** diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index b1e46ae89aa7..058441da4984 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -44,6 +44,7 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_DEBOUNCE_COUNT, IIO_CHAN_INFO_DEBOUNCE_TIME, IIO_CHAN_INFO_CALIBEMISSIVITY, + IIO_CHAN_INFO_OVERSAMPLING_RATIO, }; enum iio_shared_by { -- cgit v1.2.3 From 61ba64fc0768879a300599b011c176203bdf27d9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 May 2015 09:54:06 -0400 Subject: libfs: simple_follow_link() let "fast" symlinks store the pointer to the body into ->i_link and use simple_follow_link for ->follow_link() Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/inode.c | 1 + fs/libfs.c | 13 +++++++++++++ include/linux/fs.h | 3 +++ 3 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/fs/inode.c b/fs/inode.c index ea37cd17b53f..952fb4852e38 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -152,6 +152,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_pipe = NULL; inode->i_bdev = NULL; inode->i_cdev = NULL; + inode->i_link = NULL; inode->i_rdev = 0; inode->dirtied_when = 0; diff --git a/fs/libfs.c b/fs/libfs.c index cb1fb4b9b637..72e4e015455f 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1093,3 +1093,16 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp, return -EINVAL; } EXPORT_SYMBOL(simple_nosetlease); + +void *simple_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, d_inode(dentry)->i_link); + return NULL; +} +EXPORT_SYMBOL(simple_follow_link); + +const struct inode_operations simple_symlink_inode_operations = { + .follow_link = simple_follow_link, + .readlink = generic_readlink +}; +EXPORT_SYMBOL(simple_symlink_inode_operations); diff --git a/include/linux/fs.h b/include/linux/fs.h index 35ec87e490b1..0ac758fcff00 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -656,6 +656,7 @@ struct inode { struct pipe_inode_info *i_pipe; struct block_device *i_bdev; struct cdev *i_cdev; + char *i_link; }; __u32 i_generation; @@ -2721,6 +2722,8 @@ void __inode_sub_bytes(struct inode *inode, loff_t bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes); loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); +void *simple_follow_link(struct dentry *, struct nameidata *); +extern const struct inode_operations simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); -- cgit v1.2.3 From 5723cb01f0295ace2b029b0737dd6525a2de337f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 May 2015 10:27:18 -0400 Subject: debugfs: switch to simple_follow_link() Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/debugfs/file.c | 12 ------------ fs/debugfs/inode.c | 6 +++--- include/linux/debugfs.h | 1 - 3 files changed, 3 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 830a7e76f5c6..284f9aa0028b 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -43,17 +42,6 @@ const struct file_operations debugfs_file_operations = { .llseek = noop_llseek, }; -static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - nd_set_link(nd, d_inode(dentry)->i_private); - return NULL; -} - -const struct inode_operations debugfs_link_operations = { - .readlink = generic_readlink, - .follow_link = debugfs_follow_link, -}; - static int debugfs_u8_set(void *data, u64 val) { *(u8 *)data = val; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index c1e7ffb0dab6..7eaec88ea970 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -174,7 +174,7 @@ static void debugfs_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (S_ISLNK(inode->i_mode)) - kfree(inode->i_private); + kfree(inode->i_link); } static const struct super_operations debugfs_super_operations = { @@ -511,8 +511,8 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, return failed_creating(dentry); } inode->i_mode = S_IFLNK | S_IRWXUGO; - inode->i_op = &debugfs_link_operations; - inode->i_private = link; + inode->i_op = &simple_symlink_inode_operations; + inode->i_link = link; d_instantiate(dentry, inode); return end_creating(dentry); } diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index cb25af461054..420311bcee38 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -45,7 +45,6 @@ extern struct dentry *arch_debugfs_dir; /* declared over in file.c */ extern const struct file_operations debugfs_file_operations; -extern const struct inode_operations debugfs_link_operations; struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, -- cgit v1.2.3 From 37882db0546c759ff75b561c188539ac96fd0bfe Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 23 Mar 2015 13:37:39 +1100 Subject: SECURITY: remove nameidata arg from inode_follow_link. No ->inode_follow_link() methods use the nameidata arg, and it is about to become private to namei.c. So remove from all inode_follow_link() functions. Signed-off-by: NeilBrown Signed-off-by: Al Viro --- fs/namei.c | 2 +- include/linux/security.h | 9 +++------ security/capability.c | 3 +-- security/security.c | 4 ++-- security/selinux/hooks.c | 2 +- 5 files changed, 8 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index fe30d3be43a8..7f20b40426dc 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -871,7 +871,7 @@ follow_link(struct path *link, struct nameidata *nd, void **p) touch_atime(link); nd_set_link(nd, NULL); - error = security_inode_follow_link(link->dentry, nd); + error = security_inode_follow_link(dentry); if (error) goto out_put_nd_path; diff --git a/include/linux/security.h b/include/linux/security.h index 18264ea9e314..62a66202ecf1 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -43,7 +43,6 @@ struct file; struct vfsmount; struct path; struct qstr; -struct nameidata; struct iattr; struct fown_struct; struct file_operations; @@ -477,7 +476,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @inode_follow_link: * Check permission to follow a symbolic link when looking up a pathname. * @dentry contains the dentry structure for the link. - * @nd contains the nameidata structure for the parent directory. * Return 0 if permission is granted. * @inode_permission: * Check permission before accessing an inode. This hook is called by the @@ -1553,7 +1551,7 @@ struct security_operations { int (*inode_rename) (struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); int (*inode_readlink) (struct dentry *dentry); - int (*inode_follow_link) (struct dentry *dentry, struct nameidata *nd); + int (*inode_follow_link) (struct dentry *dentry); int (*inode_permission) (struct inode *inode, int mask); int (*inode_setattr) (struct dentry *dentry, struct iattr *attr); int (*inode_getattr) (const struct path *path); @@ -1839,7 +1837,7 @@ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); int security_inode_readlink(struct dentry *dentry); -int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd); +int security_inode_follow_link(struct dentry *dentry); int security_inode_permission(struct inode *inode, int mask); int security_inode_setattr(struct dentry *dentry, struct iattr *attr); int security_inode_getattr(const struct path *path); @@ -2241,8 +2239,7 @@ static inline int security_inode_readlink(struct dentry *dentry) return 0; } -static inline int security_inode_follow_link(struct dentry *dentry, - struct nameidata *nd) +static inline int security_inode_follow_link(struct dentry *dentry) { return 0; } diff --git a/security/capability.c b/security/capability.c index 0d03fcc489a4..fae99db180ca 100644 --- a/security/capability.c +++ b/security/capability.c @@ -209,8 +209,7 @@ static int cap_inode_readlink(struct dentry *dentry) return 0; } -static int cap_inode_follow_link(struct dentry *dentry, - struct nameidata *nameidata) +static int cap_inode_follow_link(struct dentry *dentry) { return 0; } diff --git a/security/security.c b/security/security.c index 8e9b1f4b9b45..d7c30b03e144 100644 --- a/security/security.c +++ b/security/security.c @@ -581,11 +581,11 @@ int security_inode_readlink(struct dentry *dentry) return security_ops->inode_readlink(dentry); } -int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd) +int security_inode_follow_link(struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; - return security_ops->inode_follow_link(dentry, nd); + return security_ops->inode_follow_link(dentry); } int security_inode_permission(struct inode *inode, int mask) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 7dade28affba..801622947282 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2861,7 +2861,7 @@ static int selinux_inode_readlink(struct dentry *dentry) return dentry_has_perm(cred, dentry, FILE__READ); } -static int selinux_inode_follow_link(struct dentry *dentry, struct nameidata *nameidata) +static int selinux_inode_follow_link(struct dentry *dentry) { const struct cred *cred = current_cred(); -- cgit v1.2.3 From 680baacbca69d18a6d7315374ad83d05ac9c0977 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 May 2015 13:32:22 -0400 Subject: new ->follow_link() and ->put_link() calling conventions a) instead of storing the symlink body (via nd_set_link()) and returning an opaque pointer later passed to ->put_link(), ->follow_link() _stores_ that opaque pointer (into void * passed by address by caller) and returns the symlink body. Returning ERR_PTR() on error, NULL on jump (procfs magic symlinks) and pointer to symlink body for normal symlinks. Stored pointer is ignored in all cases except the last one. Storing NULL for opaque pointer (or not storing it at all) means no call of ->put_link(). b) the body used to be passed to ->put_link() implicitly (via nameidata). Now only the opaque pointer is. In the cases when we used the symlink body to free stuff, ->follow_link() now should store it as opaque pointer in addition to returning it. Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 4 +- Documentation/filesystems/vfs.txt | 4 +- drivers/staging/lustre/lustre/llite/symlink.c | 11 ++--- fs/9p/vfs_inode.c | 13 +++--- fs/9p/vfs_inode_dotl.c | 7 ++- fs/autofs4/symlink.c | 5 +- fs/befs/linuxvfs.c | 35 +++++++------- fs/cifs/cifsfs.h | 2 +- fs/cifs/link.c | 28 ++++++------ fs/configfs/symlink.c | 28 +++++------- fs/ecryptfs/inode.c | 8 ++-- fs/ext4/symlink.c | 9 ++-- fs/f2fs/namei.c | 18 +++----- fs/fuse/dir.c | 19 ++------ fs/gfs2/inode.c | 10 ++-- fs/hostfs/hostfs_kern.c | 15 +++--- fs/hppfs/hppfs.c | 9 ++-- fs/kernfs/symlink.c | 22 ++++----- fs/libfs.c | 12 ++--- fs/namei.c | 66 +++++++++------------------ fs/nfs/symlink.c | 19 +++----- fs/overlayfs/inode.c | 18 ++++---- fs/proc/base.c | 2 +- fs/proc/inode.c | 9 ++-- fs/proc/namespaces.c | 2 +- fs/proc/self.c | 24 +++++----- fs/proc/thread_self.c | 22 ++++----- fs/xfs/xfs_iops.c | 10 ++-- include/linux/fs.h | 12 ++--- include/linux/namei.h | 2 - mm/shmem.c | 23 +++++----- 31 files changed, 195 insertions(+), 273 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 0a926e2ba3ab..7fa6c4ac858c 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -50,8 +50,8 @@ prototypes: int (*rename2) (struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*readlink) (struct dentry *, char __user *,int); - void * (*follow_link) (struct dentry *, struct nameidata *); - void (*put_link) (struct dentry *, struct nameidata *, void *); + const char *(*follow_link) (struct dentry *, void **, struct nameidata *); + void (*put_link) (struct dentry *, void *); void (*truncate) (struct inode *); int (*permission) (struct inode *, int, unsigned int); int (*get_acl)(struct inode *, int); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 5d833b32bbcd..1c6b03ac2e5a 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -350,8 +350,8 @@ struct inode_operations { int (*rename2) (struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*readlink) (struct dentry *, char __user *,int); - void * (*follow_link) (struct dentry *, struct nameidata *); - void (*put_link) (struct dentry *, struct nameidata *, void *); + const char *(*follow_link) (struct dentry *, void **, struct nameidata *); + void (*put_link) (struct dentry *, void *); int (*permission) (struct inode *, int); int (*get_acl)(struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c index 3711e671a4df..e488cb3bb25d 100644 --- a/drivers/staging/lustre/lustre/llite/symlink.c +++ b/drivers/staging/lustre/lustre/llite/symlink.c @@ -118,7 +118,7 @@ failed: return rc; } -static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *ll_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { struct inode *inode = d_inode(dentry); struct ptlrpc_request *request = NULL; @@ -140,18 +140,17 @@ static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd) } if (rc) { ptlrpc_req_finished(request); - request = NULL; - symname = ERR_PTR(rc); + return ERR_PTR(rc); } - nd_set_link(nd, symname); /* symname may contain a pointer to the request message buffer, * we delay request releasing until ll_put_link then. */ - return request; + *cookie = request; + return symname; } -static void ll_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +static void ll_put_link(struct dentry *dentry, void *cookie) { ptlrpc_req_finished(cookie); } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 0ba11712b388..7cc70a39a1d8 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1230,11 +1230,12 @@ ino_t v9fs_qid2ino(struct p9_qid *qid) * */ -static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry); struct p9_fid *fid = v9fs_fid_lookup(dentry); struct p9_wstat *st; + char *res; p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); @@ -1253,14 +1254,14 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) kfree(st); return ERR_PTR(-EINVAL); } - if (strlen(st->extension) >= PATH_MAX) - st->extension[PATH_MAX - 1] = '\0'; - - nd_set_link(nd, st->extension); + res = st->extension; st->extension = NULL; + if (strlen(res) >= PATH_MAX) + res[PATH_MAX - 1] = '\0'; + p9stat_free(st); kfree(st); - return NULL; + return *cookie = res; } /** diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index bc2a91f2b910..ae062ffa0f1f 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -909,8 +909,8 @@ error: * */ -static void * -v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd) +static const char * +v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie, struct nameidata *nd) { struct p9_fid *fid = v9fs_fid_lookup(dentry); char *target; @@ -923,8 +923,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd) retval = p9_client_readlink(fid, &target); if (retval) return ERR_PTR(retval); - nd_set_link(nd, target); - return NULL; + return *cookie = target; } int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index de58cc7b8076..9c6a07739c9b 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -12,14 +12,13 @@ #include "autofs_i.h" -static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *autofs4_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); if (ino && !autofs4_oz_mode(sbi)) ino->last_used = jiffies; - nd_set_link(nd, d_inode(dentry)->i_private); - return NULL; + return d_inode(dentry)->i_private; } const struct inode_operations autofs4_symlink_inode_operations = { diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 172e306d68a7..3a1aefb86a11 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -42,7 +42,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long); static struct inode *befs_alloc_inode(struct super_block *sb); static void befs_destroy_inode(struct inode *inode); static void befs_destroy_inodecache(void); -static void *befs_follow_link(struct dentry *, struct nameidata *); +static const char *befs_follow_link(struct dentry *, void **, struct nameidata *nd); static int befs_utf2nls(struct super_block *sb, const char *in, int in_len, char **out, int *out_len); static int befs_nls2utf(struct super_block *sb, const char *in, int in_len, @@ -463,8 +463,8 @@ befs_destroy_inodecache(void) * The data stream become link name. Unless the LONG_SYMLINK * flag is set. */ -static void * -befs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char * +befs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { struct super_block *sb = dentry->d_sb; struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry)); @@ -474,23 +474,20 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd) if (len == 0) { befs_error(sb, "Long symlink with illegal length"); - link = ERR_PTR(-EIO); - } else { - befs_debug(sb, "Follow long symlink"); - - link = kmalloc(len, GFP_NOFS); - if (!link) { - link = ERR_PTR(-ENOMEM); - } else if (befs_read_lsymlink(sb, data, link, len) != len) { - kfree(link); - befs_error(sb, "Failed to read entire long symlink"); - link = ERR_PTR(-EIO); - } else { - link[len - 1] = '\0'; - } + return ERR_PTR(-EIO); } - nd_set_link(nd, link); - return NULL; + befs_debug(sb, "Follow long symlink"); + + link = kmalloc(len, GFP_NOFS); + if (!link) + return ERR_PTR(-ENOMEM); + if (befs_read_lsymlink(sb, data, link, len) != len) { + kfree(link); + befs_error(sb, "Failed to read entire long symlink"); + return ERR_PTR(-EIO); + } + link[len - 1] = '\0'; + return *cookie = link; } /* diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 252f5c15806b..61012da7e9d8 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -120,7 +120,7 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path); #endif /* Functions related to symlinks */ -extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd); +extern const char *cifs_follow_link(struct dentry *direntry, void **cookie, struct nameidata *nd); extern int cifs_readlink(struct dentry *direntry, char __user *buffer, int buflen); extern int cifs_symlink(struct inode *inode, struct dentry *direntry, diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 252e672d5604..4a439c2c0c7f 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -626,8 +626,8 @@ cifs_hl_exit: return rc; } -void * -cifs_follow_link(struct dentry *direntry, struct nameidata *nd) +const char * +cifs_follow_link(struct dentry *direntry, void **cookie, struct nameidata *nd) { struct inode *inode = d_inode(direntry); int rc = -ENOMEM; @@ -643,16 +643,18 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { - rc = PTR_ERR(tlink); - tlink = NULL; - goto out; + free_xid(xid); + return ERR_CAST(tlink); } tcon = tlink_tcon(tlink); server = tcon->ses->server; full_path = build_path_from_dentry(direntry); - if (!full_path) - goto out; + if (!full_path) { + free_xid(xid); + cifs_put_tlink(tlink); + return ERR_PTR(-ENOMEM); + } cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode); @@ -670,17 +672,13 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) &target_path, cifs_sb); kfree(full_path); -out: + free_xid(xid); + cifs_put_tlink(tlink); if (rc != 0) { kfree(target_path); - target_path = ERR_PTR(rc); + return ERR_PTR(rc); } - - free_xid(xid); - if (tlink) - cifs_put_tlink(tlink); - nd_set_link(nd, target_path); - return NULL; + return *cookie = target_path; } int diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index cc9f2546ea4a..fac8e8517f33 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -279,30 +279,26 @@ static int configfs_getlink(struct dentry *dentry, char * path) } -static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *configfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { - int error = -ENOMEM; unsigned long page = get_zeroed_page(GFP_KERNEL); + int error; - if (page) { - error = configfs_getlink(dentry, (char *)page); - if (!error) { - nd_set_link(nd, (char *)page); - return (void *)page; - } + if (!page) + return ERR_PTR(-ENOMEM); + + error = configfs_getlink(dentry, (char *)page); + if (!error) { + return *cookie = (void *)page; } - nd_set_link(nd, ERR_PTR(error)); - return NULL; + free_page(page); + return ERR_PTR(error); } -static void configfs_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) +static void configfs_put_link(struct dentry *dentry, void *cookie) { - if (cookie) { - unsigned long page = (unsigned long)cookie; - free_page(page); - } + free_page((unsigned long)cookie); } const struct inode_operations configfs_symlink_inode_operations = { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index fc850b55db67..cdb9d6c4532d 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -675,18 +675,16 @@ out: return rc ? ERR_PTR(rc) : buf; } -static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { size_t len; char *buf = ecryptfs_readlink_lower(dentry, &len); if (IS_ERR(buf)) - goto out; + return buf; fsstack_copy_attr_atime(d_inode(dentry), d_inode(ecryptfs_dentry_to_lower(dentry))); buf[len] = '\0'; -out: - nd_set_link(nd, buf); - return NULL; + return *cookie = buf; } /** diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 4264fb1e341a..afec475aaf5c 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -23,7 +23,7 @@ #include "xattr.h" #ifdef CONFIG_EXT4_FS_ENCRYPTION -static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *ext4_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { struct page *cpage = NULL; char *caddr, *paddr = NULL; @@ -37,7 +37,7 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) ctx = ext4_get_fname_crypto_ctx(inode, inode->i_sb->s_blocksize); if (IS_ERR(ctx)) - return ctx; + return ERR_CAST(ctx); if (ext4_inode_is_fast_symlink(inode)) { caddr = (char *) EXT4_I(inode)->i_data; @@ -46,7 +46,7 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) cpage = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(cpage)) { ext4_put_fname_crypto_ctx(&ctx); - return cpage; + return ERR_CAST(cpage); } caddr = kmap(cpage); caddr[size] = 0; @@ -77,13 +77,12 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) /* Null-terminate the name */ if (res <= plen) paddr[res] = '\0'; - nd_set_link(nd, paddr); ext4_put_fname_crypto_ctx(&ctx); if (cpage) { kunmap(cpage); page_cache_release(cpage); } - return NULL; + return *cookie = paddr; errout: ext4_put_fname_crypto_ctx(&ctx); if (cpage) { diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 658e8079aaf9..d2947937515e 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -296,19 +296,15 @@ fail: return err; } -static void *f2fs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *f2fs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { - struct page *page = page_follow_link_light(dentry, nd); - - if (IS_ERR_OR_NULL(page)) - return page; - - /* this is broken symlink case */ - if (*nd_get_link(nd) == 0) { - page_put_link(dentry, nd, page); - return ERR_PTR(-ENOENT); + const char *link = page_follow_link_light(dentry, cookie, nd); + if (!IS_ERR(link) && !*link) { + /* this is broken symlink case */ + page_put_link(dentry, *cookie); + link = ERR_PTR(-ENOENT); } - return page; + return link; } static int f2fs_symlink(struct inode *dir, struct dentry *dentry, diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 0572bca49f15..f9cb260375cf 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1365,7 +1365,7 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx) return err; } -static char *read_link(struct dentry *dentry) +static const char *fuse_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { struct inode *inode = d_inode(dentry); struct fuse_conn *fc = get_fuse_conn(inode); @@ -1389,26 +1389,15 @@ static char *read_link(struct dentry *dentry) link = ERR_PTR(ret); } else { link[ret] = '\0'; + *cookie = link; } fuse_invalidate_atime(inode); return link; } -static void free_link(char *link) +static void fuse_put_link(struct dentry *dentry, void *cookie) { - if (!IS_ERR(link)) - free_page((unsigned long) link); -} - -static void *fuse_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - nd_set_link(nd, read_link(dentry)); - return NULL; -} - -static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c) -{ - free_link(nd_get_link(nd)); + free_page((unsigned long) cookie); } static int fuse_dir_open(struct inode *inode, struct file *file) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 1b3ca7a2e3fc..f59390aebffb 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1548,7 +1548,7 @@ out: * Returns: 0 on success or error code */ -static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *gfs2_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); struct gfs2_holder i_gh; @@ -1561,8 +1561,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) error = gfs2_glock_nq(&i_gh); if (error) { gfs2_holder_uninit(&i_gh); - nd_set_link(nd, ERR_PTR(error)); - return NULL; + return ERR_PTR(error); } size = (unsigned int)i_size_read(&ip->i_inode); @@ -1586,8 +1585,9 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) brelse(dibh); out: gfs2_glock_dq_uninit(&i_gh); - nd_set_link(nd, buf); - return NULL; + if (!IS_ERR(buf)) + *cookie = buf; + return buf; } /** diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index ef263174acd2..f650ed661fab 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -892,7 +892,7 @@ static const struct inode_operations hostfs_dir_iops = { .setattr = hostfs_setattr, }; -static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *hostfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { char *link = __getname(); if (link) { @@ -906,21 +906,18 @@ static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd) } if (err < 0) { __putname(link); - link = ERR_PTR(err); + return ERR_PTR(err); } } else { - link = ERR_PTR(-ENOMEM); + return ERR_PTR(-ENOMEM); } - nd_set_link(nd, link); - return NULL; + return *cookie = link; } -static void hostfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +static void hostfs_put_link(struct dentry *dentry, void *cookie) { - char *s = nd_get_link(nd); - if (!IS_ERR(s)) - __putname(s); + __putname(cookie); } static const struct inode_operations hostfs_link_iops = { diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index fa2bd5366ecf..b8f24d3b04ee 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -642,20 +642,19 @@ static int hppfs_readlink(struct dentry *dentry, char __user *buffer, buflen); } -static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *hppfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry; - return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, nd); + return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, cookie, nd); } -static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) +static void hppfs_put_link(struct dentry *dentry, void *cookie) { struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry; if (d_inode(proc_dentry)->i_op->put_link) - d_inode(proc_dentry)->i_op->put_link(proc_dentry, nd, cookie); + d_inode(proc_dentry)->i_op->put_link(proc_dentry, cookie); } static const struct inode_operations hppfs_dir_iops = { diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 8a198898e39a..3c7e799974a2 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -112,25 +112,23 @@ static int kernfs_getlink(struct dentry *dentry, char *path) return error; } -static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { int error = -ENOMEM; unsigned long page = get_zeroed_page(GFP_KERNEL); - if (page) { - error = kernfs_getlink(dentry, (char *) page); - if (error < 0) - free_page((unsigned long)page); + if (!page) + return ERR_PTR(-ENOMEM); + error = kernfs_getlink(dentry, (char *)page); + if (unlikely(error < 0)) { + free_page((unsigned long)page); + return ERR_PTR(error); } - nd_set_link(nd, error ? ERR_PTR(error) : (char *)page); - return NULL; + return *cookie = (char *)page; } -static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) +static void kernfs_iop_put_link(struct dentry *dentry, void *cookie) { - char *page = nd_get_link(nd); - if (!IS_ERR(page)) - free_page((unsigned long)page); + free_page((unsigned long)cookie); } const struct inode_operations kernfs_symlink_iops = { diff --git a/fs/libfs.c b/fs/libfs.c index 72e4e015455f..0c83fde20dbd 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1024,12 +1024,9 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL(noop_fsync); -void kfree_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) +void kfree_put_link(struct dentry *dentry, void *cookie) { - char *s = nd_get_link(nd); - if (!IS_ERR(s)) - kfree(s); + kfree(cookie); } EXPORT_SYMBOL(kfree_put_link); @@ -1094,10 +1091,9 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp, } EXPORT_SYMBOL(simple_nosetlease); -void *simple_follow_link(struct dentry *dentry, struct nameidata *nd) +const char *simple_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { - nd_set_link(nd, d_inode(dentry)->i_link); - return NULL; + return d_inode(dentry)->i_link; } EXPORT_SYMBOL(simple_follow_link); diff --git a/fs/namei.c b/fs/namei.c index ab2bcbdbd683..aeca44877371 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -502,7 +502,6 @@ struct nameidata { int last_type; unsigned depth; struct file *base; - char *saved_names[MAX_NESTED_LINKS + 1]; }; /* @@ -713,23 +712,11 @@ void nd_jump_link(struct nameidata *nd, struct path *path) nd->flags |= LOOKUP_JUMPED; } -void nd_set_link(struct nameidata *nd, char *path) -{ - nd->saved_names[nd->depth] = path; -} -EXPORT_SYMBOL(nd_set_link); - -char *nd_get_link(struct nameidata *nd) -{ - return nd->saved_names[nd->depth]; -} -EXPORT_SYMBOL(nd_get_link); - static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) { struct inode *inode = link->dentry->d_inode; - if (inode->i_op->put_link) - inode->i_op->put_link(link->dentry, nd, cookie); + if (cookie && inode->i_op->put_link) + inode->i_op->put_link(link->dentry, cookie); path_put(link); } @@ -854,7 +841,7 @@ follow_link(struct path *link, struct nameidata *nd, void **p) { struct dentry *dentry = link->dentry; int error; - char *s; + const char *s; BUG_ON(nd->flags & LOOKUP_RCU); @@ -869,26 +856,20 @@ follow_link(struct path *link, struct nameidata *nd, void **p) current->total_link_count++; touch_atime(link); - nd_set_link(nd, NULL); error = security_inode_follow_link(dentry); if (error) goto out_put_nd_path; nd->last_type = LAST_BIND; - *p = dentry->d_inode->i_op->follow_link(dentry, nd); - error = PTR_ERR(*p); - if (IS_ERR(*p)) + *p = NULL; + s = dentry->d_inode->i_op->follow_link(dentry, p, nd); + error = PTR_ERR(s); + if (IS_ERR(s)) goto out_put_nd_path; error = 0; - s = nd_get_link(nd); if (s) { - if (unlikely(IS_ERR(s))) { - path_put(&nd->path); - put_link(nd, link, *p); - return PTR_ERR(s); - } if (*s == '/') { if (!nd->root.mnt) set_root(nd); @@ -906,7 +887,6 @@ follow_link(struct path *link, struct nameidata *nd, void **p) return error; out_put_nd_path: - *p = NULL; path_put(&nd->path); path_put(link); return error; @@ -4430,18 +4410,15 @@ EXPORT_SYMBOL(readlink_copy); */ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - struct nameidata nd; void *cookie; + const char *link = dentry->d_inode->i_op->follow_link(dentry, &cookie, NULL); int res; - nd.depth = 0; - cookie = dentry->d_inode->i_op->follow_link(dentry, &nd); - if (IS_ERR(cookie)) - return PTR_ERR(cookie); - - res = readlink_copy(buffer, buflen, nd_get_link(&nd)); - if (dentry->d_inode->i_op->put_link) - dentry->d_inode->i_op->put_link(dentry, &nd, cookie); + if (IS_ERR(link)) + return PTR_ERR(link); + res = readlink_copy(buffer, buflen, link); + if (cookie && dentry->d_inode->i_op->put_link) + dentry->d_inode->i_op->put_link(dentry, cookie); return res; } EXPORT_SYMBOL(generic_readlink); @@ -4473,22 +4450,21 @@ int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) } EXPORT_SYMBOL(page_readlink); -void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) +const char *page_follow_link_light(struct dentry *dentry, void **cookie, struct nameidata *nd) { struct page *page = NULL; - nd_set_link(nd, page_getlink(dentry, &page)); - return page; + char *res = page_getlink(dentry, &page); + if (!IS_ERR(res)) + *cookie = page; + return res; } EXPORT_SYMBOL(page_follow_link_light); -void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +void page_put_link(struct dentry *dentry, void *cookie) { struct page *page = cookie; - - if (page) { - kunmap(page); - page_cache_release(page); - } + kunmap(page); + page_cache_release(page); } EXPORT_SYMBOL(page_put_link); diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 2d56200655fe..c992b200ae7e 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -20,7 +20,6 @@ #include #include #include -#include /* Symlink caching in the page cache is even more simplistic * and straight-forward than readdir caching. @@ -43,7 +42,7 @@ error: return -EIO; } -static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *nfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { struct inode *inode = d_inode(dentry); struct page *page; @@ -51,19 +50,13 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd) err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); if (err) - goto read_failed; + return err; page = read_cache_page(&inode->i_data, 0, (filler_t *)nfs_symlink_filler, inode); - if (IS_ERR(page)) { - err = page; - goto read_failed; - } - nd_set_link(nd, kmap(page)); - return page; - -read_failed: - nd_set_link(nd, err); - return NULL; + if (IS_ERR(page)) + return ERR_CAST(page); + *cookie = page; + return kmap(page); } /* diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 1b4b9c5e51b7..235ad42afb57 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -140,12 +140,12 @@ struct ovl_link_data { void *cookie; }; -static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *ovl_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { - void *ret; struct dentry *realdentry; struct inode *realinode; struct ovl_link_data *data = NULL; + const char *ret; realdentry = ovl_dentry_real(dentry); realinode = realdentry->d_inode; @@ -160,19 +160,21 @@ static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd) data->realdentry = realdentry; } - ret = realinode->i_op->follow_link(realdentry, nd); - if (IS_ERR(ret)) { + ret = realinode->i_op->follow_link(realdentry, cookie, nd); + if (IS_ERR_OR_NULL(ret)) { kfree(data); return ret; } if (data) - data->cookie = ret; + data->cookie = *cookie; - return data; + *cookie = data; + + return ret; } -static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c) +static void ovl_put_link(struct dentry *dentry, void *c) { struct inode *realinode; struct ovl_link_data *data = c; @@ -181,7 +183,7 @@ static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c) return; realinode = data->realdentry->d_inode; - realinode->i_op->put_link(data->realdentry, nd, data->cookie); + realinode->i_op->put_link(data->realdentry, data->cookie); kfree(data); } diff --git a/fs/proc/base.c b/fs/proc/base.c index 093ca14f5701..52652f86b187 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1380,7 +1380,7 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path) return -ENOENT; } -static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { struct inode *inode = d_inode(dentry); struct path path; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 8272aaba1bb0..acd51d75387d 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -23,7 +23,6 @@ #include #include #include -#include #include @@ -394,16 +393,16 @@ static const struct file_operations proc_reg_file_ops_no_compat = { }; #endif -static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *proc_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { struct proc_dir_entry *pde = PDE(d_inode(dentry)); if (unlikely(!use_pde(pde))) return ERR_PTR(-EINVAL); - nd_set_link(nd, pde->data); - return pde; + *cookie = pde; + return pde->data; } -static void proc_put_link(struct dentry *dentry, struct nameidata *nd, void *p) +static void proc_put_link(struct dentry *dentry, void *p) { unuse_pde(p); } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index e512642dbbdc..10d24dd096e8 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -30,7 +30,7 @@ static const struct proc_ns_operations *ns_entries[] = { &mntns_operations, }; -static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; diff --git a/fs/proc/self.c b/fs/proc/self.c index 6195b4a7c3b1..ad333946b53a 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -1,5 +1,4 @@ #include -#include #include #include #include "internal.h" @@ -19,21 +18,20 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer, return readlink_copy(buffer, buflen, tmp); } -static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *proc_self_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { struct pid_namespace *ns = dentry->d_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); - char *name = ERR_PTR(-ENOENT); - if (tgid) { - /* 11 for max length of signed int in decimal + NULL term */ - name = kmalloc(12, GFP_KERNEL); - if (!name) - name = ERR_PTR(-ENOMEM); - else - sprintf(name, "%d", tgid); - } - nd_set_link(nd, name); - return NULL; + char *name; + + if (!tgid) + return ERR_PTR(-ENOENT); + /* 11 for max length of signed int in decimal + NULL term */ + name = kmalloc(12, GFP_KERNEL); + if (!name) + return ERR_PTR(-ENOMEM); + sprintf(name, "%d", tgid); + return *cookie = name; } static const struct inode_operations proc_self_inode_operations = { diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index a8371993b4fb..85c96e0d7aaa 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -1,5 +1,4 @@ #include -#include #include #include #include "internal.h" @@ -20,21 +19,20 @@ static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer, return readlink_copy(buffer, buflen, tmp); } -static void *proc_thread_self_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { struct pid_namespace *ns = dentry->d_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); pid_t pid = task_pid_nr_ns(current, ns); - char *name = ERR_PTR(-ENOENT); - if (pid) { - name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL); - if (!name) - name = ERR_PTR(-ENOMEM); - else - sprintf(name, "%d/task/%d", tgid, pid); - } - nd_set_link(nd, name); - return NULL; + char *name; + + if (!pid) + return ERR_PTR(-ENOENT); + name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL); + if (!name) + return ERR_PTR(-ENOMEM); + sprintf(name, "%d/task/%d", tgid, pid); + return *cookie = name; } static const struct inode_operations proc_thread_self_inode_operations = { diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f4cd7204e236..26c4dcb1ef56 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -41,7 +41,6 @@ #include #include -#include #include #include #include @@ -414,9 +413,10 @@ xfs_vn_rename( * we need to be very careful about how much stack we use. * uio is kmalloced for this reason... */ -STATIC void * +STATIC const char * xfs_vn_follow_link( struct dentry *dentry, + void **cookie, struct nameidata *nd) { char *link; @@ -430,14 +430,12 @@ xfs_vn_follow_link( if (unlikely(error)) goto out_kfree; - nd_set_link(nd, link); - return NULL; + return *cookie = link; out_kfree: kfree(link); out_err: - nd_set_link(nd, ERR_PTR(error)); - return NULL; + return ERR_PTR(error); } STATIC int diff --git a/include/linux/fs.h b/include/linux/fs.h index 0ac758fcff00..9ab934113a28 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1608,12 +1608,12 @@ struct file_operations { struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); - void * (*follow_link) (struct dentry *, struct nameidata *); + const char * (*follow_link) (struct dentry *, void **, struct nameidata *); int (*permission) (struct inode *, int); struct posix_acl * (*get_acl)(struct inode *, int); int (*readlink) (struct dentry *, char __user *,int); - void (*put_link) (struct dentry *, struct nameidata *, void *); + void (*put_link) (struct dentry *, void *); int (*create) (struct inode *,struct dentry *, umode_t, bool); int (*link) (struct dentry *,struct inode *,struct dentry *); @@ -2705,13 +2705,13 @@ extern const struct file_operations generic_ro_fops; extern int readlink_copy(char __user *, int, const char *); extern int page_readlink(struct dentry *, char __user *, int); -extern void *page_follow_link_light(struct dentry *, struct nameidata *); -extern void page_put_link(struct dentry *, struct nameidata *, void *); +extern const char *page_follow_link_light(struct dentry *, void **, struct nameidata *); +extern void page_put_link(struct dentry *, void *); extern int __page_symlink(struct inode *inode, const char *symname, int len, int nofs); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; -extern void kfree_put_link(struct dentry *, struct nameidata *, void *); +extern void kfree_put_link(struct dentry *, void *); extern int generic_readlink(struct dentry *, char __user *, int); extern void generic_fillattr(struct inode *, struct kstat *); int vfs_getattr_nosec(struct path *path, struct kstat *stat); @@ -2722,7 +2722,7 @@ void __inode_sub_bytes(struct inode *inode, loff_t bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes); loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); -void *simple_follow_link(struct dentry *, struct nameidata *); +const char *simple_follow_link(struct dentry *, void **, struct nameidata *); extern const struct inode_operations simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); diff --git a/include/linux/namei.h b/include/linux/namei.h index c8990779f0c3..a5d5bed2c0e1 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -71,8 +71,6 @@ extern struct dentry *lock_rename(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); extern void nd_jump_link(struct nameidata *nd, struct path *path); -extern void nd_set_link(struct nameidata *nd, char *path); -extern char *nd_get_link(struct nameidata *nd); static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) { diff --git a/mm/shmem.c b/mm/shmem.c index 7f6e2f889122..d1693dcb4285 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2475,24 +2475,23 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s return 0; } -static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *shmem_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) { struct page *page = NULL; int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL); - nd_set_link(nd, error ? ERR_PTR(error) : kmap(page)); - if (page) - unlock_page(page); - return page; + if (error) + return ERR_PTR(error); + unlock_page(page); + *cookie = page; + return kmap(page); } -static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +static void shmem_put_link(struct dentry *dentry, void *cookie) { - if (!IS_ERR(nd_get_link(nd))) { - struct page *page = cookie; - kunmap(page); - mark_page_accessed(page); - page_cache_release(page); - } + struct page *page = cookie; + kunmap(page); + mark_page_accessed(page); + page_cache_release(page); } #ifdef CONFIG_TMPFS_XATTR -- cgit v1.2.3 From 894bc8c4662ba9daceafe943a5ba0dd407da5cd3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 May 2015 07:16:16 -0400 Subject: namei: remove restrictions on nesting depth The only restriction is that on the total amount of symlinks crossed; how they are nested does not matter Signed-off-by: Al Viro --- fs/namei.c | 66 ++++++++++++++++++++++++++++++++++++++++----------- include/linux/namei.h | 2 ++ 2 files changed, 54 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index e5715a567860..1ae34cd0d590 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -492,6 +492,7 @@ void path_put(const struct path *path) } EXPORT_SYMBOL(path_put); +#define EMBEDDED_LEVELS 2 struct nameidata { struct path path; union { @@ -509,9 +510,42 @@ struct nameidata { struct path link; void *cookie; const char *name; - } stack[MAX_NESTED_LINKS + 1]; + } *stack, internal[EMBEDDED_LEVELS]; }; +static void set_nameidata(struct nameidata *nd) +{ + nd->stack = nd->internal; +} + +static void restore_nameidata(struct nameidata *nd) +{ + if (nd->stack != nd->internal) { + kfree(nd->stack); + nd->stack = nd->internal; + } +} + +static int __nd_alloc_stack(struct nameidata *nd) +{ + struct saved *p = kmalloc((MAXSYMLINKS + 1) * sizeof(struct saved), + GFP_KERNEL); + if (unlikely(!p)) + return -ENOMEM; + memcpy(p, nd->internal, sizeof(nd->internal)); + nd->stack = p; + return 0; +} + +static inline int nd_alloc_stack(struct nameidata *nd) +{ + if (likely(nd->depth != EMBEDDED_LEVELS - 1)) + return 0; + if (likely(nd->stack != nd->internal)) + return 0; + return __nd_alloc_stack(nd); +} + /* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't @@ -857,7 +891,7 @@ const char *get_link(struct nameidata *nd) if (nd->link.mnt == nd->path.mnt) mntget(nd->link.mnt); - if (unlikely(current->total_link_count >= 40)) { + if (unlikely(current->total_link_count >= MAXSYMLINKS)) { path_put(&nd->path); path_put(&nd->link); return ERR_PTR(-ELOOP); @@ -1789,22 +1823,18 @@ Walked: if (err) { const char *s; - if (unlikely(current->link_count >= MAX_NESTED_LINKS)) { - path_put_conditional(&nd->link, nd); - path_put(&nd->path); - err = -ELOOP; - goto Err; + err = nd_alloc_stack(nd); + if (unlikely(err)) { + path_to_nameidata(&nd->link, nd); + break; } - BUG_ON(nd->depth >= MAX_NESTED_LINKS); nd->depth++; - current->link_count++; s = get_link(nd); if (unlikely(IS_ERR(s))) { err = PTR_ERR(s); - current->link_count--; nd->depth--; goto Err; } @@ -1812,7 +1842,6 @@ Walked: if (unlikely(!s)) { /* jumped */ put_link(nd); - current->link_count--; nd->depth--; } else { if (*s == '/') { @@ -1842,7 +1871,6 @@ Walked: Err: while (unlikely(nd->depth)) { put_link(nd); - current->link_count--; nd->depth--; } return err; @@ -1851,7 +1879,6 @@ OK: name = nd->stack[nd->depth].name; err = walk_component(nd, LOOKUP_FOLLOW); put_link(nd); - current->link_count--; nd->depth--; goto Walked; } @@ -2055,7 +2082,11 @@ static int path_lookupat(int dfd, const struct filename *name, static int filename_lookup(int dfd, struct filename *name, unsigned int flags, struct nameidata *nd) { - int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd); + int retval; + + set_nameidata(nd); + retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd); + if (unlikely(retval == -ECHILD)) retval = path_lookupat(dfd, name, flags, nd); if (unlikely(retval == -ESTALE)) @@ -2063,6 +2094,7 @@ static int filename_lookup(int dfd, struct filename *name, if (likely(!retval)) audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT); + restore_nameidata(nd); return retval; } @@ -2393,6 +2425,7 @@ filename_mountpoint(int dfd, struct filename *name, struct path *path, int error; if (IS_ERR(name)) return PTR_ERR(name); + set_nameidata(&nd); error = path_mountpoint(dfd, name, path, &nd, flags | LOOKUP_RCU); if (unlikely(error == -ECHILD)) error = path_mountpoint(dfd, name, path, &nd, flags); @@ -2400,6 +2433,7 @@ filename_mountpoint(int dfd, struct filename *name, struct path *path, error = path_mountpoint(dfd, name, path, &nd, flags | LOOKUP_REVAL); if (likely(!error)) audit_inode(name, path->dentry, 0); + restore_nameidata(&nd); putname(name); return error; } @@ -3288,11 +3322,13 @@ struct file *do_filp_open(int dfd, struct filename *pathname, int flags = op->lookup_flags; struct file *filp; + set_nameidata(&nd); filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); if (unlikely(filp == ERR_PTR(-ECHILD))) filp = path_openat(dfd, pathname, &nd, op, flags); if (unlikely(filp == ERR_PTR(-ESTALE))) filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL); + restore_nameidata(&nd); return filp; } @@ -3306,6 +3342,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, nd.root.mnt = mnt; nd.root.dentry = dentry; + set_nameidata(&nd); if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); @@ -3319,6 +3356,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, file = path_openat(-1, filename, &nd, op, flags); if (unlikely(file == ERR_PTR(-ESTALE))) file = path_openat(-1, filename, &nd, op, flags | LOOKUP_REVAL); + restore_nameidata(&nd); putname(filename); return file; } diff --git a/include/linux/namei.h b/include/linux/namei.h index a5d5bed2c0e1..3a6cc9651712 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -11,6 +11,8 @@ struct nameidata; enum { MAX_NESTED_LINKS = 8 }; +#define MAXSYMLINKS 40 + /* * Type of the last component on LOOKUP_PARENT */ -- cgit v1.2.3 From 756daf263ea53a8bfc89db26cb92e963953253a1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 23 Mar 2015 13:37:38 +1100 Subject: VFS: replace {, total_}link_count in task_struct with pointer to nameidata task_struct currently contains two ad-hoc members for use by the VFS: link_count and total_link_count. These are only interesting to fs/namei.c, so exposing them explicitly is poor layering. Incidentally, link_count isn't used anymore, so it can just die. This patches replaces those with a single pointer to 'struct nameidata'. This structure represents the current filename lookup of which there can only be one per process, and is a natural place to store total_link_count. This will allow the current "nameidata" argument to all follow_link operations to be removed as current->nameidata can be used instead in the _very_ few instances that care about it at all. As there are occasional circumstances where pathname lookup can recurse, such as through kern_path_locked, we always save and old current->nameidata (if there is one) when setting a new value, and make sure any active link_counts are preserved. follow_mount and follow_automount now get a 'struct nameidata *' rather than 'int flags' so that they can directly access total_link_count, rather than going through 'current'. Suggested-by: Al Viro Signed-off-by: NeilBrown Signed-off-by: Al Viro --- fs/namei.c | 70 ++++++++++++++++++++++++++++----------------------- include/linux/sched.h | 2 +- 2 files changed, 40 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 046a703c081d..699e093f52b5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -505,6 +505,7 @@ struct nameidata { unsigned seq, m_seq; int last_type; unsigned depth; + int total_link_count; struct file *base; struct saved { struct path link; @@ -513,16 +514,25 @@ struct nameidata { } *stack, internal[EMBEDDED_LEVELS]; }; -static void set_nameidata(struct nameidata *nd) +static struct nameidata *set_nameidata(struct nameidata *p) { - nd->stack = nd->internal; + struct nameidata *old = current->nameidata; + p->stack = p->internal; + p->total_link_count = old ? old->total_link_count : 0; + current->nameidata = p; + return old; } -static void restore_nameidata(struct nameidata *nd) +static void restore_nameidata(struct nameidata *old) { - if (nd->stack != nd->internal) { - kfree(nd->stack); - nd->stack = nd->internal; + struct nameidata *now = current->nameidata; + + current->nameidata = old; + if (old) + old->total_link_count = now->total_link_count; + if (now->stack != now->internal) { + kfree(now->stack); + now->stack = now->internal; } } @@ -970,7 +980,7 @@ EXPORT_SYMBOL(follow_up); * - return -EISDIR to tell follow_managed() to stop and return the path we * were called with. */ -static int follow_automount(struct path *path, unsigned flags, +static int follow_automount(struct path *path, struct nameidata *nd, bool *need_mntput) { struct vfsmount *mnt; @@ -990,13 +1000,13 @@ static int follow_automount(struct path *path, unsigned flags, * as being automount points. These will need the attentions * of the daemon to instantiate them before they can be used. */ - if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && + if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | + LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && path->dentry->d_inode) return -EISDIR; - current->total_link_count++; - if (current->total_link_count >= 40) + nd->total_link_count++; + if (nd->total_link_count >= 40) return -ELOOP; mnt = path->dentry->d_op->d_automount(path); @@ -1010,7 +1020,7 @@ static int follow_automount(struct path *path, unsigned flags, * the path being looked up; if it wasn't then the remainder of * the path is inaccessible and we should say so. */ - if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT)) + if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT)) return -EREMOTE; return PTR_ERR(mnt); } @@ -1050,7 +1060,7 @@ static int follow_automount(struct path *path, unsigned flags, * * Serialization is taken care of in namespace.c */ -static int follow_managed(struct path *path, unsigned flags) +static int follow_managed(struct path *path, struct nameidata *nd) { struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ unsigned managed; @@ -1094,7 +1104,7 @@ static int follow_managed(struct path *path, unsigned flags) /* Handle an automount point */ if (managed & DCACHE_NEED_AUTOMOUNT) { - ret = follow_automount(path, flags, &need_mntput); + ret = follow_automount(path, nd, &need_mntput); if (ret < 0) break; continue; @@ -1483,7 +1493,7 @@ unlazy: } path->mnt = mnt; path->dentry = dentry; - err = follow_managed(path, nd->flags); + err = follow_managed(path, nd); if (unlikely(err < 0)) { path_put_conditional(path, nd); return err; @@ -1513,7 +1523,7 @@ static int lookup_slow(struct nameidata *nd, struct path *path) return PTR_ERR(dentry); path->mnt = nd->path.mnt; path->dentry = dentry; - err = follow_managed(path, nd->flags); + err = follow_managed(path, nd); if (unlikely(err < 0)) { path_put_conditional(path, nd); return err; @@ -1563,7 +1573,7 @@ static void terminate_walk(struct nameidata *nd) static int pick_link(struct nameidata *nd, struct path *link) { int error; - if (unlikely(current->total_link_count++ >= MAXSYMLINKS)) { + if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) { path_to_nameidata(link, nd); return -ELOOP; } @@ -1981,7 +1991,7 @@ static int path_init(int dfd, const struct filename *name, unsigned int flags, rcu_read_unlock(); return -ECHILD; done: - current->total_link_count = 0; + nd->total_link_count = 0; return link_path_walk(s, nd); } @@ -2087,10 +2097,9 @@ static int filename_lookup(int dfd, struct filename *name, unsigned int flags, struct nameidata *nd) { int retval; + struct nameidata *saved_nd = set_nameidata(nd); - set_nameidata(nd); retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd); - if (unlikely(retval == -ECHILD)) retval = path_lookupat(dfd, name, flags, nd); if (unlikely(retval == -ESTALE)) @@ -2098,7 +2107,7 @@ static int filename_lookup(int dfd, struct filename *name, if (likely(!retval)) audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT); - restore_nameidata(nd); + restore_nameidata(saved_nd); return retval; } @@ -2426,11 +2435,11 @@ static int filename_mountpoint(int dfd, struct filename *name, struct path *path, unsigned int flags) { - struct nameidata nd; + struct nameidata nd, *saved; int error; if (IS_ERR(name)) return PTR_ERR(name); - set_nameidata(&nd); + saved = set_nameidata(&nd); error = path_mountpoint(dfd, name, path, &nd, flags | LOOKUP_RCU); if (unlikely(error == -ECHILD)) error = path_mountpoint(dfd, name, path, &nd, flags); @@ -2438,7 +2447,7 @@ filename_mountpoint(int dfd, struct filename *name, struct path *path, error = path_mountpoint(dfd, name, path, &nd, flags | LOOKUP_REVAL); if (likely(!error)) audit_inode(name, path->dentry, 0); - restore_nameidata(&nd); + restore_nameidata(saved); putname(name); return error; } @@ -3087,7 +3096,7 @@ retry_lookup: if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) goto exit_dput; - error = follow_managed(&path, nd->flags); + error = follow_managed(&path, nd); if (error < 0) goto exit_dput; @@ -3323,31 +3332,29 @@ out2: struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op) { - struct nameidata nd; + struct nameidata nd, *saved_nd = set_nameidata(&nd); int flags = op->lookup_flags; struct file *filp; - set_nameidata(&nd); filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); if (unlikely(filp == ERR_PTR(-ECHILD))) filp = path_openat(dfd, pathname, &nd, op, flags); if (unlikely(filp == ERR_PTR(-ESTALE))) filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL); - restore_nameidata(&nd); + restore_nameidata(saved_nd); return filp; } struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, const char *name, const struct open_flags *op) { - struct nameidata nd; + struct nameidata nd, *saved_nd; struct file *file; struct filename *filename; int flags = op->lookup_flags | LOOKUP_ROOT; nd.root.mnt = mnt; nd.root.dentry = dentry; - set_nameidata(&nd); if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); @@ -3356,12 +3363,13 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, if (unlikely(IS_ERR(filename))) return ERR_CAST(filename); + saved_nd = set_nameidata(&nd); file = path_openat(-1, filename, &nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) file = path_openat(-1, filename, &nd, op, flags); if (unlikely(file == ERR_PTR(-ESTALE))) file = path_openat(-1, filename, &nd, op, flags | LOOKUP_REVAL); - restore_nameidata(&nd); + restore_nameidata(saved_nd); putname(filename); return file; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e6122734..f6c9b69d66f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1461,7 +1461,7 @@ struct task_struct { it with task_lock()) - initialized normally by setup_new_exec */ /* file system info */ - int link_count, total_link_count; + struct nameidata *nameidata; #ifdef CONFIG_SYSVIPC /* ipc stuff */ struct sysv_sem sysvsem; -- cgit v1.2.3 From 6e77137b363b8d866ac29c5a0c95e953614fb2d8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 May 2015 13:37:52 -0400 Subject: don't pass nameidata to ->follow_link() its only use is getting passed to nd_jump_link(), which can obtain it from current->nameidata Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 +- Documentation/filesystems/vfs.txt | 2 +- drivers/staging/lustre/lustre/llite/symlink.c | 2 +- fs/9p/vfs_inode.c | 2 +- fs/9p/vfs_inode_dotl.c | 2 +- fs/autofs4/symlink.c | 2 +- fs/befs/linuxvfs.c | 4 ++-- fs/cifs/cifsfs.h | 2 +- fs/cifs/link.c | 2 +- fs/configfs/symlink.c | 2 +- fs/ecryptfs/inode.c | 2 +- fs/ext4/symlink.c | 2 +- fs/f2fs/namei.c | 4 ++-- fs/fuse/dir.c | 2 +- fs/gfs2/inode.c | 2 +- fs/hostfs/hostfs_kern.c | 2 +- fs/hppfs/hppfs.c | 4 ++-- fs/kernfs/symlink.c | 2 +- fs/libfs.c | 2 +- fs/namei.c | 11 ++++++----- fs/nfs/symlink.c | 2 +- fs/overlayfs/inode.c | 4 ++-- fs/proc/base.c | 4 ++-- fs/proc/inode.c | 2 +- fs/proc/namespaces.c | 4 ++-- fs/proc/self.c | 2 +- fs/proc/thread_self.c | 2 +- fs/xfs/xfs_iops.c | 3 +-- include/linux/fs.h | 6 +++--- include/linux/namei.h | 2 +- mm/shmem.c | 2 +- 31 files changed, 44 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 7fa6c4ac858c..5b5b4f54c033 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -50,7 +50,7 @@ prototypes: int (*rename2) (struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*readlink) (struct dentry *, char __user *,int); - const char *(*follow_link) (struct dentry *, void **, struct nameidata *); + const char *(*follow_link) (struct dentry *, void **); void (*put_link) (struct dentry *, void *); void (*truncate) (struct inode *); int (*permission) (struct inode *, int, unsigned int); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 1c6b03ac2e5a..0dec8c880be6 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -350,7 +350,7 @@ struct inode_operations { int (*rename2) (struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*readlink) (struct dentry *, char __user *,int); - const char *(*follow_link) (struct dentry *, void **, struct nameidata *); + const char *(*follow_link) (struct dentry *, void **); void (*put_link) (struct dentry *, void *); int (*permission) (struct inode *, int); int (*get_acl)(struct inode *, int); diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c index da6d9d17c50d..f3be3bf0f66f 100644 --- a/drivers/staging/lustre/lustre/llite/symlink.c +++ b/drivers/staging/lustre/lustre/llite/symlink.c @@ -118,7 +118,7 @@ failed: return rc; } -static const char *ll_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *ll_follow_link(struct dentry *dentry, void **cookie) { struct inode *inode = d_inode(dentry); struct ptlrpc_request *request = NULL; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 7cc70a39a1d8..271f51af2f75 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1230,7 +1230,7 @@ ino_t v9fs_qid2ino(struct p9_qid *qid) * */ -static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie) { struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry); struct p9_fid *fid = v9fs_fid_lookup(dentry); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index ae062ffa0f1f..16658ed677c9 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -910,7 +910,7 @@ error: */ static const char * -v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie, struct nameidata *nd) +v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie) { struct p9_fid *fid = v9fs_fid_lookup(dentry); char *target; diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index 9c6a07739c9b..da0c33481bc0 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -12,7 +12,7 @@ #include "autofs_i.h" -static const char *autofs4_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *autofs4_follow_link(struct dentry *dentry, void **cookie) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 3a1aefb86a11..46aedacfa6a8 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -42,7 +42,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long); static struct inode *befs_alloc_inode(struct super_block *sb); static void befs_destroy_inode(struct inode *inode); static void befs_destroy_inodecache(void); -static const char *befs_follow_link(struct dentry *, void **, struct nameidata *nd); +static const char *befs_follow_link(struct dentry *, void **); static int befs_utf2nls(struct super_block *sb, const char *in, int in_len, char **out, int *out_len); static int befs_nls2utf(struct super_block *sb, const char *in, int in_len, @@ -464,7 +464,7 @@ befs_destroy_inodecache(void) * flag is set. */ static const char * -befs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +befs_follow_link(struct dentry *dentry, void **cookie) { struct super_block *sb = dentry->d_sb; struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry)); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 61012da7e9d8..a782b22904e4 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -120,7 +120,7 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path); #endif /* Functions related to symlinks */ -extern const char *cifs_follow_link(struct dentry *direntry, void **cookie, struct nameidata *nd); +extern const char *cifs_follow_link(struct dentry *direntry, void **cookie); extern int cifs_readlink(struct dentry *direntry, char __user *buffer, int buflen); extern int cifs_symlink(struct inode *inode, struct dentry *direntry, diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 4a439c2c0c7f..546f86ab09aa 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -627,7 +627,7 @@ cifs_hl_exit: } const char * -cifs_follow_link(struct dentry *direntry, void **cookie, struct nameidata *nd) +cifs_follow_link(struct dentry *direntry, void **cookie) { struct inode *inode = d_inode(direntry); int rc = -ENOMEM; diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index fac8e8517f33..0ace75649009 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -279,7 +279,7 @@ static int configfs_getlink(struct dentry *dentry, char * path) } -static const char *configfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *configfs_follow_link(struct dentry *dentry, void **cookie) { unsigned long page = get_zeroed_page(GFP_KERNEL); int error; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index cdb9d6c4532d..73d20ae92478 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -675,7 +675,7 @@ out: return rc ? ERR_PTR(rc) : buf; } -static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie) { size_t len; char *buf = ecryptfs_readlink_lower(dentry, &len); diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index afec475aaf5c..ba5bd18a9825 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -23,7 +23,7 @@ #include "xattr.h" #ifdef CONFIG_EXT4_FS_ENCRYPTION -static const char *ext4_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *ext4_follow_link(struct dentry *dentry, void **cookie) { struct page *cpage = NULL; char *caddr, *paddr = NULL; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index d2947937515e..cd05a7c91533 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -296,9 +296,9 @@ fail: return err; } -static const char *f2fs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *f2fs_follow_link(struct dentry *dentry, void **cookie) { - const char *link = page_follow_link_light(dentry, cookie, nd); + const char *link = page_follow_link_light(dentry, cookie); if (!IS_ERR(link) && !*link) { /* this is broken symlink case */ page_put_link(dentry, *cookie); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index f9cb260375cf..d5cdef8b7f3a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1365,7 +1365,7 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx) return err; } -static const char *fuse_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *fuse_follow_link(struct dentry *dentry, void **cookie) { struct inode *inode = d_inode(dentry); struct fuse_conn *fc = get_fuse_conn(inode); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index f59390aebffb..3a1461de1551 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1548,7 +1548,7 @@ out: * Returns: 0 on success or error code */ -static const char *gfs2_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *gfs2_follow_link(struct dentry *dentry, void **cookie) { struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); struct gfs2_holder i_gh; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index f650ed661fab..7b6ed7a908f6 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -892,7 +892,7 @@ static const struct inode_operations hostfs_dir_iops = { .setattr = hostfs_setattr, }; -static const char *hostfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *hostfs_follow_link(struct dentry *dentry, void **cookie) { char *link = __getname(); if (link) { diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index b8f24d3b04ee..15a774eb5bbf 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -642,11 +642,11 @@ static int hppfs_readlink(struct dentry *dentry, char __user *buffer, buflen); } -static const char *hppfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *hppfs_follow_link(struct dentry *dentry, void **cookie) { struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry; - return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, cookie, nd); + return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, cookie); } static void hppfs_put_link(struct dentry *dentry, void *cookie) diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 3c7e799974a2..366c5a17475e 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -112,7 +112,7 @@ static int kernfs_getlink(struct dentry *dentry, char *path) return error; } -static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie) { int error = -ENOMEM; unsigned long page = get_zeroed_page(GFP_KERNEL); diff --git a/fs/libfs.c b/fs/libfs.c index 0c83fde20dbd..c5f3373e326b 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1091,7 +1091,7 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp, } EXPORT_SYMBOL(simple_nosetlease); -const char *simple_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +const char *simple_follow_link(struct dentry *dentry, void **cookie) { return d_inode(dentry)->i_link; } diff --git a/fs/namei.c b/fs/namei.c index b57400ca6a0f..f311f0369e3c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -753,8 +753,9 @@ static inline void path_to_nameidata(const struct path *path, * Helper to directly jump to a known parsed path from ->follow_link, * caller must have taken a reference to path beforehand. */ -void nd_jump_link(struct nameidata *nd, struct path *path) +void nd_jump_link(struct path *path) { + struct nameidata *nd = current->nameidata; path_put(&nd->path); nd->path = *path; @@ -916,7 +917,7 @@ const char *get_link(struct nameidata *nd) nd->last_type = LAST_BIND; res = inode->i_link; if (!res) { - res = inode->i_op->follow_link(dentry, &last->cookie, nd); + res = inode->i_op->follow_link(dentry, &last->cookie); if (IS_ERR(res)) { out: path_put(&last->link); @@ -4485,12 +4486,12 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) int res; if (!link) { - link = dentry->d_inode->i_op->follow_link(dentry, &cookie, NULL); + link = dentry->d_inode->i_op->follow_link(dentry, &cookie); if (IS_ERR(link)) return PTR_ERR(link); } res = readlink_copy(buffer, buflen, link); - if (cookie && dentry->d_inode->i_op->put_link) + if (dentry->d_inode->i_op->put_link) dentry->d_inode->i_op->put_link(dentry, cookie); return res; } @@ -4523,7 +4524,7 @@ int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) } EXPORT_SYMBOL(page_readlink); -const char *page_follow_link_light(struct dentry *dentry, void **cookie, struct nameidata *nd) +const char *page_follow_link_light(struct dentry *dentry, void **cookie) { struct page *page = NULL; char *res = page_getlink(dentry, &page); diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index c992b200ae7e..b6de433da5db 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -42,7 +42,7 @@ error: return -EIO; } -static const char *nfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *nfs_follow_link(struct dentry *dentry, void **cookie) { struct inode *inode = d_inode(dentry); struct page *page; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 235ad42afb57..9986833c9fcc 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -140,7 +140,7 @@ struct ovl_link_data { void *cookie; }; -static const char *ovl_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *ovl_follow_link(struct dentry *dentry, void **cookie) { struct dentry *realdentry; struct inode *realinode; @@ -160,7 +160,7 @@ static const char *ovl_follow_link(struct dentry *dentry, void **cookie, struct data->realdentry = realdentry; } - ret = realinode->i_op->follow_link(realdentry, cookie, nd); + ret = realinode->i_op->follow_link(realdentry, cookie); if (IS_ERR_OR_NULL(ret)) { kfree(data); return ret; diff --git a/fs/proc/base.c b/fs/proc/base.c index 52652f86b187..286a422f440e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1380,7 +1380,7 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path) return -ENOENT; } -static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie) { struct inode *inode = d_inode(dentry); struct path path; @@ -1394,7 +1394,7 @@ static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie, st if (error) goto out; - nd_jump_link(nd, &path); + nd_jump_link(&path); return NULL; out: return ERR_PTR(error); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index acd51d75387d..eb35874fe09c 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -393,7 +393,7 @@ static const struct file_operations proc_reg_file_ops_no_compat = { }; #endif -static const char *proc_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *proc_follow_link(struct dentry *dentry, void **cookie) { struct proc_dir_entry *pde = PDE(d_inode(dentry)); if (unlikely(!use_pde(pde))) diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 10d24dd096e8..f6e8354b8cea 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -30,7 +30,7 @@ static const struct proc_ns_operations *ns_entries[] = { &mntns_operations, }; -static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie) { struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; @@ -45,7 +45,7 @@ static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie, str if (ptrace_may_access(task, PTRACE_MODE_READ)) { error = ns_get_path(&ns_path, task, ns_ops); if (!error) - nd_jump_link(nd, &ns_path); + nd_jump_link(&ns_path); } put_task_struct(task); return error; diff --git a/fs/proc/self.c b/fs/proc/self.c index ad333946b53a..113b8d061fc0 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -18,7 +18,7 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer, return readlink_copy(buffer, buflen, tmp); } -static const char *proc_self_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *proc_self_follow_link(struct dentry *dentry, void **cookie) { struct pid_namespace *ns = dentry->d_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 85c96e0d7aaa..947b0f4fd0a1 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -19,7 +19,7 @@ static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer, return readlink_copy(buffer, buflen, tmp); } -static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie) { struct pid_namespace *ns = dentry->d_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 26c4dcb1ef56..7f51f39f8acc 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -416,8 +416,7 @@ xfs_vn_rename( STATIC const char * xfs_vn_follow_link( struct dentry *dentry, - void **cookie, - struct nameidata *nd) + void **cookie) { char *link; int error = -ENOMEM; diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ab934113a28..ed7c9f298759 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1608,7 +1608,7 @@ struct file_operations { struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); - const char * (*follow_link) (struct dentry *, void **, struct nameidata *); + const char * (*follow_link) (struct dentry *, void **); int (*permission) (struct inode *, int); struct posix_acl * (*get_acl)(struct inode *, int); @@ -2705,7 +2705,7 @@ extern const struct file_operations generic_ro_fops; extern int readlink_copy(char __user *, int, const char *); extern int page_readlink(struct dentry *, char __user *, int); -extern const char *page_follow_link_light(struct dentry *, void **, struct nameidata *); +extern const char *page_follow_link_light(struct dentry *, void **); extern void page_put_link(struct dentry *, void *); extern int __page_symlink(struct inode *inode, const char *symname, int len, int nofs); @@ -2722,7 +2722,7 @@ void __inode_sub_bytes(struct inode *inode, loff_t bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes); loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); -const char *simple_follow_link(struct dentry *, void **, struct nameidata *); +const char *simple_follow_link(struct dentry *, void **); extern const struct inode_operations simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); diff --git a/include/linux/namei.h b/include/linux/namei.h index 3a6cc9651712..d756304aa09b 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -72,7 +72,7 @@ extern int follow_up(struct path *); extern struct dentry *lock_rename(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); -extern void nd_jump_link(struct nameidata *nd, struct path *path); +extern void nd_jump_link(struct path *path); static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) { diff --git a/mm/shmem.c b/mm/shmem.c index d1693dcb4285..e02682267046 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2475,7 +2475,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s return 0; } -static const char *shmem_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd) +static const char *shmem_follow_link(struct dentry *dentry, void **cookie) { struct page *page = NULL; int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL); -- cgit v1.2.3 From 2da572c959dd5815aef153cf62010b16a498a0d3 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 7 May 2015 13:49:14 -0400 Subject: lib: add software 842 compression/decompression Add 842-format software compression and decompression functions. Update the MAINTAINERS 842 section to include the new files. The 842 compression function can compress any input data into the 842 compression format. The 842 decompression function can decompress any standard-format 842 compressed data - specifically, either a compressed data buffer created by the 842 software compression function, or a compressed data buffer created by the 842 hardware compressor (located in PowerPC coprocessors). The 842 compressed data format is explained in the header comments. This is used in a later patch to provide a full software 842 compression and decompression crypto interface. Signed-off-by: Dan Streetman Signed-off-by: Herbert Xu --- MAINTAINERS | 2 + include/linux/sw842.h | 12 + lib/842/842.h | 127 ++++++++++ lib/842/842_compress.c | 626 +++++++++++++++++++++++++++++++++++++++++++++++ lib/842/842_debugfs.h | 52 ++++ lib/842/842_decompress.c | 405 ++++++++++++++++++++++++++++++ lib/842/Makefile | 2 + lib/Kconfig | 6 + lib/Makefile | 2 + 9 files changed, 1234 insertions(+) create mode 100644 include/linux/sw842.h create mode 100644 lib/842/842.h create mode 100644 lib/842/842_compress.c create mode 100644 lib/842/842_debugfs.h create mode 100644 lib/842/842_decompress.c create mode 100644 lib/842/Makefile (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 56a432d51119..96684fd49abc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4863,6 +4863,8 @@ M: Dan Streetman S: Supported F: drivers/crypto/nx/nx-842.c F: include/linux/nx842.h +F: include/linux/sw842.h +F: lib/842/ IBM Power Linux RAID adapter M: Brian King diff --git a/include/linux/sw842.h b/include/linux/sw842.h new file mode 100644 index 000000000000..109ba041c2ae --- /dev/null +++ b/include/linux/sw842.h @@ -0,0 +1,12 @@ +#ifndef __SW842_H__ +#define __SW842_H__ + +#define SW842_MEM_COMPRESS (0xf000) + +int sw842_compress(const u8 *src, unsigned int srclen, + u8 *dst, unsigned int *destlen, void *wmem); + +int sw842_decompress(const u8 *src, unsigned int srclen, + u8 *dst, unsigned int *destlen); + +#endif diff --git a/lib/842/842.h b/lib/842/842.h new file mode 100644 index 000000000000..7c200030acf7 --- /dev/null +++ b/lib/842/842.h @@ -0,0 +1,127 @@ + +#ifndef __842_H__ +#define __842_H__ + +/* The 842 compressed format is made up of multiple blocks, each of + * which have the format: + * + *