From fd0928df98b9578be8a786ac0cb78a47a5e17a20 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Jan 2008 08:52:45 +0100 Subject: ioprio: move io priority from task_struct to io_context This is where it belongs and then it doesn't take up space for a process that doesn't do IO. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 34 ++++++++++---------- block/ll_rw_blk.c | 30 ++++++++++++------ fs/ioprio.c | 29 ++++++++++++----- include/linux/blkdev.h | 81 ++++------------------------------------------- include/linux/init_task.h | 1 - include/linux/iocontext.h | 79 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/ioprio.h | 13 ++++---- include/linux/sched.h | 1 - kernel/fork.c | 32 ++++++++++++++++--- 9 files changed, 178 insertions(+), 122 deletions(-) create mode 100644 include/linux/iocontext.h diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 13553e015d72..533af75329e6 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -199,7 +199,7 @@ CFQ_CFQQ_FNS(sync); static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *, int, - struct task_struct *, gfp_t); + struct io_context *, gfp_t); static struct cfq_io_context *cfq_cic_rb_lookup(struct cfq_data *, struct io_context *); @@ -1273,7 +1273,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) return cic; } -static void cfq_init_prio_data(struct cfq_queue *cfqq) +static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) { struct task_struct *tsk = current; int ioprio_class; @@ -1281,7 +1281,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq) if (!cfq_cfqq_prio_changed(cfqq)) return; - ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio); + ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); switch (ioprio_class) { default: printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); @@ -1293,11 +1293,11 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq) cfqq->ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_RT: - cfqq->ioprio = task_ioprio(tsk); + cfqq->ioprio = task_ioprio(ioc); cfqq->ioprio_class = IOPRIO_CLASS_RT; break; case IOPRIO_CLASS_BE: - cfqq->ioprio = task_ioprio(tsk); + cfqq->ioprio = task_ioprio(ioc); cfqq->ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_IDLE: @@ -1330,8 +1330,7 @@ static inline void changed_ioprio(struct cfq_io_context *cic) cfqq = cic->cfqq[ASYNC]; if (cfqq) { struct cfq_queue *new_cfqq; - new_cfqq = cfq_get_queue(cfqd, ASYNC, cic->ioc->task, - GFP_ATOMIC); + new_cfqq = cfq_get_queue(cfqd, ASYNC, cic->ioc, GFP_ATOMIC); if (new_cfqq) { cic->cfqq[ASYNC] = new_cfqq; cfq_put_queue(cfqq); @@ -1363,13 +1362,13 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc) static struct cfq_queue * cfq_find_alloc_queue(struct cfq_data *cfqd, int is_sync, - struct task_struct *tsk, gfp_t gfp_mask) + struct io_context *ioc, gfp_t gfp_mask) { struct cfq_queue *cfqq, *new_cfqq = NULL; struct cfq_io_context *cic; retry: - cic = cfq_cic_rb_lookup(cfqd, tsk->io_context); + cic = cfq_cic_rb_lookup(cfqd, ioc); /* cic always exists here */ cfqq = cic_to_cfqq(cic, is_sync); @@ -1412,7 +1411,7 @@ retry: cfq_mark_cfqq_prio_changed(cfqq); cfq_mark_cfqq_queue_new(cfqq); - cfq_init_prio_data(cfqq); + cfq_init_prio_data(cfqq, ioc); } if (new_cfqq) @@ -1439,11 +1438,11 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) } static struct cfq_queue * -cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct task_struct *tsk, +cfq_get_queue(struct cfq_data *cfqd, int 
is_sync, struct io_context *ioc, gfp_t gfp_mask) { - const int ioprio = task_ioprio(tsk); - const int ioprio_class = task_ioprio_class(tsk); + const int ioprio = task_ioprio(ioc); + const int ioprio_class = task_ioprio_class(ioc); struct cfq_queue **async_cfqq = NULL; struct cfq_queue *cfqq = NULL; @@ -1453,7 +1452,7 @@ cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct task_struct *tsk, } if (!cfqq) { - cfqq = cfq_find_alloc_queue(cfqd, is_sync, tsk, gfp_mask); + cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); if (!cfqq) return NULL; } @@ -1793,7 +1792,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_queue *cfqq = RQ_CFQQ(rq); - cfq_init_prio_data(cfqq); + cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); cfq_add_rq_rb(rq); @@ -1900,7 +1899,7 @@ static int cfq_may_queue(struct request_queue *q, int rw) cfqq = cic_to_cfqq(cic, rw & REQ_RW_SYNC); if (cfqq) { - cfq_init_prio_data(cfqq); + cfq_init_prio_data(cfqq, cic->ioc); cfq_prio_boost(cfqq); return __cfq_may_queue(cfqq); @@ -1938,7 +1937,6 @@ static int cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) { struct cfq_data *cfqd = q->elevator->elevator_data; - struct task_struct *tsk = current; struct cfq_io_context *cic; const int rw = rq_data_dir(rq); const int is_sync = rq_is_sync(rq); @@ -1956,7 +1954,7 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq) { - cfqq = cfq_get_queue(cfqd, is_sync, tsk, gfp_mask); + cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); if (!cfqq) goto queue_fail; diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index 3d0422f48453..b9bb02e845cd 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -3904,6 +3904,26 @@ void exit_io_context(void) put_io_context(ioc); } +struct io_context *alloc_io_context(gfp_t gfp_flags, int node) +{ + struct io_context *ret; + + ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); + if (ret) { + atomic_set(&ret->refcount, 1); + ret->task = current; + ret->ioprio_changed = 0; + ret->ioprio = 0; + ret->last_waited = jiffies; /* doesn't matter... */ + ret->nr_batch_requests = 0; /* because this is 0 */ + ret->aic = NULL; + ret->cic_root.rb_node = NULL; + ret->ioc_data = NULL; + } + + return ret; +} + /* * If the current task has no IO context then create one and initialise it. * Otherwise, return its existing IO context. @@ -3921,16 +3941,8 @@ static struct io_context *current_io_context(gfp_t gfp_flags, int node) if (likely(ret)) return ret; - ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); + ret = alloc_io_context(gfp_flags, node); if (ret) { - atomic_set(&ret->refcount, 1); - ret->task = current; - ret->ioprio_changed = 0; - ret->last_waited = jiffies; /* doesn't matter... 
*/ - ret->nr_batch_requests = 0; /* because this is 0 */ - ret->aic = NULL; - ret->cic_root.rb_node = NULL; - ret->ioc_data = NULL; /* make sure set_task_ioprio() sees the settings above */ smp_wmb(); tsk->io_context = ret; diff --git a/fs/ioprio.c b/fs/ioprio.c index e4e01bc7f338..a7600401ecf7 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -41,18 +41,29 @@ static int set_task_ioprio(struct task_struct *task, int ioprio) return err; task_lock(task); + do { + ioc = task->io_context; + /* see wmb() in current_io_context() */ + smp_read_barrier_depends(); + if (ioc) + break; - task->ioprio = ioprio; - - ioc = task->io_context; - /* see wmb() in current_io_context() */ - smp_read_barrier_depends(); + ioc = alloc_io_context(GFP_ATOMIC, -1); + if (!ioc) { + err = -ENOMEM; + break; + } + task->io_context = ioc; + ioc->task = task; + } while (1); - if (ioc) + if (!err) { + ioc->ioprio = ioprio; ioc->ioprio_changed = 1; + } task_unlock(task); - return 0; + return err; } asmlinkage long sys_ioprio_set(int which, int who, int ioprio) @@ -148,7 +159,9 @@ static int get_task_ioprio(struct task_struct *p) ret = security_task_getioprio(p); if (ret) goto out; - ret = p->ioprio; + ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + if (p->io_context) + ret = p->io_context->ioprio; out: return ret; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 49b7a4c31a6d..510a18ba1ec5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -34,83 +34,10 @@ struct sg_io_hdr; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ -/* - * This is the per-process anticipatory I/O scheduler state. - */ -struct as_io_context { - spinlock_t lock; - - void (*dtor)(struct as_io_context *aic); /* destructor */ - void (*exit)(struct as_io_context *aic); /* called on task exit */ - - unsigned long state; - atomic_t nr_queued; /* queued reads & sync writes */ - atomic_t nr_dispatched; /* number of requests gone to the drivers */ - - /* IO History tracking */ - /* Thinktime */ - unsigned long last_end_request; - unsigned long ttime_total; - unsigned long ttime_samples; - unsigned long ttime_mean; - /* Layout pattern */ - unsigned int seek_samples; - sector_t last_request_pos; - u64 seek_total; - sector_t seek_mean; -}; - -struct cfq_queue; -struct cfq_io_context { - struct rb_node rb_node; - void *key; - - struct cfq_queue *cfqq[2]; - - struct io_context *ioc; - - unsigned long last_end_request; - sector_t last_request_pos; - - unsigned long ttime_total; - unsigned long ttime_samples; - unsigned long ttime_mean; - - unsigned int seek_samples; - u64 seek_total; - sector_t seek_mean; - - struct list_head queue_list; - - void (*dtor)(struct io_context *); /* destructor */ - void (*exit)(struct io_context *); /* called on task exit */ -}; - -/* - * This is the per-process I/O subsystem state. It is refcounted and - * kmalloc'ed. Currently all fields are modified in process io context - * (apart from the atomic refcount), so require no locking. 
- */ -struct io_context { - atomic_t refcount; - struct task_struct *task; - - unsigned int ioprio_changed; - - /* - * For request batching - */ - unsigned long last_waited; /* Time last woken after wait for request */ - int nr_batch_requests; /* Number of requests left in the batch */ - - struct as_io_context *aic; - struct rb_root cic_root; - void *ioc_data; -}; - void put_io_context(struct io_context *ioc); void exit_io_context(void); struct io_context *get_io_context(gfp_t gfp_flags, int node); +struct io_context *alloc_io_context(gfp_t gfp_flags, int node); void copy_io_context(struct io_context **pdst, struct io_context **psrc); void swap_io_context(struct io_context **ioc1, struct io_context **ioc2); @@ -894,6 +821,12 @@ static inline void exit_io_context(void) { } +static inline int put_io_context(struct io_context *ioc) +{ + return 1; +} + + #endif /* CONFIG_BLOCK */ #endif diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 796019b22b6f..e6b3f7080679 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -137,7 +137,6 @@ extern struct group_info init_groups; .time_slice = HZ, \ .nr_cpus_allowed = NR_CPUS, \ }, \ - .ioprio = 0, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h new file mode 100644 index 000000000000..186807ea62e2 --- /dev/null +++ b/include/linux/iocontext.h @@ -0,0 +1,79 @@ +#ifndef IOCONTEXT_H +#define IOCONTEXT_H + +/* + * This is the per-process anticipatory I/O scheduler state. + */ +struct as_io_context { + spinlock_t lock; + + void (*dtor)(struct as_io_context *aic); /* destructor */ + void (*exit)(struct as_io_context *aic); /* called on task exit */ + + unsigned long state; + atomic_t nr_queued; /* queued reads & sync writes */ + atomic_t nr_dispatched; /* number of requests gone to the drivers */ + + /* IO History tracking */ + /* Thinktime */ + unsigned long last_end_request; + unsigned long ttime_total; + unsigned long ttime_samples; + unsigned long ttime_mean; + /* Layout pattern */ + unsigned int seek_samples; + sector_t last_request_pos; + u64 seek_total; + sector_t seek_mean; +}; + +struct cfq_queue; +struct cfq_io_context { + struct rb_node rb_node; + void *key; + + struct cfq_queue *cfqq[2]; + + struct io_context *ioc; + + unsigned long last_end_request; + sector_t last_request_pos; + + unsigned long ttime_total; + unsigned long ttime_samples; + unsigned long ttime_mean; + + unsigned int seek_samples; + u64 seek_total; + sector_t seek_mean; + + struct list_head queue_list; + + void (*dtor)(struct io_context *); /* destructor */ + void (*exit)(struct io_context *); /* called on task exit */ +}; + +/* + * This is the per-process I/O subsystem state. It is refcounted and + * kmalloc'ed. Currently all fields are modified in process io context + * (apart from the atomic refcount), so require no locking. 
+ */ +struct io_context { + atomic_t refcount; + struct task_struct *task; + + unsigned short ioprio; + unsigned short ioprio_changed; + + /* + * For request batching + */ + unsigned long last_waited; /* Time last woken after wait for request */ + int nr_batch_requests; /* Number of requests left in the batch */ + + struct as_io_context *aic; + struct rb_root cic_root; + void *ioc_data; +}; + +#endif diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index baf29387cab4..2a3bb1bb7433 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -2,6 +2,7 @@ #define IOPRIO_H #include +#include /* * Gives us 8 prio classes with 13-bits of data for each class @@ -45,18 +46,18 @@ enum { * the cpu scheduler nice value to an io priority */ #define IOPRIO_NORM (4) -static inline int task_ioprio(struct task_struct *task) +static inline int task_ioprio(struct io_context *ioc) { - if (ioprio_valid(task->ioprio)) - return IOPRIO_PRIO_DATA(task->ioprio); + if (ioprio_valid(ioc->ioprio)) + return IOPRIO_PRIO_DATA(ioc->ioprio); return IOPRIO_NORM; } -static inline int task_ioprio_class(struct task_struct *task) +static inline int task_ioprio_class(struct io_context *ioc) { - if (ioprio_valid(task->ioprio)) - return IOPRIO_PRIO_CLASS(task->ioprio); + if (ioprio_valid(ioc->ioprio)) + return IOPRIO_PRIO_CLASS(ioc->ioprio); return IOPRIO_CLASS_BE; } diff --git a/include/linux/sched.h b/include/linux/sched.h index df5b24ee80b3..80837e7d527e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -975,7 +975,6 @@ struct task_struct { struct hlist_head preempt_notifiers; #endif - unsigned short ioprio; /* * fpu_counter contains the number of consecutive context switches * that the FPU is used. If this is over a threshold, the lazy fpu diff --git a/kernel/fork.c b/kernel/fork.c index 39d22b3357de..2a86c9dff744 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -791,6 +792,26 @@ out: return error; } +static int copy_io(struct task_struct *tsk) +{ +#ifdef CONFIG_BLOCK + struct io_context *ioc = current->io_context; + + if (!ioc) + return 0; + + if (ioprio_valid(ioc->ioprio)) { + tsk->io_context = alloc_io_context(GFP_KERNEL, -1); + if (unlikely(!tsk->io_context)) + return -ENOMEM; + + tsk->io_context->task = tsk; + tsk->io_context->ioprio = ioc->ioprio; + } +#endif + return 0; +} + /* * Helper to unshare the files of the current task. * We don't want to expose copy_files internals to @@ -1156,15 +1177,17 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_mm; if ((retval = copy_namespaces(clone_flags, p))) goto bad_fork_cleanup_keys; + if ((retval = copy_io(p))) + goto bad_fork_cleanup_namespaces; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) - goto bad_fork_cleanup_namespaces; + goto bad_fork_cleanup_io; if (pid != &init_struct_pid) { retval = -ENOMEM; pid = alloc_pid(task_active_pid_ns(p)); if (!pid) - goto bad_fork_cleanup_namespaces; + goto bad_fork_cleanup_io; if (clone_flags & CLONE_NEWPID) { retval = pid_ns_prepare_proc(task_active_pid_ns(p)); @@ -1234,9 +1257,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); - /* for sys_ioprio_set(IOPRIO_WHO_PGRP) */ - p->ioprio = current->ioprio; - /* * The task hasn't been attached yet, so its cpus_allowed mask will * not be changed, nor will its assigned CPU. 
@@ -1328,6 +1348,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); +bad_fork_cleanup_io: + put_io_context(p->io_context); bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_keys: -- cgit v1.2.3 From d38ecf935fcb10264a6bc190855d9595165e6eeb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Jan 2008 08:53:35 +0100 Subject: io context sharing: preliminary support Detach task state from ioc, instead keep track of how many processes are accessing the ioc. Signed-off-by: Jens Axboe --- block/ll_rw_blk.c | 46 ++++++++++++++++++++++++++++++---------------- fs/ioprio.c | 1 - include/linux/blkdev.h | 2 +- include/linux/iocontext.h | 22 ++++++++++++++++++---- kernel/fork.c | 1 - 5 files changed, 49 insertions(+), 23 deletions(-) diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index b9bb02e845cd..d4550ecae443 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -3854,12 +3854,13 @@ int __init blk_dev_init(void) } /* - * IO Context helper functions + * IO Context helper functions. put_io_context() returns 1 if there are no + * more users of this io context, 0 otherwise. */ -void put_io_context(struct io_context *ioc) +int put_io_context(struct io_context *ioc) { if (ioc == NULL) - return; + return 1; BUG_ON(atomic_read(&ioc->refcount) == 0); @@ -3878,7 +3879,9 @@ void put_io_context(struct io_context *ioc) rcu_read_unlock(); kmem_cache_free(iocontext_cachep, ioc); + return 1; } + return 0; } EXPORT_SYMBOL(put_io_context); @@ -3893,15 +3896,17 @@ void exit_io_context(void) current->io_context = NULL; task_unlock(current); - ioc->task = NULL; - if (ioc->aic && ioc->aic->exit) - ioc->aic->exit(ioc->aic); - if (ioc->cic_root.rb_node != NULL) { - cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node); - cic->exit(ioc); - } + if (atomic_dec_and_test(&ioc->nr_tasks)) { + if (ioc->aic && ioc->aic->exit) + ioc->aic->exit(ioc->aic); + if (ioc->cic_root.rb_node != NULL) { + cic = rb_entry(rb_first(&ioc->cic_root), + struct cfq_io_context, rb_node); + cic->exit(ioc); + } - put_io_context(ioc); + put_io_context(ioc); + } } struct io_context *alloc_io_context(gfp_t gfp_flags, int node) @@ -3911,7 +3916,8 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); if (ret) { atomic_set(&ret->refcount, 1); - ret->task = current; + atomic_set(&ret->nr_tasks, 1); + spin_lock_init(&ret->lock); ret->ioprio_changed = 0; ret->ioprio = 0; ret->last_waited = jiffies; /* doesn't matter... */ @@ -3959,10 +3965,18 @@ static struct io_context *current_io_context(gfp_t gfp_flags, int node) */ struct io_context *get_io_context(gfp_t gfp_flags, int node) { - struct io_context *ret; - ret = current_io_context(gfp_flags, node); - if (likely(ret)) - atomic_inc(&ret->refcount); + struct io_context *ret = NULL; + + /* + * Check for unlikely race with exiting task. ioc ref count is + * zero when ioc is being detached. 
+ */ + do { + ret = current_io_context(gfp_flags, node); + if (unlikely(!ret)) + break; + } while (!atomic_inc_not_zero(&ret->refcount)); + return ret; } EXPORT_SYMBOL(get_io_context); diff --git a/fs/ioprio.c b/fs/ioprio.c index a7600401ecf7..06b5d97c5fdd 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -54,7 +54,6 @@ static int set_task_ioprio(struct task_struct *task, int ioprio) break; } task->io_context = ioc; - ioc->task = task; } while (1); if (!err) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 510a18ba1ec5..2483a05231c7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -34,7 +34,7 @@ struct sg_io_hdr; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ -void put_io_context(struct io_context *ioc); +int put_io_context(struct io_context *ioc); void exit_io_context(void); struct io_context *get_io_context(gfp_t gfp_flags, int node); struct io_context *alloc_io_context(gfp_t gfp_flags, int node); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 186807ea62e2..cd44d458124a 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -54,13 +54,15 @@ struct cfq_io_context { }; /* - * This is the per-process I/O subsystem state. It is refcounted and - * kmalloc'ed. Currently all fields are modified in process io context - * (apart from the atomic refcount), so require no locking. + * I/O subsystem state of the associated processes. It is refcounted + * and kmalloc'ed. These could be shared between processes. */ struct io_context { atomic_t refcount; - struct task_struct *task; + atomic_t nr_tasks; + + /* all the fields below are protected by this lock */ + spinlock_t lock; unsigned short ioprio; unsigned short ioprio_changed; @@ -76,4 +78,16 @@ struct io_context { void *ioc_data; }; +static inline struct io_context *ioc_task_link(struct io_context *ioc) +{ + /* + * if ref count is zero, don't allow sharing (ioc is going away, it's + * a race). 
+ */ + if (ioc && atomic_inc_not_zero(&ioc->refcount)) + return ioc; + + return NULL; +} + #endif diff --git a/kernel/fork.c b/kernel/fork.c index 2a86c9dff744..1987c57abb08 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -805,7 +805,6 @@ static int copy_io(struct task_struct *tsk) if (unlikely(!tsk->io_context)) return -ENOMEM; - tsk->io_context->task = tsk; tsk->io_context->ioprio = ioc->ioprio; } #endif -- cgit v1.2.3 From 66dac98ed0de7a1125fb0dd7907f238f6b9d2f60 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Tue, 27 Nov 2007 12:47:04 +0100 Subject: io_context sharing - cfq changes changes in the cfq for io_context sharing Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 533af75329e6..dba52b62cef8 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -895,7 +895,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * task has exited, don't wait */ cic = cfqd->active_cic; - if (!cic || !cic->ioc->task) + if (!cic || !atomic_read(&cic->ioc->nr_tasks)) return; /* @@ -1178,6 +1178,8 @@ static void cfq_free_io_context(struct io_context *ioc) ioc->ioc_data = NULL; + spin_lock(&ioc->lock); + while ((n = rb_first(&ioc->cic_root)) != NULL) { __cic = rb_entry(n, struct cfq_io_context, rb_node); rb_erase(&__cic->rb_node, &ioc->cic_root); @@ -1189,6 +1191,8 @@ static void cfq_free_io_context(struct io_context *ioc) if (ioc_gone && !elv_ioc_count_read(ioc_count)) complete(ioc_gone); + + spin_unlock(&ioc->lock); } static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) @@ -1243,6 +1247,7 @@ static void cfq_exit_io_context(struct io_context *ioc) ioc->ioc_data = NULL; + spin_lock(&ioc->lock); /* * put the reference this task is holding to the various queues */ @@ -1253,6 +1258,8 @@ static void cfq_exit_io_context(struct io_context *ioc) cfq_exit_single_io_context(__cic); n = rb_next(n); } + + spin_unlock(&ioc->lock); } static struct cfq_io_context * @@ -1349,6 +1356,8 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc) struct cfq_io_context *cic; struct rb_node *n; + spin_lock(&ioc->lock); + ioc->ioprio_changed = 0; n = rb_first(&ioc->cic_root); @@ -1358,6 +1367,8 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc) changed_ioprio(cic); n = rb_next(n); } + + spin_unlock(&ioc->lock); } static struct cfq_queue * @@ -1502,6 +1513,7 @@ cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc) if (cic && cic->key == cfqd) return cic; + spin_lock(&ioc->lock); restart: n = ioc->cic_root.rb_node; while (n) { @@ -1519,10 +1531,12 @@ restart: n = n->rb_right; else { ioc->ioc_data = cic; + spin_unlock(&ioc->lock); return cic; } } + spin_unlock(&ioc->lock); return NULL; } @@ -1536,6 +1550,7 @@ cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, unsigned long flags; void *k; + spin_lock(&ioc->lock); cic->ioc = ioc; cic->key = cfqd; @@ -1566,6 +1581,7 @@ restart: spin_lock_irqsave(cfqd->queue->queue_lock, flags); list_add(&cic->queue_list, &cfqd->cic_list); spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); + spin_unlock(&ioc->lock); } /* @@ -1659,7 +1675,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, enable_idle = cfq_cfqq_idle_window(cfqq); - if (!cic->ioc->task || !cfqd->cfq_slice_idle || + if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || (cfqd->hw_tag && CIC_SEEKY(cic))) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { -- cgit v1.2.3 From 
4ac845a2e9a816ed5a7b301f56dcc0a3d0b1ba4d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Jan 2008 08:44:49 +0100 Subject: block: cfq: make the io contect sharing lockless The io context sharing introduced a per-ioc spinlock, that would protect the cfq io context lookup. That is a regression from the original, since we never needed any locking there because the ioc/cic were process private. The cic lookup is changed from an rbtree construct to a radix tree, which we can then use RCU to make the reader side lockless. That is the performance critical path, modifying the radix tree is only done on process creation (when that process first does IO, actually) and on process exit (if that process has done IO). As it so happens, radix trees are also much faster for this type of lookup where the key is a pointer. It's a very sparse tree. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 267 +++++++++++++++++++++++++--------------------- block/ll_rw_blk.c | 49 ++++++--- include/linux/iocontext.h | 6 +- 3 files changed, 185 insertions(+), 137 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index dba52b62cef8..8830893e062f 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -200,7 +200,7 @@ CFQ_CFQQ_FNS(sync); static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *, int, struct io_context *, gfp_t); -static struct cfq_io_context *cfq_cic_rb_lookup(struct cfq_data *, +static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, struct io_context *); static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, @@ -609,7 +609,7 @@ cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) struct cfq_io_context *cic; struct cfq_queue *cfqq; - cic = cfq_cic_rb_lookup(cfqd, tsk->io_context); + cic = cfq_cic_lookup(cfqd, tsk->io_context); if (!cic) return NULL; @@ -721,7 +721,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, * Lookup the cfqq that this bio will be queued with. Allow * merge only if rq is queued there. */ - cic = cfq_cic_rb_lookup(cfqd, current->io_context); + cic = cfq_cic_lookup(cfqd, current->io_context); if (!cic) return 0; @@ -1170,29 +1170,74 @@ static void cfq_put_queue(struct cfq_queue *cfqq) kmem_cache_free(cfq_pool, cfqq); } -static void cfq_free_io_context(struct io_context *ioc) +/* + * Call func for each cic attached to this ioc. Returns number of cic's seen. + */ +#define CIC_GANG_NR 16 +static unsigned int +call_for_each_cic(struct io_context *ioc, + void (*func)(struct io_context *, struct cfq_io_context *)) { - struct cfq_io_context *__cic; - struct rb_node *n; - int freed = 0; + struct cfq_io_context *cics[CIC_GANG_NR]; + unsigned long index = 0; + unsigned int called = 0; + int nr; - ioc->ioc_data = NULL; + rcu_read_lock(); - spin_lock(&ioc->lock); + do { + int i; - while ((n = rb_first(&ioc->cic_root)) != NULL) { - __cic = rb_entry(n, struct cfq_io_context, rb_node); - rb_erase(&__cic->rb_node, &ioc->cic_root); - kmem_cache_free(cfq_ioc_pool, __cic); - freed++; - } + /* + * Perhaps there's a better way - this just gang lookups from + * 0 to the end, restarting after each CIC_GANG_NR from the + * last key + 1. 
+ */ + nr = radix_tree_gang_lookup(&ioc->radix_root, (void **) cics, + index, CIC_GANG_NR); + if (!nr) + break; + + called += nr; + index = 1 + (unsigned long) cics[nr - 1]->key; + + for (i = 0; i < nr; i++) + func(ioc, cics[i]); + } while (nr == CIC_GANG_NR); + + rcu_read_unlock(); + + return called; +} + +static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) +{ + unsigned long flags; + + BUG_ON(!cic->dead_key); + + spin_lock_irqsave(&ioc->lock, flags); + radix_tree_delete(&ioc->radix_root, cic->dead_key); + spin_unlock_irqrestore(&ioc->lock, flags); + + kmem_cache_free(cfq_ioc_pool, cic); +} + +static void cfq_free_io_context(struct io_context *ioc) +{ + int freed; + + /* + * ioc->refcount is zero here, so no more cic's are allowed to be + * linked into this ioc. So it should be ok to iterate over the known + * list, we will see all cic's since no new ones are added. + */ + freed = call_for_each_cic(ioc, cic_free_func); elv_ioc_count_mod(ioc_count, -freed); if (ioc_gone && !elv_ioc_count_read(ioc_count)) complete(ioc_gone); - - spin_unlock(&ioc->lock); } static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) @@ -1209,7 +1254,12 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, struct cfq_io_context *cic) { list_del_init(&cic->queue_list); + + /* + * Make sure key == NULL is seen for dead queues + */ smp_wmb(); + cic->dead_key = (unsigned long) cic->key; cic->key = NULL; if (cic->cfqq[ASYNC]) { @@ -1223,16 +1273,18 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, } } -static void cfq_exit_single_io_context(struct cfq_io_context *cic) +static void cfq_exit_single_io_context(struct io_context *ioc, + struct cfq_io_context *cic) { struct cfq_data *cfqd = cic->key; if (cfqd) { struct request_queue *q = cfqd->queue; + unsigned long flags; - spin_lock_irq(q->queue_lock); + spin_lock_irqsave(q->queue_lock, flags); __cfq_exit_single_io_context(cfqd, cic); - spin_unlock_irq(q->queue_lock); + spin_unlock_irqrestore(q->queue_lock, flags); } } @@ -1242,24 +1294,8 @@ static void cfq_exit_single_io_context(struct cfq_io_context *cic) */ static void cfq_exit_io_context(struct io_context *ioc) { - struct cfq_io_context *__cic; - struct rb_node *n; - - ioc->ioc_data = NULL; - - spin_lock(&ioc->lock); - /* - * put the reference this task is holding to the various queues - */ - n = rb_first(&ioc->cic_root); - while (n != NULL) { - __cic = rb_entry(n, struct cfq_io_context, rb_node); - - cfq_exit_single_io_context(__cic); - n = rb_next(n); - } - - spin_unlock(&ioc->lock); + rcu_assign_pointer(ioc->ioc_data, NULL); + call_for_each_cic(ioc, cfq_exit_single_io_context); } static struct cfq_io_context * @@ -1323,7 +1359,8 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) cfq_clear_cfqq_prio_changed(cfqq); } -static inline void changed_ioprio(struct cfq_io_context *cic) +static inline void changed_ioprio(struct io_context *ioc, + struct cfq_io_context *cic) { struct cfq_data *cfqd = cic->key; struct cfq_queue *cfqq; @@ -1353,22 +1390,8 @@ static inline void changed_ioprio(struct cfq_io_context *cic) static void cfq_ioc_set_ioprio(struct io_context *ioc) { - struct cfq_io_context *cic; - struct rb_node *n; - - spin_lock(&ioc->lock); - + call_for_each_cic(ioc, changed_ioprio); ioc->ioprio_changed = 0; - - n = rb_first(&ioc->cic_root); - while (n != NULL) { - cic = rb_entry(n, struct cfq_io_context, rb_node); - - changed_ioprio(cic); - n = rb_next(n); - } - - spin_unlock(&ioc->lock); } static struct cfq_queue 
* @@ -1379,7 +1402,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, int is_sync, struct cfq_io_context *cic; retry: - cic = cfq_cic_rb_lookup(cfqd, ioc); + cic = cfq_cic_lookup(cfqd, ioc); /* cic always exists here */ cfqq = cic_to_cfqq(cic, is_sync); @@ -1480,28 +1503,42 @@ cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct io_context *ioc, return cfqq; } +static void cfq_cic_free(struct cfq_io_context *cic) +{ + kmem_cache_free(cfq_ioc_pool, cic); + elv_ioc_count_dec(ioc_count); + + if (ioc_gone && !elv_ioc_count_read(ioc_count)) + complete(ioc_gone); +} + /* * We drop cfq io contexts lazily, so we may find a dead one. */ static void -cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic) +cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, + struct cfq_io_context *cic) { + unsigned long flags; + WARN_ON(!list_empty(&cic->queue_list)); + spin_lock_irqsave(&ioc->lock, flags); + if (ioc->ioc_data == cic) - ioc->ioc_data = NULL; + rcu_assign_pointer(ioc->ioc_data, NULL); - rb_erase(&cic->rb_node, &ioc->cic_root); - kmem_cache_free(cfq_ioc_pool, cic); - elv_ioc_count_dec(ioc_count); + radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd); + spin_unlock_irqrestore(&ioc->lock, flags); + + cfq_cic_free(cic); } static struct cfq_io_context * -cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc) +cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) { - struct rb_node *n; struct cfq_io_context *cic; - void *k, *key = cfqd; + void *k; if (unlikely(!ioc)) return NULL; @@ -1509,79 +1546,65 @@ cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc) /* * we maintain a last-hit cache, to avoid browsing over the tree */ - cic = ioc->ioc_data; + cic = rcu_dereference(ioc->ioc_data); if (cic && cic->key == cfqd) return cic; - spin_lock(&ioc->lock); -restart: - n = ioc->cic_root.rb_node; - while (n) { - cic = rb_entry(n, struct cfq_io_context, rb_node); + do { + rcu_read_lock(); + cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd); + rcu_read_unlock(); + if (!cic) + break; /* ->key must be copied to avoid race with cfq_exit_queue() */ k = cic->key; if (unlikely(!k)) { - cfq_drop_dead_cic(ioc, cic); - goto restart; + cfq_drop_dead_cic(cfqd, ioc, cic); + continue; } - if (key < k) - n = n->rb_left; - else if (key > k) - n = n->rb_right; - else { - ioc->ioc_data = cic; - spin_unlock(&ioc->lock); - return cic; - } - } + rcu_assign_pointer(ioc->ioc_data, cic); + break; + } while (1); - spin_unlock(&ioc->lock); - return NULL; + return cic; } -static inline void +/* + * Add cic into ioc, using cfqd as the search key. This enables us to lookup + * the process specific cfq io context when entered from the block layer. + * Also adds the cic to a per-cfqd list, used when this queue is removed. 
+ */ +static inline int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, - struct cfq_io_context *cic) + struct cfq_io_context *cic, gfp_t gfp_mask) { - struct rb_node **p; - struct rb_node *parent; - struct cfq_io_context *__cic; unsigned long flags; - void *k; + int ret; - spin_lock(&ioc->lock); - cic->ioc = ioc; - cic->key = cfqd; + ret = radix_tree_preload(gfp_mask); + if (!ret) { + cic->ioc = ioc; + cic->key = cfqd; -restart: - parent = NULL; - p = &ioc->cic_root.rb_node; - while (*p) { - parent = *p; - __cic = rb_entry(parent, struct cfq_io_context, rb_node); - /* ->key must be copied to avoid race with cfq_exit_queue() */ - k = __cic->key; - if (unlikely(!k)) { - cfq_drop_dead_cic(ioc, __cic); - goto restart; - } + spin_lock_irqsave(&ioc->lock, flags); + ret = radix_tree_insert(&ioc->radix_root, + (unsigned long) cfqd, cic); + spin_unlock_irqrestore(&ioc->lock, flags); - if (cic->key < k) - p = &(*p)->rb_left; - else if (cic->key > k) - p = &(*p)->rb_right; - else - BUG(); + radix_tree_preload_end(); + + if (!ret) { + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + list_add(&cic->queue_list, &cfqd->cic_list); + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); + } } - rb_link_node(&cic->rb_node, parent, p); - rb_insert_color(&cic->rb_node, &ioc->cic_root); + if (ret) + printk(KERN_ERR "cfq: cic link failed!\n"); - spin_lock_irqsave(cfqd->queue->queue_lock, flags); - list_add(&cic->queue_list, &cfqd->cic_list); - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); - spin_unlock(&ioc->lock); + return ret; } /* @@ -1601,7 +1624,7 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) if (!ioc) return NULL; - cic = cfq_cic_rb_lookup(cfqd, ioc); + cic = cfq_cic_lookup(cfqd, ioc); if (cic) goto out; @@ -1609,13 +1632,17 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) if (cic == NULL) goto err; - cfq_cic_link(cfqd, ioc, cic); + if (cfq_cic_link(cfqd, ioc, cic, gfp_mask)) + goto err_free; + out: smp_read_barrier_depends(); if (unlikely(ioc->ioprio_changed)) cfq_ioc_set_ioprio(ioc); return cic; +err_free: + cfq_cic_free(cic); err: put_io_context(ioc); return NULL; @@ -1909,7 +1936,7 @@ static int cfq_may_queue(struct request_queue *q, int rw) * so just lookup a possibly existing queue, or return 'may queue' * if that fails */ - cic = cfq_cic_rb_lookup(cfqd, tsk->io_context); + cic = cfq_cic_lookup(cfqd, tsk->io_context); if (!cic) return ELV_MQUEUE_MAY; @@ -2174,7 +2201,7 @@ static int __init cfq_slab_setup(void) if (!cfq_pool) goto fail; - cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0); + cfq_ioc_pool = KMEM_CACHE(cfq_io_context, SLAB_DESTROY_BY_RCU); if (!cfq_ioc_pool) goto fail; diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index d4550ecae443..b901db63f6ae 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -3853,6 +3853,21 @@ int __init blk_dev_init(void) return 0; } +static void cfq_dtor(struct io_context *ioc) +{ + struct cfq_io_context *cic[1]; + int r; + + /* + * We don't have a specific key to lookup with, so use the gang + * lookup to just retrieve the first item stored. The cfq exit + * function will iterate the full tree, so any member will do. + */ + r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1); + if (r > 0) + cic[0]->dtor(ioc); +} + /* * IO Context helper functions. put_io_context() returns 1 if there are no * more users of this io context, 0 otherwise. 
@@ -3865,18 +3880,11 @@ int put_io_context(struct io_context *ioc) BUG_ON(atomic_read(&ioc->refcount) == 0); if (atomic_dec_and_test(&ioc->refcount)) { - struct cfq_io_context *cic; - rcu_read_lock(); if (ioc->aic && ioc->aic->dtor) ioc->aic->dtor(ioc->aic); - if (ioc->cic_root.rb_node != NULL) { - struct rb_node *n = rb_first(&ioc->cic_root); - - cic = rb_entry(n, struct cfq_io_context, rb_node); - cic->dtor(ioc); - } rcu_read_unlock(); + cfq_dtor(ioc); kmem_cache_free(iocontext_cachep, ioc); return 1; @@ -3885,11 +3893,26 @@ int put_io_context(struct io_context *ioc) } EXPORT_SYMBOL(put_io_context); +static void cfq_exit(struct io_context *ioc) +{ + struct cfq_io_context *cic[1]; + int r; + + rcu_read_lock(); + /* + * See comment for cfq_dtor() + */ + r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1); + rcu_read_unlock(); + + if (r > 0) + cic[0]->exit(ioc); +} + /* Called by the exitting task */ void exit_io_context(void) { struct io_context *ioc; - struct cfq_io_context *cic; task_lock(current); ioc = current->io_context; @@ -3899,11 +3922,7 @@ void exit_io_context(void) if (atomic_dec_and_test(&ioc->nr_tasks)) { if (ioc->aic && ioc->aic->exit) ioc->aic->exit(ioc->aic); - if (ioc->cic_root.rb_node != NULL) { - cic = rb_entry(rb_first(&ioc->cic_root), - struct cfq_io_context, rb_node); - cic->exit(ioc); - } + cfq_exit(ioc); put_io_context(ioc); } @@ -3923,7 +3942,7 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) ret->last_waited = jiffies; /* doesn't matter... */ ret->nr_batch_requests = 0; /* because this is 0 */ ret->aic = NULL; - ret->cic_root.rb_node = NULL; + INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); ret->ioc_data = NULL; } diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index cd44d458124a..593b222d9dcc 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -1,6 +1,8 @@ #ifndef IOCONTEXT_H #define IOCONTEXT_H +#include + /* * This is the per-process anticipatory I/O scheduler state. 
*/ @@ -29,8 +31,8 @@ struct as_io_context { struct cfq_queue; struct cfq_io_context { - struct rb_node rb_node; void *key; + unsigned long dead_key; struct cfq_queue *cfqq[2]; @@ -74,7 +76,7 @@ struct io_context { int nr_batch_requests; /* Number of requests left in the batch */ struct as_io_context *aic; - struct rb_root cic_root; + struct radix_tree_root radix_root; void *ioc_data; }; -- cgit v1.2.3 From 521f3bbdba6b92582ef8047df01b156668343542 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 21 Jan 2008 20:03:12 +0100 Subject: io_context sharing - anticipatory changes changes to anticipatory io scheduler for io_context sharing Signed-off-by: Jens Axboe --- block/as-iosched.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/block/as-iosched.c b/block/as-iosched.c index cb5e53b05c7c..b201d16a7102 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -170,9 +170,11 @@ static void free_as_io_context(struct as_io_context *aic) static void as_trim(struct io_context *ioc) { + spin_lock(&ioc->lock); if (ioc->aic) free_as_io_context(ioc->aic); ioc->aic = NULL; + spin_unlock(&ioc->lock); } /* Called when the task exits */ @@ -462,7 +464,9 @@ static void as_antic_timeout(unsigned long data) spin_lock_irqsave(q->queue_lock, flags); if (ad->antic_status == ANTIC_WAIT_REQ || ad->antic_status == ANTIC_WAIT_NEXT) { - struct as_io_context *aic = ad->io_context->aic; + struct as_io_context *aic; + spin_lock(&ad->io_context->lock); + aic = ad->io_context->aic; ad->antic_status = ANTIC_FINISHED; kblockd_schedule_work(&ad->antic_work); @@ -475,6 +479,7 @@ static void as_antic_timeout(unsigned long data) /* process not "saved" by a cooperating request */ ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8; } + spin_unlock(&ad->io_context->lock); } spin_unlock_irqrestore(q->queue_lock, flags); } @@ -635,9 +640,11 @@ static int as_can_break_anticipation(struct as_data *ad, struct request *rq) ioc = ad->io_context; BUG_ON(!ioc); + spin_lock(&ioc->lock); if (rq && ioc == RQ_IOC(rq)) { /* request from same process */ + spin_unlock(&ioc->lock); return 1; } @@ -646,20 +653,25 @@ static int as_can_break_anticipation(struct as_data *ad, struct request *rq) * In this situation status should really be FINISHED, * however the timer hasn't had the chance to run yet. 
*/ + spin_unlock(&ioc->lock); return 1; } aic = ioc->aic; - if (!aic) + if (!aic) { + spin_unlock(&ioc->lock); return 0; + } if (atomic_read(&aic->nr_queued) > 0) { /* process has more requests queued */ + spin_unlock(&ioc->lock); return 1; } if (atomic_read(&aic->nr_dispatched) > 0) { /* process has more requests dispatched */ + spin_unlock(&ioc->lock); return 1; } @@ -680,6 +692,7 @@ static int as_can_break_anticipation(struct as_data *ad, struct request *rq) } as_update_iohist(ad, aic, rq); + spin_unlock(&ioc->lock); return 1; } @@ -688,20 +701,27 @@ static int as_can_break_anticipation(struct as_data *ad, struct request *rq) if (aic->ttime_samples == 0) ad->exit_prob = (7*ad->exit_prob + 256)/8; - if (ad->exit_no_coop > 128) + if (ad->exit_no_coop > 128) { + spin_unlock(&ioc->lock); return 1; + } } if (aic->ttime_samples == 0) { - if (ad->new_ttime_mean > ad->antic_expire) + if (ad->new_ttime_mean > ad->antic_expire) { + spin_unlock(&ioc->lock); return 1; - if (ad->exit_prob * ad->exit_no_coop > 128*256) + } + if (ad->exit_prob * ad->exit_no_coop > 128*256) { + spin_unlock(&ioc->lock); return 1; + } } else if (aic->ttime_mean > ad->antic_expire) { /* the process thinks too much between requests */ + spin_unlock(&ioc->lock); return 1; } - + spin_unlock(&ioc->lock); return 0; } @@ -1255,7 +1275,9 @@ static void as_merged_requests(struct request_queue *q, struct request *req, * Don't copy here but swap, because when anext is * removed below, it must contain the unused context */ + double_spin_lock(&rioc->lock, &nioc->lock, rioc < nioc); swap_io_context(&rioc, &nioc); + double_spin_unlock(&rioc->lock, &nioc->lock, rioc < nioc); } } -- cgit v1.2.3 From fadad878cc0640cc9cd5569998bf54b693f7b38b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Jan 2008 08:54:47 +0100 Subject: kernel: add CLONE_IO to specifically request sharing of IO contexts syslets (or other threads/processes that want io context sharing) can set this to enforce sharing of io context. 
Signed-off-by: Jens Axboe --- include/linux/sched.h | 1 + kernel/fork.c | 14 ++++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 80837e7d527e..2d0546e884ea 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -27,6 +27,7 @@ #define CLONE_NEWUSER 0x10000000 /* New user namespace */ #define CLONE_NEWPID 0x20000000 /* New pid namespace */ #define CLONE_NEWNET 0x40000000 /* New network namespace */ +#define CLONE_IO 0x80000000 /* Clone io context */ /* * Scheduling policies diff --git a/kernel/fork.c b/kernel/fork.c index 1987c57abb08..314f5101d2b0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -792,15 +792,21 @@ out: return error; } -static int copy_io(struct task_struct *tsk) +static int copy_io(unsigned long clone_flags, struct task_struct *tsk) { #ifdef CONFIG_BLOCK struct io_context *ioc = current->io_context; if (!ioc) return 0; - - if (ioprio_valid(ioc->ioprio)) { + /* + * Share io context with parent, if CLONE_IO is set + */ + if (clone_flags & CLONE_IO) { + tsk->io_context = ioc_task_link(ioc); + if (unlikely(!tsk->io_context)) + return -ENOMEM; + } else if (ioprio_valid(ioc->ioprio)) { tsk->io_context = alloc_io_context(GFP_KERNEL, -1); if (unlikely(!tsk->io_context)) return -ENOMEM; @@ -1176,7 +1182,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_mm; if ((retval = copy_namespaces(clone_flags, p))) goto bad_fork_cleanup_keys; - if ((retval = copy_io(p))) + if ((retval = copy_io(clone_flags, p))) goto bad_fork_cleanup_namespaces; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) -- cgit v1.2.3 From 0871714e08fed7ba66cadad11b2e4f85a9dc9b96 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 28 Jan 2008 11:38:15 +0100 Subject: cfq-iosched: relax IOPRIO_CLASS_IDLE restrictions Currently you must be root to set idle io prio class on a process. This is due to the fact that the idle class is implemented as a true idle class, meaning that it will not make progress if someone else is requesting disk access. Unfortunately this means that it opens DOS opportunities by locking down file system resources, hence it is root only at the moment. This patch relaxes the idle class a little, by removing the truly idle part (which entails a grace period with associated timer). The modifications make the idle class as close to zero impact as can be done while still guaranteeing progress. This means we can relax the root-only criterion as well.
Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 117 +++++++++++++++------------------------------------- fs/ioprio.c | 2 - 2 files changed, 34 insertions(+), 85 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8830893e062f..8fe1a0ddb511 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -26,9 +26,9 @@ static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; /* - * grace period before allowing idle class to get disk access + * offset from end of service tree */ -#define CFQ_IDLE_GRACE (HZ / 10) +#define CFQ_IDLE_DELAY (HZ / 5) /* * below this threshold, we consider thinktime immediate @@ -98,8 +98,6 @@ struct cfq_data { struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; struct cfq_queue *async_idle_cfqq; - struct timer_list idle_class_timer; - sector_t last_position; unsigned long last_end_request; @@ -384,12 +382,15 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) /* * The below is leftmost cache rbtree addon */ -static struct rb_node *cfq_rb_first(struct cfq_rb_root *root) +static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) { if (!root->left) root->left = rb_first(&root->rb); - return root->left; + if (root->left) + return rb_entry(root->left, struct cfq_queue, rb_node); + + return NULL; } static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) @@ -446,12 +447,20 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, int add_front) { - struct rb_node **p = &cfqd->service_tree.rb.rb_node; - struct rb_node *parent = NULL; + struct rb_node **p, *parent; + struct cfq_queue *__cfqq; unsigned long rb_key; int left; - if (!add_front) { + if (cfq_class_idle(cfqq)) { + rb_key = CFQ_IDLE_DELAY; + parent = rb_last(&cfqd->service_tree.rb); + if (parent && parent != &cfqq->rb_node) { + __cfqq = rb_entry(parent, struct cfq_queue, rb_node); + rb_key += __cfqq->rb_key; + } else + rb_key += jiffies; + } else if (!add_front) { rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies; rb_key += cfqq->slice_resid; cfqq->slice_resid = 0; @@ -469,8 +478,9 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, } left = 1; + parent = NULL; + p = &cfqd->service_tree.rb.rb_node; while (*p) { - struct cfq_queue *__cfqq; struct rb_node **n; parent = *p; @@ -736,11 +746,6 @@ static inline void __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { if (cfqq) { - /* - * stop potential idle class queues waiting service - */ - del_timer(&cfqd->idle_class_timer); - cfqq->slice_end = 0; cfq_clear_cfqq_must_alloc_slice(cfqq); cfq_clear_cfqq_fifo_expire(cfqq); @@ -789,47 +794,16 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out) __cfq_slice_expired(cfqd, cfqq, timed_out); } -static int start_idle_class_timer(struct cfq_data *cfqd) -{ - unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE; - unsigned long now = jiffies; - - if (time_before(now, end) && - time_after_eq(now, cfqd->last_end_request)) { - mod_timer(&cfqd->idle_class_timer, end); - return 1; - } - - return 0; -} - /* * Get next queue for service. Unless we have a queue preemption, * we'll simply select the first cfqq in the service tree. 
*/ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { - struct cfq_queue *cfqq; - struct rb_node *n; - if (RB_EMPTY_ROOT(&cfqd->service_tree.rb)) return NULL; - n = cfq_rb_first(&cfqd->service_tree); - cfqq = rb_entry(n, struct cfq_queue, rb_node); - - if (cfq_class_idle(cfqq)) { - /* - * if we have idle queues and no rt or be queues had - * pending requests, either allow immediate service if - * the grace period has passed or arm the idle grace - * timer - */ - if (start_idle_class_timer(cfqd)) - cfqq = NULL; - } - - return cfqq; + return cfq_rb_first(&cfqd->service_tree); } /* @@ -1087,14 +1061,11 @@ static inline int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) */ static int cfq_forced_dispatch(struct cfq_data *cfqd) { + struct cfq_queue *cfqq; int dispatched = 0; - struct rb_node *n; - - while ((n = cfq_rb_first(&cfqd->service_tree)) != NULL) { - struct cfq_queue *cfqq = rb_entry(n, struct cfq_queue, rb_node); + while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL) dispatched += __cfq_forced_dispatch_cfqq(cfqq); - } cfq_slice_expired(cfqd, 0); @@ -1437,15 +1408,16 @@ retry: atomic_set(&cfqq->ref, 0); cfqq->cfqd = cfqd; - if (is_sync) { - cfq_mark_cfqq_idle_window(cfqq); - cfq_mark_cfqq_sync(cfqq); - } - cfq_mark_cfqq_prio_changed(cfqq); cfq_mark_cfqq_queue_new(cfqq); cfq_init_prio_data(cfqq, ioc); + + if (is_sync) { + if (!cfq_class_idle(cfqq)) + cfq_mark_cfqq_idle_window(cfqq); + cfq_mark_cfqq_sync(cfqq); + } } if (new_cfqq) @@ -1697,7 +1669,10 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, { int enable_idle; - if (!cfq_cfqq_sync(cfqq)) + /* + * Don't idle for async or idle io prio class + */ + if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) return; enable_idle = cfq_cfqq_idle_window(cfqq); @@ -1876,7 +1851,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_set_prio_slice(cfqd, cfqq); cfq_clear_cfqq_slice_new(cfqq); } - if (cfq_slice_used(cfqq)) + if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) cfq_slice_expired(cfqd, 1); else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) cfq_arm_slice_timer(cfqd); @@ -2080,29 +2055,9 @@ out_cont: spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); } -/* - * Timer running if an idle class queue is waiting for service - */ -static void cfq_idle_class_timer(unsigned long data) -{ - struct cfq_data *cfqd = (struct cfq_data *) data; - unsigned long flags; - - spin_lock_irqsave(cfqd->queue->queue_lock, flags); - - /* - * race with a non-idle queue, reset timer - */ - if (!start_idle_class_timer(cfqd)) - cfq_schedule_dispatch(cfqd); - - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); -} - static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) { del_timer_sync(&cfqd->idle_slice_timer); - del_timer_sync(&cfqd->idle_class_timer); kblockd_flush_work(&cfqd->unplug_work); } @@ -2167,10 +2122,6 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->idle_slice_timer.function = cfq_idle_slice_timer; cfqd->idle_slice_timer.data = (unsigned long) cfqd; - init_timer(&cfqd->idle_class_timer); - cfqd->idle_class_timer.function = cfq_idle_class_timer; - cfqd->idle_class_timer.data = (unsigned long) cfqd; - INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); cfqd->last_end_request = jiffies; diff --git a/fs/ioprio.c b/fs/ioprio.c index 06b5d97c5fdd..c4a1c3c65aac 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -85,8 +85,6 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) break; case IOPRIO_CLASS_IDLE: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; 
break; case IOPRIO_CLASS_NONE: if (data) -- cgit v1.2.3 From febffd61816ef6d4f84189468c1d47f1df55921e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 28 Jan 2008 13:19:43 +0100 Subject: cfq-iosched: kill some big inlines Use of inlines were a bit over the top, trim them down a bit. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8fe1a0ddb511..f28d1fb30608 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -534,8 +534,7 @@ static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) * add to busy list of queues for service, trying to be fair in ordering * the pending list according to last request service */ -static inline void -cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) { BUG_ON(cfq_cfqq_on_rr(cfqq)); cfq_mark_cfqq_on_rr(cfqq); @@ -548,8 +547,7 @@ cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Called when the cfqq no longer has requests pending, remove it from * the service tree. */ -static inline void -cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) { BUG_ON(!cfq_cfqq_on_rr(cfqq)); cfq_clear_cfqq_on_rr(cfqq); @@ -564,7 +562,7 @@ cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) /* * rb tree support functions */ -static inline void cfq_del_rq_rb(struct request *rq) +static void cfq_del_rq_rb(struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); struct cfq_data *cfqd = cfqq->cfqd; @@ -604,8 +602,7 @@ static void cfq_add_rq_rb(struct request *rq) BUG_ON(!cfqq->next_rq); } -static inline void -cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) +static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) { elv_rb_del(&cfqq->sort_list, rq); cfqq->queued[rq_is_sync(rq)]--; @@ -742,8 +739,8 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, return 0; } -static inline void -__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static void __cfq_set_active_queue(struct cfq_data *cfqd, + struct cfq_queue *cfqq) { if (cfqq) { cfqq->slice_end = 0; @@ -913,7 +910,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) /* * return expired entry, or NULL to just start from scratch in rbtree */ -static inline struct request *cfq_check_fifo(struct cfq_queue *cfqq) +static struct request *cfq_check_fifo(struct cfq_queue *cfqq) { struct cfq_data *cfqd = cfqq->cfqd; struct request *rq; @@ -1042,7 +1039,7 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq, return dispatched; } -static inline int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) +static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) { int dispatched = 0; @@ -1330,8 +1327,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) cfq_clear_cfqq_prio_changed(cfqq); } -static inline void changed_ioprio(struct io_context *ioc, - struct cfq_io_context *cic) +static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) { struct cfq_data *cfqd = cic->key; struct cfq_queue *cfqq; @@ -1547,9 +1543,8 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) * the process specific cfq io context when entered from the block layer. 
* Also adds the cic to a per-cfqd list, used when this queue is removed. */ -static inline int -cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, - struct cfq_io_context *cic, gfp_t gfp_mask) +static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, + struct cfq_io_context *cic, gfp_t gfp_mask) { unsigned long flags; int ret; -- cgit v1.2.3
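
Taken together, the series has a fairly simple user-visible shape: a task cloned with CLONE_IO shares its parent's io_context (and therefore its io priority), and the idle class can now be requested without CAP_SYS_ADMIN. The sketch below is not part of the patches; it is a minimal userspace illustration that assumes the raw syscalls are reachable as SYS_ioprio_set/SYS_ioprio_get, redefines the IOPRIO_* encoding macros and CLONE_IO locally (userspace headers of that era do not export them), assumes a downward-growing stack for clone(), and uses sleep() as crude synchronization.

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

/* ioprio encoding, mirroring the IOPRIO_* definitions used by the patches */
#define IOPRIO_CLASS_SHIFT		13
#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))
#define IOPRIO_WHO_PROCESS		1
#define IOPRIO_CLASS_IDLE		3

#ifndef CLONE_IO
#define CLONE_IO	0x80000000	/* value added by the CLONE_IO patch */
#endif

#define STACK_SIZE	(64 * 1024)

static int worker(void *arg)
{
	long prio;

	(void) arg;

	/* crude synchronization for the sketch: give the parent time to
	 * call ioprio_set() against the shared io_context */
	sleep(1);

	prio = syscall(SYS_ioprio_get, IOPRIO_WHO_PROCESS, 0);
	printf("child sees ioprio 0x%lx\n", prio);
	return 0;
}

int main(void)
{
	char *stack = malloc(STACK_SIZE);
	pid_t pid;

	if (!stack)
		return 1;

	/* CLONE_IO makes the child share this task's io_context */
	pid = clone(worker, stack + STACK_SIZE,
		    CLONE_IO | CLONE_VM | CLONE_FS | CLONE_FILES | SIGCHLD,
		    NULL);
	if (pid < 0) {
		perror("clone");
		return 1;
	}

	/* idle class no longer needs CAP_SYS_ADMIN after the last patch;
	 * because the io_context is shared, the child observes this value */
	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0,
		    IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
		perror("ioprio_set");

	waitpid(pid, NULL, 0);
	free(stack);
	return 0;
}

Without CLONE_IO the child would get, at most, a private copy of whatever priority was valid at fork time (see copy_io() above), so the later ioprio_set() in the parent would not be visible to it; with CLONE_IO both tasks reference the same io_context and the child prints the idle-class value.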