diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/sunrpc |
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'net/sunrpc')
33 files changed, 15934 insertions, 0 deletions
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile new file mode 100644 index 000000000000..46a2ce00a29b --- /dev/null +++ b/net/sunrpc/Makefile @@ -0,0 +1,15 @@ +# +# Makefile for Linux kernel SUN RPC +# + + +obj-$(CONFIG_SUNRPC) += sunrpc.o +obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ + +sunrpc-y := clnt.o xprt.o sched.o \ + auth.o auth_null.o auth_unix.o \ + svc.o svcsock.o svcauth.o svcauth_unix.o \ + pmap_clnt.o timer.o xdr.o \ + sunrpc_syms.o cache.o rpc_pipe.o +sunrpc-$(CONFIG_PROC_FS) += stats.o +sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c new file mode 100644 index 000000000000..9bcec9b927b9 --- /dev/null +++ b/net/sunrpc/auth.c @@ -0,0 +1,395 @@ +/* + * linux/net/sunrpc/auth.c + * + * Generic RPC client authentication API. + * + * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/errno.h> +#include <linux/socket.h> +#include <linux/sunrpc/clnt.h> +#include <linux/spinlock.h> + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static struct rpc_authops * auth_flavors[RPC_AUTH_MAXFLAVOR] = { + &authnull_ops, /* AUTH_NULL */ + &authunix_ops, /* AUTH_UNIX */ + NULL, /* others can be loadable modules */ +}; + +static u32 +pseudoflavor_to_flavor(u32 flavor) { + if (flavor >= RPC_AUTH_MAXFLAVOR) + return RPC_AUTH_GSS; + return flavor; +} + +int +rpcauth_register(struct rpc_authops *ops) +{ + rpc_authflavor_t flavor; + + if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) + return -EINVAL; + if (auth_flavors[flavor] != NULL) + return -EPERM; /* what else? */ + auth_flavors[flavor] = ops; + return 0; +} + +int +rpcauth_unregister(struct rpc_authops *ops) +{ + rpc_authflavor_t flavor; + + if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) + return -EINVAL; + if (auth_flavors[flavor] != ops) + return -EPERM; /* what else? */ + auth_flavors[flavor] = NULL; + return 0; +} + +struct rpc_auth * +rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt) +{ + struct rpc_auth *auth; + struct rpc_authops *ops; + u32 flavor = pseudoflavor_to_flavor(pseudoflavor); + + if (flavor >= RPC_AUTH_MAXFLAVOR || !(ops = auth_flavors[flavor])) + return NULL; + auth = ops->create(clnt, pseudoflavor); + if (!auth) + return NULL; + if (clnt->cl_auth) + rpcauth_destroy(clnt->cl_auth); + clnt->cl_auth = auth; + return auth; +} + +void +rpcauth_destroy(struct rpc_auth *auth) +{ + if (!atomic_dec_and_test(&auth->au_count)) + return; + auth->au_ops->destroy(auth); +} + +static DEFINE_SPINLOCK(rpc_credcache_lock); + +/* + * Initialize RPC credential cache + */ +int +rpcauth_init_credcache(struct rpc_auth *auth, unsigned long expire) +{ + struct rpc_cred_cache *new; + int i; + + new = (struct rpc_cred_cache *)kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return -ENOMEM; + for (i = 0; i < RPC_CREDCACHE_NR; i++) + INIT_HLIST_HEAD(&new->hashtable[i]); + new->expire = expire; + new->nextgc = jiffies + (expire >> 1); + auth->au_credcache = new; + return 0; +} + +/* + * Destroy a list of credentials + */ +static inline +void rpcauth_destroy_credlist(struct hlist_head *head) +{ + struct rpc_cred *cred; + + while (!hlist_empty(head)) { + cred = hlist_entry(head->first, struct rpc_cred, cr_hash); + hlist_del_init(&cred->cr_hash); + put_rpccred(cred); + } +} + +/* + * Clear the RPC credential cache, and delete those credentials + * that are not referenced. + */ +void +rpcauth_free_credcache(struct rpc_auth *auth) +{ + struct rpc_cred_cache *cache = auth->au_credcache; + HLIST_HEAD(free); + struct hlist_node *pos, *next; + struct rpc_cred *cred; + int i; + + spin_lock(&rpc_credcache_lock); + for (i = 0; i < RPC_CREDCACHE_NR; i++) { + hlist_for_each_safe(pos, next, &cache->hashtable[i]) { + cred = hlist_entry(pos, struct rpc_cred, cr_hash); + __hlist_del(&cred->cr_hash); + hlist_add_head(&cred->cr_hash, &free); + } + } + spin_unlock(&rpc_credcache_lock); + rpcauth_destroy_credlist(&free); +} + +static void +rpcauth_prune_expired(struct rpc_auth *auth, struct rpc_cred *cred, struct hlist_head *free) +{ + if (atomic_read(&cred->cr_count) != 1) + return; + if (time_after(jiffies, cred->cr_expire + auth->au_credcache->expire)) + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + if (!(cred->cr_flags & RPCAUTH_CRED_UPTODATE)) { + __hlist_del(&cred->cr_hash); + hlist_add_head(&cred->cr_hash, free); + } +} + +/* + * Remove stale credentials. Avoid sleeping inside the loop. + */ +static void +rpcauth_gc_credcache(struct rpc_auth *auth, struct hlist_head *free) +{ + struct rpc_cred_cache *cache = auth->au_credcache; + struct hlist_node *pos, *next; + struct rpc_cred *cred; + int i; + + dprintk("RPC: gc'ing RPC credentials for auth %p\n", auth); + for (i = 0; i < RPC_CREDCACHE_NR; i++) { + hlist_for_each_safe(pos, next, &cache->hashtable[i]) { + cred = hlist_entry(pos, struct rpc_cred, cr_hash); + rpcauth_prune_expired(auth, cred, free); + } + } + cache->nextgc = jiffies + cache->expire; +} + +/* + * Look up a process' credentials in the authentication cache + */ +struct rpc_cred * +rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, + int taskflags) +{ + struct rpc_cred_cache *cache = auth->au_credcache; + HLIST_HEAD(free); + struct hlist_node *pos, *next; + struct rpc_cred *new = NULL, + *cred = NULL; + int nr = 0; + + if (!(taskflags & RPC_TASK_ROOTCREDS)) + nr = acred->uid & RPC_CREDCACHE_MASK; +retry: + spin_lock(&rpc_credcache_lock); + if (time_before(cache->nextgc, jiffies)) + rpcauth_gc_credcache(auth, &free); + hlist_for_each_safe(pos, next, &cache->hashtable[nr]) { + struct rpc_cred *entry; + entry = hlist_entry(pos, struct rpc_cred, cr_hash); + if (entry->cr_ops->crmatch(acred, entry, taskflags)) { + hlist_del(&entry->cr_hash); + cred = entry; + break; + } + rpcauth_prune_expired(auth, entry, &free); + } + if (new) { + if (cred) + hlist_add_head(&new->cr_hash, &free); + else + cred = new; + } + if (cred) { + hlist_add_head(&cred->cr_hash, &cache->hashtable[nr]); + get_rpccred(cred); + } + spin_unlock(&rpc_credcache_lock); + + rpcauth_destroy_credlist(&free); + + if (!cred) { + new = auth->au_ops->crcreate(auth, acred, taskflags); + if (!IS_ERR(new)) { +#ifdef RPC_DEBUG + new->cr_magic = RPCAUTH_CRED_MAGIC; +#endif + goto retry; + } else + cred = new; + } + + return (struct rpc_cred *) cred; +} + +struct rpc_cred * +rpcauth_lookupcred(struct rpc_auth *auth, int taskflags) +{ + struct auth_cred acred = { + .uid = current->fsuid, + .gid = current->fsgid, + .group_info = current->group_info, + }; + struct rpc_cred *ret; + + dprintk("RPC: looking up %s cred\n", + auth->au_ops->au_name); + get_group_info(acred.group_info); + ret = auth->au_ops->lookup_cred(auth, &acred, taskflags); + put_group_info(acred.group_info); + return ret; +} + +struct rpc_cred * +rpcauth_bindcred(struct rpc_task *task) +{ + struct rpc_auth *auth = task->tk_auth; + struct auth_cred acred = { + .uid = current->fsuid, + .gid = current->fsgid, + .group_info = current->group_info, + }; + struct rpc_cred *ret; + + dprintk("RPC: %4d looking up %s cred\n", + task->tk_pid, task->tk_auth->au_ops->au_name); + get_group_info(acred.group_info); + ret = auth->au_ops->lookup_cred(auth, &acred, task->tk_flags); + if (!IS_ERR(ret)) + task->tk_msg.rpc_cred = ret; + else + task->tk_status = PTR_ERR(ret); + put_group_info(acred.group_info); + return ret; +} + +void +rpcauth_holdcred(struct rpc_task *task) +{ + dprintk("RPC: %4d holding %s cred %p\n", + task->tk_pid, task->tk_auth->au_ops->au_name, task->tk_msg.rpc_cred); + if (task->tk_msg.rpc_cred) + get_rpccred(task->tk_msg.rpc_cred); +} + +void +put_rpccred(struct rpc_cred *cred) +{ + cred->cr_expire = jiffies; + if (!atomic_dec_and_test(&cred->cr_count)) + return; + cred->cr_ops->crdestroy(cred); +} + +void +rpcauth_unbindcred(struct rpc_task *task) +{ + struct rpc_auth *auth = task->tk_auth; + struct rpc_cred *cred = task->tk_msg.rpc_cred; + + dprintk("RPC: %4d releasing %s cred %p\n", + task->tk_pid, auth->au_ops->au_name, cred); + + put_rpccred(cred); + task->tk_msg.rpc_cred = NULL; +} + +u32 * +rpcauth_marshcred(struct rpc_task *task, u32 *p) +{ + struct rpc_auth *auth = task->tk_auth; + struct rpc_cred *cred = task->tk_msg.rpc_cred; + + dprintk("RPC: %4d marshaling %s cred %p\n", + task->tk_pid, auth->au_ops->au_name, cred); + return cred->cr_ops->crmarshal(task, p); +} + +u32 * +rpcauth_checkverf(struct rpc_task *task, u32 *p) +{ + struct rpc_auth *auth = task->tk_auth; + struct rpc_cred *cred = task->tk_msg.rpc_cred; + + dprintk("RPC: %4d validating %s cred %p\n", + task->tk_pid, auth->au_ops->au_name, cred); + return cred->cr_ops->crvalidate(task, p); +} + +int +rpcauth_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp, + u32 *data, void *obj) +{ + struct rpc_cred *cred = task->tk_msg.rpc_cred; + + dprintk("RPC: %4d using %s cred %p to wrap rpc data\n", + task->tk_pid, cred->cr_ops->cr_name, cred); + if (cred->cr_ops->crwrap_req) + return cred->cr_ops->crwrap_req(task, encode, rqstp, data, obj); + /* By default, we encode the arguments normally. */ + return encode(rqstp, data, obj); +} + +int +rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, + u32 *data, void *obj) +{ + struct rpc_cred *cred = task->tk_msg.rpc_cred; + + dprintk("RPC: %4d using %s cred %p to unwrap rpc data\n", + task->tk_pid, cred->cr_ops->cr_name, cred); + if (cred->cr_ops->crunwrap_resp) + return cred->cr_ops->crunwrap_resp(task, decode, rqstp, + data, obj); + /* By default, we decode the arguments normally. */ + return decode(rqstp, data, obj); +} + +int +rpcauth_refreshcred(struct rpc_task *task) +{ + struct rpc_auth *auth = task->tk_auth; + struct rpc_cred *cred = task->tk_msg.rpc_cred; + int err; + + dprintk("RPC: %4d refreshing %s cred %p\n", + task->tk_pid, auth->au_ops->au_name, cred); + err = cred->cr_ops->crrefresh(task); + if (err < 0) + task->tk_status = err; + return err; +} + +void +rpcauth_invalcred(struct rpc_task *task) +{ + dprintk("RPC: %4d invalidating %s cred %p\n", + task->tk_pid, task->tk_auth->au_ops->au_name, task->tk_msg.rpc_cred); + spin_lock(&rpc_credcache_lock); + if (task->tk_msg.rpc_cred) + task->tk_msg.rpc_cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + spin_unlock(&rpc_credcache_lock); +} + +int +rpcauth_uptodatecred(struct rpc_task *task) +{ + return !(task->tk_msg.rpc_cred) || + (task->tk_msg.rpc_cred->cr_flags & RPCAUTH_CRED_UPTODATE); +} diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile new file mode 100644 index 000000000000..fe1b874084bc --- /dev/null +++ b/net/sunrpc/auth_gss/Makefile @@ -0,0 +1,18 @@ +# +# Makefile for Linux kernel rpcsec_gss implementation +# + +obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o + +auth_rpcgss-objs := auth_gss.o gss_generic_token.o \ + gss_mech_switch.o svcauth_gss.o gss_krb5_crypto.o + +obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o + +rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ + gss_krb5_seqnum.o + +obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o + +rpcsec_gss_spkm3-objs := gss_spkm3_mech.o gss_spkm3_seal.o gss_spkm3_unseal.o \ + gss_spkm3_token.o diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c new file mode 100644 index 000000000000..a33b627cbef4 --- /dev/null +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -0,0 +1,1152 @@ +/* + * linux/net/sunrpc/auth_gss.c + * + * RPCSEC_GSS client authentication. + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Dug Song <dugsong@monkey.org> + * Andy Adamson <andros@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $Id$ + */ + + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/sched.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/auth.h> +#include <linux/sunrpc/auth_gss.h> +#include <linux/sunrpc/svcauth_gss.h> +#include <linux/sunrpc/gss_err.h> +#include <linux/workqueue.h> +#include <linux/sunrpc/rpc_pipe_fs.h> +#include <linux/sunrpc/gss_api.h> +#include <asm/uaccess.h> + +static struct rpc_authops authgss_ops; + +static struct rpc_credops gss_credops; + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +#define NFS_NGROUPS 16 + +#define GSS_CRED_EXPIRE (60 * HZ) /* XXX: reasonable? */ +#define GSS_CRED_SLACK 1024 /* XXX: unused */ +/* length of a krb5 verifier (48), plus data added before arguments when + * using integrity (two 4-byte integers): */ +#define GSS_VERF_SLACK 56 + +/* XXX this define must match the gssd define +* as it is passed to gssd to signal the use of +* machine creds should be part of the shared rpc interface */ + +#define CA_RUN_AS_MACHINE 0x00000200 + +/* dump the buffer in `emacs-hexl' style */ +#define isprint(c) ((c > 0x1f) && (c < 0x7f)) + +static DEFINE_RWLOCK(gss_ctx_lock); + +struct gss_auth { + struct rpc_auth rpc_auth; + struct gss_api_mech *mech; + enum rpc_gss_svc service; + struct list_head upcalls; + struct rpc_clnt *client; + struct dentry *dentry; + char path[48]; + spinlock_t lock; +}; + +static void gss_destroy_ctx(struct gss_cl_ctx *); +static struct rpc_pipe_ops gss_upcall_ops; + +void +print_hexl(u32 *p, u_int length, u_int offset) +{ + u_int i, j, jm; + u8 c, *cp; + + dprintk("RPC: print_hexl: length %d\n",length); + dprintk("\n"); + cp = (u8 *) p; + + for (i = 0; i < length; i += 0x10) { + dprintk(" %04x: ", (u_int)(i + offset)); + jm = length - i; + jm = jm > 16 ? 16 : jm; + + for (j = 0; j < jm; j++) { + if ((j % 2) == 1) + dprintk("%02x ", (u_int)cp[i+j]); + else + dprintk("%02x", (u_int)cp[i+j]); + } + for (; j < 16; j++) { + if ((j % 2) == 1) + dprintk(" "); + else + dprintk(" "); + } + dprintk(" "); + + for (j = 0; j < jm; j++) { + c = cp[i+j]; + c = isprint(c) ? c : '.'; + dprintk("%c", c); + } + dprintk("\n"); + } +} + +EXPORT_SYMBOL(print_hexl); + +static inline struct gss_cl_ctx * +gss_get_ctx(struct gss_cl_ctx *ctx) +{ + atomic_inc(&ctx->count); + return ctx; +} + +static inline void +gss_put_ctx(struct gss_cl_ctx *ctx) +{ + if (atomic_dec_and_test(&ctx->count)) + gss_destroy_ctx(ctx); +} + +static void +gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx) +{ + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); + struct gss_cl_ctx *old; + write_lock(&gss_ctx_lock); + old = gss_cred->gc_ctx; + gss_cred->gc_ctx = ctx; + cred->cr_flags |= RPCAUTH_CRED_UPTODATE; + write_unlock(&gss_ctx_lock); + if (old) + gss_put_ctx(old); +} + +static int +gss_cred_is_uptodate_ctx(struct rpc_cred *cred) +{ + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); + int res = 0; + + read_lock(&gss_ctx_lock); + if ((cred->cr_flags & RPCAUTH_CRED_UPTODATE) && gss_cred->gc_ctx) + res = 1; + read_unlock(&gss_ctx_lock); + return res; +} + +static const void * +simple_get_bytes(const void *p, const void *end, void *res, size_t len) +{ + const void *q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + memcpy(res, p, len); + return q; +} + +static inline const void * +simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest) +{ + const void *q; + unsigned int len; + + p = simple_get_bytes(p, end, &len, sizeof(len)); + if (IS_ERR(p)) + return p; + q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + dest->data = kmalloc(len, GFP_KERNEL); + if (unlikely(dest->data == NULL)) + return ERR_PTR(-ENOMEM); + dest->len = len; + memcpy(dest->data, p, len); + return q; +} + +static struct gss_cl_ctx * +gss_cred_get_ctx(struct rpc_cred *cred) +{ + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); + struct gss_cl_ctx *ctx = NULL; + + read_lock(&gss_ctx_lock); + if (gss_cred->gc_ctx) + ctx = gss_get_ctx(gss_cred->gc_ctx); + read_unlock(&gss_ctx_lock); + return ctx; +} + +static struct gss_cl_ctx * +gss_alloc_context(void) +{ + struct gss_cl_ctx *ctx; + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (ctx != NULL) { + memset(ctx, 0, sizeof(*ctx)); + ctx->gc_proc = RPC_GSS_PROC_DATA; + ctx->gc_seq = 1; /* NetApp 6.4R1 doesn't accept seq. no. 0 */ + spin_lock_init(&ctx->gc_seq_lock); + atomic_set(&ctx->count,1); + } + return ctx; +} + +#define GSSD_MIN_TIMEOUT (60 * 60) +static const void * +gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct gss_api_mech *gm) +{ + const void *q; + unsigned int seclen; + unsigned int timeout; + u32 window_size; + int ret; + + /* First unsigned int gives the lifetime (in seconds) of the cred */ + p = simple_get_bytes(p, end, &timeout, sizeof(timeout)); + if (IS_ERR(p)) + goto err; + if (timeout == 0) + timeout = GSSD_MIN_TIMEOUT; + ctx->gc_expiry = jiffies + (unsigned long)timeout * HZ * 3 / 4; + /* Sequence number window. Determines the maximum number of simultaneous requests */ + p = simple_get_bytes(p, end, &window_size, sizeof(window_size)); + if (IS_ERR(p)) + goto err; + ctx->gc_win = window_size; + /* gssd signals an error by passing ctx->gc_win = 0: */ + if (ctx->gc_win == 0) { + /* in which case, p points to an error code which we ignore */ + p = ERR_PTR(-EACCES); + goto err; + } + /* copy the opaque wire context */ + p = simple_get_netobj(p, end, &ctx->gc_wire_ctx); + if (IS_ERR(p)) + goto err; + /* import the opaque security context */ + p = simple_get_bytes(p, end, &seclen, sizeof(seclen)); + if (IS_ERR(p)) + goto err; + q = (const void *)((const char *)p + seclen); + if (unlikely(q > end || q < p)) { + p = ERR_PTR(-EFAULT); + goto err; + } + ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx); + if (ret < 0) { + p = ERR_PTR(ret); + goto err; + } + return q; +err: + dprintk("RPC: gss_fill_context returning %ld\n", -PTR_ERR(p)); + return p; +} + + +struct gss_upcall_msg { + atomic_t count; + uid_t uid; + struct rpc_pipe_msg msg; + struct list_head list; + struct gss_auth *auth; + struct rpc_wait_queue rpc_waitqueue; + wait_queue_head_t waitqueue; + struct gss_cl_ctx *ctx; +}; + +static void +gss_release_msg(struct gss_upcall_msg *gss_msg) +{ + if (!atomic_dec_and_test(&gss_msg->count)) + return; + BUG_ON(!list_empty(&gss_msg->list)); + if (gss_msg->ctx != NULL) + gss_put_ctx(gss_msg->ctx); + kfree(gss_msg); +} + +static struct gss_upcall_msg * +__gss_find_upcall(struct gss_auth *gss_auth, uid_t uid) +{ + struct gss_upcall_msg *pos; + list_for_each_entry(pos, &gss_auth->upcalls, list) { + if (pos->uid != uid) + continue; + atomic_inc(&pos->count); + dprintk("RPC: gss_find_upcall found msg %p\n", pos); + return pos; + } + dprintk("RPC: gss_find_upcall found nothing\n"); + return NULL; +} + +/* Try to add a upcall to the pipefs queue. + * If an upcall owned by our uid already exists, then we return a reference + * to that upcall instead of adding the new upcall. + */ +static inline struct gss_upcall_msg * +gss_add_msg(struct gss_auth *gss_auth, struct gss_upcall_msg *gss_msg) +{ + struct gss_upcall_msg *old; + + spin_lock(&gss_auth->lock); + old = __gss_find_upcall(gss_auth, gss_msg->uid); + if (old == NULL) { + atomic_inc(&gss_msg->count); + list_add(&gss_msg->list, &gss_auth->upcalls); + } else + gss_msg = old; + spin_unlock(&gss_auth->lock); + return gss_msg; +} + +static void +__gss_unhash_msg(struct gss_upcall_msg *gss_msg) +{ + if (list_empty(&gss_msg->list)) + return; + list_del_init(&gss_msg->list); + rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); + wake_up_all(&gss_msg->waitqueue); + atomic_dec(&gss_msg->count); +} + +static void +gss_unhash_msg(struct gss_upcall_msg *gss_msg) +{ + struct gss_auth *gss_auth = gss_msg->auth; + + spin_lock(&gss_auth->lock); + __gss_unhash_msg(gss_msg); + spin_unlock(&gss_auth->lock); +} + +static void +gss_upcall_callback(struct rpc_task *task) +{ + struct gss_cred *gss_cred = container_of(task->tk_msg.rpc_cred, + struct gss_cred, gc_base); + struct gss_upcall_msg *gss_msg = gss_cred->gc_upcall; + + BUG_ON(gss_msg == NULL); + if (gss_msg->ctx) + gss_cred_set_ctx(task->tk_msg.rpc_cred, gss_get_ctx(gss_msg->ctx)); + else + task->tk_status = gss_msg->msg.errno; + spin_lock(&gss_msg->auth->lock); + gss_cred->gc_upcall = NULL; + rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); + spin_unlock(&gss_msg->auth->lock); + gss_release_msg(gss_msg); +} + +static inline struct gss_upcall_msg * +gss_alloc_msg(struct gss_auth *gss_auth, uid_t uid) +{ + struct gss_upcall_msg *gss_msg; + + gss_msg = kmalloc(sizeof(*gss_msg), GFP_KERNEL); + if (gss_msg != NULL) { + memset(gss_msg, 0, sizeof(*gss_msg)); + INIT_LIST_HEAD(&gss_msg->list); + rpc_init_wait_queue(&gss_msg->rpc_waitqueue, "RPCSEC_GSS upcall waitq"); + init_waitqueue_head(&gss_msg->waitqueue); + atomic_set(&gss_msg->count, 1); + gss_msg->msg.data = &gss_msg->uid; + gss_msg->msg.len = sizeof(gss_msg->uid); + gss_msg->uid = uid; + gss_msg->auth = gss_auth; + } + return gss_msg; +} + +static struct gss_upcall_msg * +gss_setup_upcall(struct rpc_clnt *clnt, struct gss_auth *gss_auth, struct rpc_cred *cred) +{ + struct gss_upcall_msg *gss_new, *gss_msg; + + gss_new = gss_alloc_msg(gss_auth, cred->cr_uid); + if (gss_new == NULL) + return ERR_PTR(-ENOMEM); + gss_msg = gss_add_msg(gss_auth, gss_new); + if (gss_msg == gss_new) { + int res = rpc_queue_upcall(gss_auth->dentry->d_inode, &gss_new->msg); + if (res) { + gss_unhash_msg(gss_new); + gss_msg = ERR_PTR(res); + } + } else + gss_release_msg(gss_new); + return gss_msg; +} + +static inline int +gss_refresh_upcall(struct rpc_task *task) +{ + struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct gss_auth *gss_auth = container_of(task->tk_client->cl_auth, + struct gss_auth, rpc_auth); + struct gss_cred *gss_cred = container_of(cred, + struct gss_cred, gc_base); + struct gss_upcall_msg *gss_msg; + int err = 0; + + dprintk("RPC: %4u gss_refresh_upcall for uid %u\n", task->tk_pid, cred->cr_uid); + gss_msg = gss_setup_upcall(task->tk_client, gss_auth, cred); + if (IS_ERR(gss_msg)) { + err = PTR_ERR(gss_msg); + goto out; + } + spin_lock(&gss_auth->lock); + if (gss_cred->gc_upcall != NULL) + rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL, NULL); + else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) { + task->tk_timeout = 0; + gss_cred->gc_upcall = gss_msg; + /* gss_upcall_callback will release the reference to gss_upcall_msg */ + atomic_inc(&gss_msg->count); + rpc_sleep_on(&gss_msg->rpc_waitqueue, task, gss_upcall_callback, NULL); + } else + err = gss_msg->msg.errno; + spin_unlock(&gss_auth->lock); + gss_release_msg(gss_msg); +out: + dprintk("RPC: %4u gss_refresh_upcall for uid %u result %d\n", task->tk_pid, + cred->cr_uid, err); + return err; +} + +static inline int +gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) +{ + struct rpc_cred *cred = &gss_cred->gc_base; + struct gss_upcall_msg *gss_msg; + DEFINE_WAIT(wait); + int err = 0; + + dprintk("RPC: gss_upcall for uid %u\n", cred->cr_uid); + gss_msg = gss_setup_upcall(gss_auth->client, gss_auth, cred); + if (IS_ERR(gss_msg)) { + err = PTR_ERR(gss_msg); + goto out; + } + for (;;) { + prepare_to_wait(&gss_msg->waitqueue, &wait, TASK_INTERRUPTIBLE); + spin_lock(&gss_auth->lock); + if (gss_msg->ctx != NULL || gss_msg->msg.errno < 0) { + spin_unlock(&gss_auth->lock); + break; + } + spin_unlock(&gss_auth->lock); + if (signalled()) { + err = -ERESTARTSYS; + goto out_intr; + } + schedule(); + } + if (gss_msg->ctx) + gss_cred_set_ctx(cred, gss_get_ctx(gss_msg->ctx)); + else + err = gss_msg->msg.errno; +out_intr: + finish_wait(&gss_msg->waitqueue, &wait); + gss_release_msg(gss_msg); +out: + dprintk("RPC: gss_create_upcall for uid %u result %d\n", cred->cr_uid, err); + return err; +} + +static ssize_t +gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, + char __user *dst, size_t buflen) +{ + char *data = (char *)msg->data + msg->copied; + ssize_t mlen = msg->len; + ssize_t left; + + if (mlen > buflen) + mlen = buflen; + left = copy_to_user(dst, data, mlen); + if (left < 0) { + msg->errno = left; + return left; + } + mlen -= left; + msg->copied += mlen; + msg->errno = 0; + return mlen; +} + +#define MSG_BUF_MAXSIZE 1024 + +static ssize_t +gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) +{ + const void *p, *end; + void *buf; + struct rpc_clnt *clnt; + struct gss_auth *gss_auth; + struct rpc_cred *cred; + struct gss_upcall_msg *gss_msg; + struct gss_cl_ctx *ctx; + uid_t uid; + int err = -EFBIG; + + if (mlen > MSG_BUF_MAXSIZE) + goto out; + err = -ENOMEM; + buf = kmalloc(mlen, GFP_KERNEL); + if (!buf) + goto out; + + clnt = RPC_I(filp->f_dentry->d_inode)->private; + err = -EFAULT; + if (copy_from_user(buf, src, mlen)) + goto err; + + end = (const void *)((char *)buf + mlen); + p = simple_get_bytes(buf, end, &uid, sizeof(uid)); + if (IS_ERR(p)) { + err = PTR_ERR(p); + goto err; + } + + err = -ENOMEM; + ctx = gss_alloc_context(); + if (ctx == NULL) + goto err; + err = 0; + gss_auth = container_of(clnt->cl_auth, struct gss_auth, rpc_auth); + p = gss_fill_context(p, end, ctx, gss_auth->mech); + if (IS_ERR(p)) { + err = PTR_ERR(p); + if (err != -EACCES) + goto err_put_ctx; + } + spin_lock(&gss_auth->lock); + gss_msg = __gss_find_upcall(gss_auth, uid); + if (gss_msg) { + if (err == 0 && gss_msg->ctx == NULL) + gss_msg->ctx = gss_get_ctx(ctx); + gss_msg->msg.errno = err; + __gss_unhash_msg(gss_msg); + spin_unlock(&gss_auth->lock); + gss_release_msg(gss_msg); + } else { + struct auth_cred acred = { .uid = uid }; + spin_unlock(&gss_auth->lock); + cred = rpcauth_lookup_credcache(clnt->cl_auth, &acred, 0); + if (IS_ERR(cred)) { + err = PTR_ERR(cred); + goto err_put_ctx; + } + gss_cred_set_ctx(cred, gss_get_ctx(ctx)); + } + gss_put_ctx(ctx); + kfree(buf); + dprintk("RPC: gss_pipe_downcall returning length %Zu\n", mlen); + return mlen; +err_put_ctx: + gss_put_ctx(ctx); +err: + kfree(buf); +out: + dprintk("RPC: gss_pipe_downcall returning %d\n", err); + return err; +} + +static void +gss_pipe_release(struct inode *inode) +{ + struct rpc_inode *rpci = RPC_I(inode); + struct rpc_clnt *clnt; + struct rpc_auth *auth; + struct gss_auth *gss_auth; + + clnt = rpci->private; + auth = clnt->cl_auth; + gss_auth = container_of(auth, struct gss_auth, rpc_auth); + spin_lock(&gss_auth->lock); + while (!list_empty(&gss_auth->upcalls)) { + struct gss_upcall_msg *gss_msg; + + gss_msg = list_entry(gss_auth->upcalls.next, + struct gss_upcall_msg, list); + gss_msg->msg.errno = -EPIPE; + atomic_inc(&gss_msg->count); + __gss_unhash_msg(gss_msg); + spin_unlock(&gss_auth->lock); + gss_release_msg(gss_msg); + spin_lock(&gss_auth->lock); + } + spin_unlock(&gss_auth->lock); +} + +static void +gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); + static unsigned long ratelimit; + + if (msg->errno < 0) { + dprintk("RPC: gss_pipe_destroy_msg releasing msg %p\n", + gss_msg); + atomic_inc(&gss_msg->count); + gss_unhash_msg(gss_msg); + if (msg->errno == -ETIMEDOUT || msg->errno == -EPIPE) { + unsigned long now = jiffies; + if (time_after(now, ratelimit)) { + printk(KERN_WARNING "RPC: AUTH_GSS upcall timed out.\n" + "Please check user daemon is running!\n"); + ratelimit = now + 15*HZ; + } + } + gss_release_msg(gss_msg); + } +} + +/* + * NOTE: we have the opportunity to use different + * parameters based on the input flavor (which must be a pseudoflavor) + */ +static struct rpc_auth * +gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor) +{ + struct gss_auth *gss_auth; + struct rpc_auth * auth; + + dprintk("RPC: creating GSS authenticator for client %p\n",clnt); + + if (!try_module_get(THIS_MODULE)) + return NULL; + if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL))) + goto out_dec; + gss_auth->client = clnt; + gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor); + if (!gss_auth->mech) { + printk(KERN_WARNING "%s: Pseudoflavor %d not found!", + __FUNCTION__, flavor); + goto err_free; + } + gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor); + /* FIXME: Will go away once privacy support is merged in */ + if (gss_auth->service == RPC_GSS_SVC_PRIVACY) + gss_auth->service = RPC_GSS_SVC_INTEGRITY; + INIT_LIST_HEAD(&gss_auth->upcalls); + spin_lock_init(&gss_auth->lock); + auth = &gss_auth->rpc_auth; + auth->au_cslack = GSS_CRED_SLACK >> 2; + auth->au_rslack = GSS_VERF_SLACK >> 2; + auth->au_ops = &authgss_ops; + auth->au_flavor = flavor; + atomic_set(&auth->au_count, 1); + + if (rpcauth_init_credcache(auth, GSS_CRED_EXPIRE) < 0) + goto err_put_mech; + + snprintf(gss_auth->path, sizeof(gss_auth->path), "%s/%s", + clnt->cl_pathname, + gss_auth->mech->gm_name); + gss_auth->dentry = rpc_mkpipe(gss_auth->path, clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); + if (IS_ERR(gss_auth->dentry)) + goto err_put_mech; + + return auth; +err_put_mech: + gss_mech_put(gss_auth->mech); +err_free: + kfree(gss_auth); +out_dec: + module_put(THIS_MODULE); + return NULL; +} + +static void +gss_destroy(struct rpc_auth *auth) +{ + struct gss_auth *gss_auth; + + dprintk("RPC: destroying GSS authenticator %p flavor %d\n", + auth, auth->au_flavor); + + gss_auth = container_of(auth, struct gss_auth, rpc_auth); + rpc_unlink(gss_auth->path); + gss_mech_put(gss_auth->mech); + + rpcauth_free_credcache(auth); + kfree(gss_auth); + module_put(THIS_MODULE); +} + +/* gss_destroy_cred (and gss_destroy_ctx) are used to clean up after failure + * to create a new cred or context, so they check that things have been + * allocated before freeing them. */ +static void +gss_destroy_ctx(struct gss_cl_ctx *ctx) +{ + dprintk("RPC: gss_destroy_ctx\n"); + + if (ctx->gc_gss_ctx) + gss_delete_sec_context(&ctx->gc_gss_ctx); + + kfree(ctx->gc_wire_ctx.data); + kfree(ctx); +} + +static void +gss_destroy_cred(struct rpc_cred *rc) +{ + struct gss_cred *cred = container_of(rc, struct gss_cred, gc_base); + + dprintk("RPC: gss_destroy_cred \n"); + + if (cred->gc_ctx) + gss_put_ctx(cred->gc_ctx); + kfree(cred); +} + +/* + * Lookup RPCSEC_GSS cred for the current process + */ +static struct rpc_cred * +gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int taskflags) +{ + return rpcauth_lookup_credcache(auth, acred, taskflags); +} + +static struct rpc_cred * +gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int taskflags) +{ + struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); + struct gss_cred *cred = NULL; + int err = -ENOMEM; + + dprintk("RPC: gss_create_cred for uid %d, flavor %d\n", + acred->uid, auth->au_flavor); + + if (!(cred = kmalloc(sizeof(*cred), GFP_KERNEL))) + goto out_err; + + memset(cred, 0, sizeof(*cred)); + atomic_set(&cred->gc_count, 1); + cred->gc_uid = acred->uid; + /* + * Note: in order to force a call to call_refresh(), we deliberately + * fail to flag the credential as RPCAUTH_CRED_UPTODATE. + */ + cred->gc_flags = 0; + cred->gc_base.cr_ops = &gss_credops; + cred->gc_service = gss_auth->service; + err = gss_create_upcall(gss_auth, cred); + if (err < 0) + goto out_err; + + return &cred->gc_base; + +out_err: + dprintk("RPC: gss_create_cred failed with error %d\n", err); + if (cred) gss_destroy_cred(&cred->gc_base); + return ERR_PTR(err); +} + +static int +gss_match(struct auth_cred *acred, struct rpc_cred *rc, int taskflags) +{ + struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); + + /* Don't match with creds that have expired. */ + if (gss_cred->gc_ctx && time_after(jiffies, gss_cred->gc_ctx->gc_expiry)) + return 0; + return (rc->cr_uid == acred->uid); +} + +/* +* Marshal credentials. +* Maybe we should keep a cached credential for performance reasons. +*/ +static u32 * +gss_marshal(struct rpc_task *task, u32 *p) +{ + struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, + gc_base); + struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + u32 *cred_len; + struct rpc_rqst *req = task->tk_rqstp; + u32 maj_stat = 0; + struct xdr_netobj mic; + struct kvec iov; + struct xdr_buf verf_buf; + + dprintk("RPC: %4u gss_marshal\n", task->tk_pid); + + *p++ = htonl(RPC_AUTH_GSS); + cred_len = p++; + + spin_lock(&ctx->gc_seq_lock); + req->rq_seqno = ctx->gc_seq++; + spin_unlock(&ctx->gc_seq_lock); + + *p++ = htonl((u32) RPC_GSS_VERSION); + *p++ = htonl((u32) ctx->gc_proc); + *p++ = htonl((u32) req->rq_seqno); + *p++ = htonl((u32) gss_cred->gc_service); + p = xdr_encode_netobj(p, &ctx->gc_wire_ctx); + *cred_len = htonl((p - (cred_len + 1)) << 2); + + /* We compute the checksum for the verifier over the xdr-encoded bytes + * starting with the xid and ending at the end of the credential: */ + iov.iov_base = req->rq_snd_buf.head[0].iov_base; + if (task->tk_client->cl_xprt->stream) + /* See clnt.c:call_header() */ + iov.iov_base += 4; + iov.iov_len = (u8 *)p - (u8 *)iov.iov_base; + xdr_buf_from_iov(&iov, &verf_buf); + + /* set verifier flavor*/ + *p++ = htonl(RPC_AUTH_GSS); + + mic.data = (u8 *)(p + 1); + maj_stat = gss_get_mic(ctx->gc_gss_ctx, + GSS_C_QOP_DEFAULT, + &verf_buf, &mic); + if (maj_stat == GSS_S_CONTEXT_EXPIRED) { + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + } else if (maj_stat != 0) { + printk("gss_marshal: gss_get_mic FAILED (%d)\n", maj_stat); + goto out_put_ctx; + } + p = xdr_encode_opaque(p, NULL, mic.len); + gss_put_ctx(ctx); + return p; +out_put_ctx: + gss_put_ctx(ctx); + return NULL; +} + +/* +* Refresh credentials. XXX - finish +*/ +static int +gss_refresh(struct rpc_task *task) +{ + + if (!gss_cred_is_uptodate_ctx(task->tk_msg.rpc_cred)) + return gss_refresh_upcall(task); + return 0; +} + +static u32 * +gss_validate(struct rpc_task *task, u32 *p) +{ + struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, + gc_base); + struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + u32 seq, qop_state; + struct kvec iov; + struct xdr_buf verf_buf; + struct xdr_netobj mic; + u32 flav,len; + u32 maj_stat; + + dprintk("RPC: %4u gss_validate\n", task->tk_pid); + + flav = ntohl(*p++); + if ((len = ntohl(*p++)) > RPC_MAX_AUTH_SIZE) + goto out_bad; + if (flav != RPC_AUTH_GSS) + goto out_bad; + seq = htonl(task->tk_rqstp->rq_seqno); + iov.iov_base = &seq; + iov.iov_len = sizeof(seq); + xdr_buf_from_iov(&iov, &verf_buf); + mic.data = (u8 *)p; + mic.len = len; + + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state); + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + if (maj_stat) + goto out_bad; + switch (gss_cred->gc_service) { + case RPC_GSS_SVC_NONE: + /* verifier data, flavor, length: */ + task->tk_auth->au_rslack = XDR_QUADLEN(len) + 2; + break; + case RPC_GSS_SVC_INTEGRITY: + /* verifier data, flavor, length, length, sequence number: */ + task->tk_auth->au_rslack = XDR_QUADLEN(len) + 4; + break; + case RPC_GSS_SVC_PRIVACY: + goto out_bad; + } + gss_put_ctx(ctx); + dprintk("RPC: %4u GSS gss_validate: gss_verify_mic succeeded.\n", + task->tk_pid); + return p + XDR_QUADLEN(len); +out_bad: + gss_put_ctx(ctx); + dprintk("RPC: %4u gss_validate failed.\n", task->tk_pid); + return NULL; +} + +static inline int +gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + kxdrproc_t encode, struct rpc_rqst *rqstp, u32 *p, void *obj) +{ + struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; + struct xdr_buf integ_buf; + u32 *integ_len = NULL; + struct xdr_netobj mic; + u32 offset, *q; + struct kvec *iov; + u32 maj_stat = 0; + int status = -EIO; + + integ_len = p++; + offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; + *p++ = htonl(rqstp->rq_seqno); + + status = encode(rqstp, p, obj); + if (status) + return status; + + if (xdr_buf_subsegment(snd_buf, &integ_buf, + offset, snd_buf->len - offset)) + return status; + *integ_len = htonl(integ_buf.len); + + /* guess whether we're in the head or the tail: */ + if (snd_buf->page_len || snd_buf->tail[0].iov_len) + iov = snd_buf->tail; + else + iov = snd_buf->head; + p = iov->iov_base + iov->iov_len; + mic.data = (u8 *)(p + 1); + + maj_stat = gss_get_mic(ctx->gc_gss_ctx, + GSS_C_QOP_DEFAULT, &integ_buf, &mic); + status = -EIO; /* XXX? */ + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + else if (maj_stat) + return status; + q = xdr_encode_opaque(p, NULL, mic.len); + + offset = (u8 *)q - (u8 *)p; + iov->iov_len += offset; + snd_buf->len += offset; + return 0; +} + +static int +gss_wrap_req(struct rpc_task *task, + kxdrproc_t encode, void *rqstp, u32 *p, void *obj) +{ + struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, + gc_base); + struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + int status = -EIO; + + dprintk("RPC: %4u gss_wrap_req\n", task->tk_pid); + if (ctx->gc_proc != RPC_GSS_PROC_DATA) { + /* The spec seems a little ambiguous here, but I think that not + * wrapping context destruction requests makes the most sense. + */ + status = encode(rqstp, p, obj); + goto out; + } + switch (gss_cred->gc_service) { + case RPC_GSS_SVC_NONE: + status = encode(rqstp, p, obj); + break; + case RPC_GSS_SVC_INTEGRITY: + status = gss_wrap_req_integ(cred, ctx, encode, + rqstp, p, obj); + break; + case RPC_GSS_SVC_PRIVACY: + break; + } +out: + gss_put_ctx(ctx); + dprintk("RPC: %4u gss_wrap_req returning %d\n", task->tk_pid, status); + return status; +} + +static inline int +gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + struct rpc_rqst *rqstp, u32 **p) +{ + struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; + struct xdr_buf integ_buf; + struct xdr_netobj mic; + u32 data_offset, mic_offset; + u32 integ_len; + u32 maj_stat; + int status = -EIO; + + integ_len = ntohl(*(*p)++); + if (integ_len & 3) + return status; + data_offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base; + mic_offset = integ_len + data_offset; + if (mic_offset > rcv_buf->len) + return status; + if (ntohl(*(*p)++) != rqstp->rq_seqno) + return status; + + if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, + mic_offset - data_offset)) + return status; + + if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset)) + return status; + + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, + &mic, NULL); + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + if (maj_stat != GSS_S_COMPLETE) + return status; + return 0; +} + +static int +gss_unwrap_resp(struct rpc_task *task, + kxdrproc_t decode, void *rqstp, u32 *p, void *obj) +{ + struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, + gc_base); + struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + int status = -EIO; + + if (ctx->gc_proc != RPC_GSS_PROC_DATA) + goto out_decode; + switch (gss_cred->gc_service) { + case RPC_GSS_SVC_NONE: + break; + case RPC_GSS_SVC_INTEGRITY: + status = gss_unwrap_resp_integ(cred, ctx, rqstp, &p); + if (status) + goto out; + break; + case RPC_GSS_SVC_PRIVACY: + break; + } +out_decode: + status = decode(rqstp, p, obj); +out: + gss_put_ctx(ctx); + dprintk("RPC: %4u gss_unwrap_resp returning %d\n", task->tk_pid, + status); + return status; +} + +static struct rpc_authops authgss_ops = { + .owner = THIS_MODULE, + .au_flavor = RPC_AUTH_GSS, +#ifdef RPC_DEBUG + .au_name = "RPCSEC_GSS", +#endif + .create = gss_create, + .destroy = gss_destroy, + .lookup_cred = gss_lookup_cred, + .crcreate = gss_create_cred +}; + +static struct rpc_credops gss_credops = { + .cr_name = "AUTH_GSS", + .crdestroy = gss_destroy_cred, + .crmatch = gss_match, + .crmarshal = gss_marshal, + .crrefresh = gss_refresh, + .crvalidate = gss_validate, + .crwrap_req = gss_wrap_req, + .crunwrap_resp = gss_unwrap_resp, +}; + +static struct rpc_pipe_ops gss_upcall_ops = { + .upcall = gss_pipe_upcall, + .downcall = gss_pipe_downcall, + .destroy_msg = gss_pipe_destroy_msg, + .release_pipe = gss_pipe_release, +}; + +/* + * Initialize RPCSEC_GSS module + */ +static int __init init_rpcsec_gss(void) +{ + int err = 0; + + err = rpcauth_register(&authgss_ops); + if (err) + goto out; + err = gss_svc_init(); + if (err) + goto out_unregister; + return 0; +out_unregister: + rpcauth_unregister(&authgss_ops); +out: + return err; +} + +static void __exit exit_rpcsec_gss(void) +{ + gss_svc_shutdown(); + rpcauth_unregister(&authgss_ops); +} + +MODULE_LICENSE("GPL"); +module_init(init_rpcsec_gss) +module_exit(exit_rpcsec_gss) diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c new file mode 100644 index 000000000000..826df44e7fca --- /dev/null +++ b/net/sunrpc/auth_gss/gss_generic_token.c @@ -0,0 +1,235 @@ +/* + * linux/net/sunrpc/gss_generic_token.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@umich.edu> + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include <linux/types.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/sunrpc/sched.h> +#include <linux/sunrpc/gss_asn1.h> + + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + + +/* TWRITE_STR from gssapiP_generic.h */ +#define TWRITE_STR(ptr, str, len) \ + memcpy((ptr), (char *) (str), (len)); \ + (ptr) += (len); + +/* XXXX this code currently makes the assumption that a mech oid will + never be longer than 127 bytes. This assumption is not inherent in + the interfaces, so the code can be fixed if the OSI namespace + balloons unexpectedly. */ + +/* Each token looks like this: + +0x60 tag for APPLICATION 0, SEQUENCE + (constructed, definite-length) + <length> possible multiple bytes, need to parse/generate + 0x06 tag for OBJECT IDENTIFIER + <moid_length> compile-time constant string (assume 1 byte) + <moid_bytes> compile-time constant string + <inner_bytes> the ANY containing the application token + bytes 0,1 are the token type + bytes 2,n are the token data + +For the purposes of this abstraction, the token "header" consists of +the sequence tag and length octets, the mech OID DER encoding, and the +first two inner bytes, which indicate the token type. The token +"body" consists of everything else. + +*/ + +static int +der_length_size( int length) +{ + if (length < (1<<7)) + return(1); + else if (length < (1<<8)) + return(2); +#if (SIZEOF_INT == 2) + else + return(3); +#else + else if (length < (1<<16)) + return(3); + else if (length < (1<<24)) + return(4); + else + return(5); +#endif +} + +static void +der_write_length(unsigned char **buf, int length) +{ + if (length < (1<<7)) { + *(*buf)++ = (unsigned char) length; + } else { + *(*buf)++ = (unsigned char) (der_length_size(length)+127); +#if (SIZEOF_INT > 2) + if (length >= (1<<24)) + *(*buf)++ = (unsigned char) (length>>24); + if (length >= (1<<16)) + *(*buf)++ = (unsigned char) ((length>>16)&0xff); +#endif + if (length >= (1<<8)) + *(*buf)++ = (unsigned char) ((length>>8)&0xff); + *(*buf)++ = (unsigned char) (length&0xff); + } +} + +/* returns decoded length, or < 0 on failure. Advances buf and + decrements bufsize */ + +static int +der_read_length(unsigned char **buf, int *bufsize) +{ + unsigned char sf; + int ret; + + if (*bufsize < 1) + return(-1); + sf = *(*buf)++; + (*bufsize)--; + if (sf & 0x80) { + if ((sf &= 0x7f) > ((*bufsize)-1)) + return(-1); + if (sf > SIZEOF_INT) + return (-1); + ret = 0; + for (; sf; sf--) { + ret = (ret<<8) + (*(*buf)++); + (*bufsize)--; + } + } else { + ret = sf; + } + + return(ret); +} + +/* returns the length of a token, given the mech oid and the body size */ + +int +g_token_size(struct xdr_netobj *mech, unsigned int body_size) +{ + /* set body_size to sequence contents size */ + body_size += 4 + (int) mech->len; /* NEED overflow check */ + return(1 + der_length_size(body_size) + body_size); +} + +EXPORT_SYMBOL(g_token_size); + +/* fills in a buffer with the token header. The buffer is assumed to + be the right size. buf is advanced past the token header */ + +void +g_make_token_header(struct xdr_netobj *mech, int body_size, unsigned char **buf) +{ + *(*buf)++ = 0x60; + der_write_length(buf, 4 + mech->len + body_size); + *(*buf)++ = 0x06; + *(*buf)++ = (unsigned char) mech->len; + TWRITE_STR(*buf, mech->data, ((int) mech->len)); +} + +EXPORT_SYMBOL(g_make_token_header); + +/* + * Given a buffer containing a token, reads and verifies the token, + * leaving buf advanced past the token header, and setting body_size + * to the number of remaining bytes. Returns 0 on success, + * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the + * mechanism in the token does not match the mech argument. buf and + * *body_size are left unmodified on error. + */ +u32 +g_verify_token_header(struct xdr_netobj *mech, int *body_size, + unsigned char **buf_in, int toksize) +{ + unsigned char *buf = *buf_in; + int seqsize; + struct xdr_netobj toid; + int ret = 0; + + if ((toksize-=1) < 0) + return(G_BAD_TOK_HEADER); + if (*buf++ != 0x60) + return(G_BAD_TOK_HEADER); + + if ((seqsize = der_read_length(&buf, &toksize)) < 0) + return(G_BAD_TOK_HEADER); + + if (seqsize != toksize) + return(G_BAD_TOK_HEADER); + + if ((toksize-=1) < 0) + return(G_BAD_TOK_HEADER); + if (*buf++ != 0x06) + return(G_BAD_TOK_HEADER); + + if ((toksize-=1) < 0) + return(G_BAD_TOK_HEADER); + toid.len = *buf++; + + if ((toksize-=toid.len) < 0) + return(G_BAD_TOK_HEADER); + toid.data = buf; + buf+=toid.len; + + if (! g_OID_equal(&toid, mech)) + ret = G_WRONG_MECH; + + /* G_WRONG_MECH is not returned immediately because it's more important + to return G_BAD_TOK_HEADER if the token header is in fact bad */ + + if ((toksize-=2) < 0) + return(G_BAD_TOK_HEADER); + + if (ret) + return(ret); + + if (!ret) { + *buf_in = buf; + *body_size = toksize; + } + + return(ret); +} + +EXPORT_SYMBOL(g_verify_token_header); + diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c new file mode 100644 index 000000000000..24c21f2a33a7 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -0,0 +1,209 @@ +/* + * linux/net/sunrpc/gss_krb5_crypto.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@umich.edu> + * Bruce Fields <bfields@umich.edu> + */ + +/* + * Copyright (C) 1998 by the FundsXpress, INC. + * + * All rights reserved. + * + * Export of this software from the United States of America may require + * a specific license from the United States Government. It is the + * responsibility of any person or organization contemplating export to + * obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of FundsXpress. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. FundsXpress makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <asm/scatterlist.h> +#include <linux/crypto.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/sunrpc/gss_krb5.h> + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +u32 +krb5_encrypt( + struct crypto_tfm *tfm, + void * iv, + void * in, + void * out, + int length) +{ + u32 ret = -EINVAL; + struct scatterlist sg[1]; + u8 local_iv[16] = {0}; + + dprintk("RPC: krb5_encrypt: input data:\n"); + print_hexl((u32 *)in, length, 0); + + if (length % crypto_tfm_alg_blocksize(tfm) != 0) + goto out; + + if (crypto_tfm_alg_ivsize(tfm) > 16) { + dprintk("RPC: gss_k5encrypt: tfm iv size to large %d\n", + crypto_tfm_alg_ivsize(tfm)); + goto out; + } + + if (iv) + memcpy(local_iv, iv, crypto_tfm_alg_ivsize(tfm)); + + memcpy(out, in, length); + sg[0].page = virt_to_page(out); + sg[0].offset = offset_in_page(out); + sg[0].length = length; + + ret = crypto_cipher_encrypt_iv(tfm, sg, sg, length, local_iv); + + dprintk("RPC: krb5_encrypt: output data:\n"); + print_hexl((u32 *)out, length, 0); +out: + dprintk("RPC: krb5_encrypt returns %d\n",ret); + return(ret); +} + +EXPORT_SYMBOL(krb5_encrypt); + +u32 +krb5_decrypt( + struct crypto_tfm *tfm, + void * iv, + void * in, + void * out, + int length) +{ + u32 ret = -EINVAL; + struct scatterlist sg[1]; + u8 local_iv[16] = {0}; + + dprintk("RPC: krb5_decrypt: input data:\n"); + print_hexl((u32 *)in, length, 0); + + if (length % crypto_tfm_alg_blocksize(tfm) != 0) + goto out; + + if (crypto_tfm_alg_ivsize(tfm) > 16) { + dprintk("RPC: gss_k5decrypt: tfm iv size to large %d\n", + crypto_tfm_alg_ivsize(tfm)); + goto out; + } + if (iv) + memcpy(local_iv,iv, crypto_tfm_alg_ivsize(tfm)); + + memcpy(out, in, length); + sg[0].page = virt_to_page(out); + sg[0].offset = offset_in_page(out); + sg[0].length = length; + + ret = crypto_cipher_decrypt_iv(tfm, sg, sg, length, local_iv); + + dprintk("RPC: krb5_decrypt: output_data:\n"); + print_hexl((u32 *)out, length, 0); +out: + dprintk("RPC: gss_k5decrypt returns %d\n",ret); + return(ret); +} + +EXPORT_SYMBOL(krb5_decrypt); + +static void +buf_to_sg(struct scatterlist *sg, char *ptr, int len) { + sg->page = virt_to_page(ptr); + sg->offset = offset_in_page(ptr); + sg->length = len; +} + +/* checksum the plaintext data and hdrlen bytes of the token header */ +s32 +make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, + struct xdr_netobj *cksum) +{ + char *cksumname; + struct crypto_tfm *tfm = NULL; /* XXX add to ctx? */ + struct scatterlist sg[1]; + u32 code = GSS_S_FAILURE; + int len, thislen, offset; + int i; + + switch (cksumtype) { + case CKSUMTYPE_RSA_MD5: + cksumname = "md5"; + break; + default: + dprintk("RPC: krb5_make_checksum:" + " unsupported checksum %d", cksumtype); + goto out; + } + if (!(tfm = crypto_alloc_tfm(cksumname, 0))) + goto out; + cksum->len = crypto_tfm_alg_digestsize(tfm); + if ((cksum->data = kmalloc(cksum->len, GFP_KERNEL)) == NULL) + goto out; + + crypto_digest_init(tfm); + buf_to_sg(sg, header, hdrlen); + crypto_digest_update(tfm, sg, 1); + if (body->head[0].iov_len) { + buf_to_sg(sg, body->head[0].iov_base, body->head[0].iov_len); + crypto_digest_update(tfm, sg, 1); + } + + len = body->page_len; + if (len != 0) { + offset = body->page_base & (PAGE_CACHE_SIZE - 1); + i = body->page_base >> PAGE_CACHE_SHIFT; + thislen = PAGE_CACHE_SIZE - offset; + do { + if (thislen > len) + thislen = len; + sg->page = body->pages[i]; + sg->offset = offset; + sg->length = thislen; + kmap(sg->page); /* XXX kmap_atomic? */ + crypto_digest_update(tfm, sg, 1); + kunmap(sg->page); + len -= thislen; + i++; + offset = 0; + thislen = PAGE_CACHE_SIZE; + } while(len != 0); + } + if (body->tail[0].iov_len) { + buf_to_sg(sg, body->tail[0].iov_base, body->tail[0].iov_len); + crypto_digest_update(tfm, sg, 1); + } + crypto_digest_final(tfm, cksum->data); + code = 0; +out: + if (tfm) + crypto_free_tfm(tfm); + return code; +} + +EXPORT_SYMBOL(make_checksum); diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c new file mode 100644 index 000000000000..cf726510df8e --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -0,0 +1,275 @@ +/* + * linux/net/sunrpc/gss_krb5_mech.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@umich.edu> + * J. Bruce Fields <bfields@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/sunrpc/auth.h> +#include <linux/in.h> +#include <linux/sunrpc/gss_krb5.h> +#include <linux/sunrpc/xdr.h> +#include <linux/crypto.h> + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static const void * +simple_get_bytes(const void *p, const void *end, void *res, int len) +{ + const void *q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + memcpy(res, p, len); + return q; +} + +static const void * +simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res) +{ + const void *q; + unsigned int len; + + p = simple_get_bytes(p, end, &len, sizeof(len)); + if (IS_ERR(p)) + return p; + q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + res->data = kmalloc(len, GFP_KERNEL); + if (unlikely(res->data == NULL)) + return ERR_PTR(-ENOMEM); + memcpy(res->data, p, len); + res->len = len; + return q; +} + +static inline const void * +get_key(const void *p, const void *end, struct crypto_tfm **res) +{ + struct xdr_netobj key; + int alg, alg_mode; + char *alg_name; + + p = simple_get_bytes(p, end, &alg, sizeof(alg)); + if (IS_ERR(p)) + goto out_err; + p = simple_get_netobj(p, end, &key); + if (IS_ERR(p)) + goto out_err; + + switch (alg) { + case ENCTYPE_DES_CBC_RAW: + alg_name = "des"; + alg_mode = CRYPTO_TFM_MODE_CBC; + break; + default: + dprintk("RPC: get_key: unsupported algorithm %d\n", alg); + goto out_err_free_key; + } + if (!(*res = crypto_alloc_tfm(alg_name, alg_mode))) + goto out_err_free_key; + if (crypto_cipher_setkey(*res, key.data, key.len)) + goto out_err_free_tfm; + + kfree(key.data); + return p; + +out_err_free_tfm: + crypto_free_tfm(*res); +out_err_free_key: + kfree(key.data); + p = ERR_PTR(-EINVAL); +out_err: + return p; +} + +static int +gss_import_sec_context_kerberos(const void *p, + size_t len, + struct gss_ctx *ctx_id) +{ + const void *end = (const void *)((const char *)p + len); + struct krb5_ctx *ctx; + + if (!(ctx = kmalloc(sizeof(*ctx), GFP_KERNEL))) + goto out_err; + memset(ctx, 0, sizeof(*ctx)); + + p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate)); + if (IS_ERR(p)) + goto out_err_free_ctx; + p = simple_get_bytes(p, end, &ctx->seed_init, sizeof(ctx->seed_init)); + if (IS_ERR(p)) + goto out_err_free_ctx; + p = simple_get_bytes(p, end, ctx->seed, sizeof(ctx->seed)); + if (IS_ERR(p)) + goto out_err_free_ctx; + p = simple_get_bytes(p, end, &ctx->signalg, sizeof(ctx->signalg)); + if (IS_ERR(p)) + goto out_err_free_ctx; + p = simple_get_bytes(p, end, &ctx->sealalg, sizeof(ctx->sealalg)); + if (IS_ERR(p)) + goto out_err_free_ctx; + p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); + if (IS_ERR(p)) + goto out_err_free_ctx; + p = simple_get_bytes(p, end, &ctx->seq_send, sizeof(ctx->seq_send)); + if (IS_ERR(p)) + goto out_err_free_ctx; + p = simple_get_netobj(p, end, &ctx->mech_used); + if (IS_ERR(p)) + goto out_err_free_ctx; + p = get_key(p, end, &ctx->enc); + if (IS_ERR(p)) + goto out_err_free_mech; + p = get_key(p, end, &ctx->seq); + if (IS_ERR(p)) + goto out_err_free_key1; + if (p != end) { + p = ERR_PTR(-EFAULT); + goto out_err_free_key2; + } + + ctx_id->internal_ctx_id = ctx; + dprintk("RPC: Succesfully imported new context.\n"); + return 0; + +out_err_free_key2: + crypto_free_tfm(ctx->seq); +out_err_free_key1: + crypto_free_tfm(ctx->enc); +out_err_free_mech: + kfree(ctx->mech_used.data); +out_err_free_ctx: + kfree(ctx); +out_err: + return PTR_ERR(p); +} + +static void +gss_delete_sec_context_kerberos(void *internal_ctx) { + struct krb5_ctx *kctx = internal_ctx; + + if (kctx->seq) + crypto_free_tfm(kctx->seq); + if (kctx->enc) + crypto_free_tfm(kctx->enc); + if (kctx->mech_used.data) + kfree(kctx->mech_used.data); + kfree(kctx); +} + +static u32 +gss_verify_mic_kerberos(struct gss_ctx *ctx, + struct xdr_buf *message, + struct xdr_netobj *mic_token, + u32 *qstate) { + u32 maj_stat = 0; + int qop_state; + struct krb5_ctx *kctx = ctx->internal_ctx_id; + + maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state, + KG_TOK_MIC_MSG); + if (!maj_stat && qop_state) + *qstate = qop_state; + + dprintk("RPC: gss_verify_mic_kerberos returning %d\n", maj_stat); + return maj_stat; +} + +static u32 +gss_get_mic_kerberos(struct gss_ctx *ctx, + u32 qop, + struct xdr_buf *message, + struct xdr_netobj *mic_token) { + u32 err = 0; + struct krb5_ctx *kctx = ctx->internal_ctx_id; + + err = krb5_make_token(kctx, qop, message, mic_token, KG_TOK_MIC_MSG); + + dprintk("RPC: gss_get_mic_kerberos returning %d\n",err); + + return err; +} + +static struct gss_api_ops gss_kerberos_ops = { + .gss_import_sec_context = gss_import_sec_context_kerberos, + .gss_get_mic = gss_get_mic_kerberos, + .gss_verify_mic = gss_verify_mic_kerberos, + .gss_delete_sec_context = gss_delete_sec_context_kerberos, +}; + +static struct pf_desc gss_kerberos_pfs[] = { + [0] = { + .pseudoflavor = RPC_AUTH_GSS_KRB5, + .service = RPC_GSS_SVC_NONE, + .name = "krb5", + }, + [1] = { + .pseudoflavor = RPC_AUTH_GSS_KRB5I, + .service = RPC_GSS_SVC_INTEGRITY, + .name = "krb5i", + }, +}; + +static struct gss_api_mech gss_kerberos_mech = { + .gm_name = "krb5", + .gm_owner = THIS_MODULE, + .gm_ops = &gss_kerberos_ops, + .gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs), + .gm_pfs = gss_kerberos_pfs, +}; + +static int __init init_kerberos_module(void) +{ + int status; + + status = gss_mech_register(&gss_kerberos_mech); + if (status) + printk("Failed to register kerberos gss mechanism!\n"); + return status; +} + +static void __exit cleanup_kerberos_module(void) +{ + gss_mech_unregister(&gss_kerberos_mech); +} + +MODULE_LICENSE("GPL"); +module_init(init_kerberos_module); +module_exit(cleanup_kerberos_module); diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c new file mode 100644 index 000000000000..afeeb8715a77 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -0,0 +1,176 @@ +/* + * linux/net/sunrpc/gss_krb5_seal.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5seal.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@umich.edu> + * J. Bruce Fields <bfields@umich.edu> + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * Copyright (C) 1998 by the FundsXpress, INC. + * + * All rights reserved. + * + * Export of this software from the United States of America may require + * a specific license from the United States Government. It is the + * responsibility of any person or organization contemplating export to + * obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of FundsXpress. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. FundsXpress makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/jiffies.h> +#include <linux/sunrpc/gss_krb5.h> +#include <linux/random.h> +#include <asm/scatterlist.h> +#include <linux/crypto.h> + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static inline int +gss_krb5_padding(int blocksize, int length) { + /* Most of the code is block-size independent but in practice we + * use only 8: */ + BUG_ON(blocksize != 8); + return 8 - (length & 7); +} + +u32 +krb5_make_token(struct krb5_ctx *ctx, int qop_req, + struct xdr_buf *text, struct xdr_netobj *token, + int toktype) +{ + s32 checksum_type; + struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; + int blocksize = 0, tmsglen; + unsigned char *ptr, *krb5_hdr, *msg_start; + s32 now; + + dprintk("RPC: gss_krb5_seal\n"); + + now = get_seconds(); + + if (qop_req != 0) + goto out_err; + + switch (ctx->signalg) { + case SGN_ALG_DES_MAC_MD5: + checksum_type = CKSUMTYPE_RSA_MD5; + break; + default: + dprintk("RPC: gss_krb5_seal: ctx->signalg %d not" + " supported\n", ctx->signalg); + goto out_err; + } + if (ctx->sealalg != SEAL_ALG_NONE && ctx->sealalg != SEAL_ALG_DES) { + dprintk("RPC: gss_krb5_seal: ctx->sealalg %d not supported\n", + ctx->sealalg); + goto out_err; + } + + if (toktype == KG_TOK_WRAP_MSG) { + blocksize = crypto_tfm_alg_blocksize(ctx->enc); + tmsglen = blocksize + text->len + + gss_krb5_padding(blocksize, blocksize + text->len); + } else { + tmsglen = 0; + } + + token->len = g_token_size(&ctx->mech_used, 22 + tmsglen); + + ptr = token->data; + g_make_token_header(&ctx->mech_used, 22 + tmsglen, &ptr); + + *ptr++ = (unsigned char) ((toktype>>8)&0xff); + *ptr++ = (unsigned char) (toktype&0xff); + + /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ + krb5_hdr = ptr - 2; + msg_start = krb5_hdr + 24; + + *(u16 *)(krb5_hdr + 2) = htons(ctx->signalg); + memset(krb5_hdr + 4, 0xff, 4); + if (toktype == KG_TOK_WRAP_MSG) + *(u16 *)(krb5_hdr + 4) = htons(ctx->sealalg); + + if (toktype == KG_TOK_WRAP_MSG) { + /* XXX removing support for now */ + goto out_err; + } else { /* Sign only. */ + if (make_checksum(checksum_type, krb5_hdr, 8, text, + &md5cksum)) + goto out_err; + } + + switch (ctx->signalg) { + case SGN_ALG_DES_MAC_MD5: + if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, + md5cksum.data, md5cksum.len)) + goto out_err; + memcpy(krb5_hdr + 16, + md5cksum.data + md5cksum.len - KRB5_CKSUM_LENGTH, + KRB5_CKSUM_LENGTH); + + dprintk("RPC: make_seal_token: cksum data: \n"); + print_hexl((u32 *) (krb5_hdr + 16), KRB5_CKSUM_LENGTH, 0); + break; + default: + BUG(); + } + + kfree(md5cksum.data); + + if ((krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff, + ctx->seq_send, krb5_hdr + 16, krb5_hdr + 8))) + goto out_err; + + ctx->seq_send++; + + return ((ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE); +out_err: + if (md5cksum.data) kfree(md5cksum.data); + return GSS_S_FAILURE; +} diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c new file mode 100644 index 000000000000..c53ead39118d --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c @@ -0,0 +1,88 @@ +/* + * linux/net/sunrpc/gss_krb5_seqnum.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/util_seqnum.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@umich.edu> + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/sunrpc/gss_krb5.h> +#include <linux/crypto.h> + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +s32 +krb5_make_seq_num(struct crypto_tfm *key, + int direction, + s32 seqnum, + unsigned char *cksum, unsigned char *buf) +{ + unsigned char plain[8]; + + plain[0] = (unsigned char) (seqnum & 0xff); + plain[1] = (unsigned char) ((seqnum >> 8) & 0xff); + plain[2] = (unsigned char) ((seqnum >> 16) & 0xff); + plain[3] = (unsigned char) ((seqnum >> 24) & 0xff); + + plain[4] = direction; + plain[5] = direction; + plain[6] = direction; + plain[7] = direction; + + return krb5_encrypt(key, cksum, plain, buf, 8); +} + +s32 +krb5_get_seq_num(struct crypto_tfm *key, + unsigned char *cksum, + unsigned char *buf, + int *direction, s32 * seqnum) +{ + s32 code; + unsigned char plain[8]; + + dprintk("RPC: krb5_get_seq_num:\n"); + + if ((code = krb5_decrypt(key, cksum, buf, plain, 8))) + return code; + + if ((plain[4] != plain[5]) || (plain[4] != plain[6]) + || (plain[4] != plain[7])) + return (s32)KG_BAD_SEQ; + + *direction = plain[4]; + + *seqnum = ((plain[0]) | + (plain[1] << 8) | (plain[2] << 16) | (plain[3] << 24)); + + return (0); +} diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c new file mode 100644 index 000000000000..8767fc53183d --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -0,0 +1,202 @@ +/* + * linux/net/sunrpc/gss_krb5_unseal.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5unseal.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@umich.edu> + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * Copyright (C) 1998 by the FundsXpress, INC. + * + * All rights reserved. + * + * Export of this software from the United States of America may require + * a specific license from the United States Government. It is the + * responsibility of any person or organization contemplating export to + * obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of FundsXpress. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. FundsXpress makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/jiffies.h> +#include <linux/sunrpc/gss_krb5.h> +#include <linux/crypto.h> + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + + +/* message_buffer is an input if toktype is MIC and an output if it is WRAP: + * If toktype is MIC: read_token is a mic token, and message_buffer is the + * data that the mic was supposedly taken over. + * If toktype is WRAP: read_token is a wrap token, and message_buffer is used + * to return the decrypted data. + */ + +/* XXX will need to change prototype and/or just split into a separate function + * when we add privacy (because read_token will be in pages too). */ +u32 +krb5_read_token(struct krb5_ctx *ctx, + struct xdr_netobj *read_token, + struct xdr_buf *message_buffer, + int *qop_state, int toktype) +{ + int signalg; + int sealalg; + s32 checksum_type; + struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; + s32 now; + int direction; + s32 seqnum; + unsigned char *ptr = (unsigned char *)read_token->data; + int bodysize; + u32 ret = GSS_S_DEFECTIVE_TOKEN; + + dprintk("RPC: krb5_read_token\n"); + + if (g_verify_token_header(&ctx->mech_used, &bodysize, &ptr, + read_token->len)) + goto out; + + if ((*ptr++ != ((toktype>>8)&0xff)) || (*ptr++ != (toktype&0xff))) + goto out; + + /* XXX sanity-check bodysize?? */ + + if (toktype == KG_TOK_WRAP_MSG) { + /* XXX gone */ + goto out; + } + + /* get the sign and seal algorithms */ + + signalg = ptr[0] + (ptr[1] << 8); + sealalg = ptr[2] + (ptr[3] << 8); + + /* Sanity checks */ + + if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) + goto out; + + if (((toktype != KG_TOK_WRAP_MSG) && (sealalg != 0xffff)) || + ((toktype == KG_TOK_WRAP_MSG) && (sealalg == 0xffff))) + goto out; + + /* in the current spec, there is only one valid seal algorithm per + key type, so a simple comparison is ok */ + + if ((toktype == KG_TOK_WRAP_MSG) && !(sealalg == ctx->sealalg)) + goto out; + + /* there are several mappings of seal algorithms to sign algorithms, + but few enough that we can try them all. */ + + if ((ctx->sealalg == SEAL_ALG_NONE && signalg > 1) || + (ctx->sealalg == SEAL_ALG_1 && signalg != SGN_ALG_3) || + (ctx->sealalg == SEAL_ALG_DES3KD && + signalg != SGN_ALG_HMAC_SHA1_DES3_KD)) + goto out; + + /* compute the checksum of the message */ + + /* initialize the the cksum */ + switch (signalg) { + case SGN_ALG_DES_MAC_MD5: + checksum_type = CKSUMTYPE_RSA_MD5; + break; + default: + ret = GSS_S_DEFECTIVE_TOKEN; + goto out; + } + + switch (signalg) { + case SGN_ALG_DES_MAC_MD5: + ret = make_checksum(checksum_type, ptr - 2, 8, + message_buffer, &md5cksum); + if (ret) + goto out; + + ret = krb5_encrypt(ctx->seq, NULL, md5cksum.data, + md5cksum.data, 16); + if (ret) + goto out; + + if (memcmp(md5cksum.data + 8, ptr + 14, 8)) { + ret = GSS_S_BAD_SIG; + goto out; + } + break; + default: + ret = GSS_S_DEFECTIVE_TOKEN; + goto out; + } + + /* it got through unscathed. Make sure the context is unexpired */ + + if (qop_state) + *qop_state = GSS_C_QOP_DEFAULT; + + now = get_seconds(); + + ret = GSS_S_CONTEXT_EXPIRED; + if (now > ctx->endtime) + goto out; + + /* do sequencing checks */ + + ret = GSS_S_BAD_SIG; + if ((ret = krb5_get_seq_num(ctx->seq, ptr + 14, ptr + 6, &direction, + &seqnum))) + goto out; + + if ((ctx->initiate && direction != 0xff) || + (!ctx->initiate && direction != 0)) + goto out; + + ret = GSS_S_COMPLETE; +out: + if (md5cksum.data) kfree(md5cksum.data); + return ret; +} diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c new file mode 100644 index 000000000000..9dfb68377d69 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -0,0 +1,301 @@ +/* + * linux/net/sunrpc/gss_mech_switch.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * J. Bruce Fields <bfields@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/socket.h> +#include <linux/module.h> +#include <linux/sunrpc/msg_prot.h> +#include <linux/sunrpc/gss_asn1.h> +#include <linux/sunrpc/auth_gss.h> +#include <linux/sunrpc/svcauth_gss.h> +#include <linux/sunrpc/gss_err.h> +#include <linux/sunrpc/sched.h> +#include <linux/sunrpc/gss_api.h> +#include <linux/sunrpc/clnt.h> + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static LIST_HEAD(registered_mechs); +static DEFINE_SPINLOCK(registered_mechs_lock); + +static void +gss_mech_free(struct gss_api_mech *gm) +{ + struct pf_desc *pf; + int i; + + for (i = 0; i < gm->gm_pf_num; i++) { + pf = &gm->gm_pfs[i]; + if (pf->auth_domain_name) + kfree(pf->auth_domain_name); + pf->auth_domain_name = NULL; + } +} + +static inline char * +make_auth_domain_name(char *name) +{ + static char *prefix = "gss/"; + char *new; + + new = kmalloc(strlen(name) + strlen(prefix) + 1, GFP_KERNEL); + if (new) { + strcpy(new, prefix); + strcat(new, name); + } + return new; +} + +static int +gss_mech_svc_setup(struct gss_api_mech *gm) +{ + struct pf_desc *pf; + int i, status; + + for (i = 0; i < gm->gm_pf_num; i++) { + pf = &gm->gm_pfs[i]; + pf->auth_domain_name = make_auth_domain_name(pf->name); + status = -ENOMEM; + if (pf->auth_domain_name == NULL) + goto out; + status = svcauth_gss_register_pseudoflavor(pf->pseudoflavor, + pf->auth_domain_name); + if (status) + goto out; + } + return 0; +out: + gss_mech_free(gm); + return status; +} + +int +gss_mech_register(struct gss_api_mech *gm) +{ + int status; + + status = gss_mech_svc_setup(gm); + if (status) + return status; + spin_lock(®istered_mechs_lock); + list_add(&gm->gm_list, ®istered_mechs); + spin_unlock(®istered_mechs_lock); + dprintk("RPC: registered gss mechanism %s\n", gm->gm_name); + return 0; +} + +EXPORT_SYMBOL(gss_mech_register); + +void +gss_mech_unregister(struct gss_api_mech *gm) +{ + spin_lock(®istered_mechs_lock); + list_del(&gm->gm_list); + spin_unlock(®istered_mechs_lock); + dprintk("RPC: unregistered gss mechanism %s\n", gm->gm_name); + gss_mech_free(gm); +} + +EXPORT_SYMBOL(gss_mech_unregister); + +struct gss_api_mech * +gss_mech_get(struct gss_api_mech *gm) +{ + __module_get(gm->gm_owner); + return gm; +} + +EXPORT_SYMBOL(gss_mech_get); + +struct gss_api_mech * +gss_mech_get_by_name(const char *name) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (0 == strcmp(name, pos->gm_name)) { + if (try_module_get(pos->gm_owner)) + gm = pos; + break; + } + } + spin_unlock(®istered_mechs_lock); + return gm; + +} + +EXPORT_SYMBOL(gss_mech_get_by_name); + +static inline int +mech_supports_pseudoflavor(struct gss_api_mech *gm, u32 pseudoflavor) +{ + int i; + + for (i = 0; i < gm->gm_pf_num; i++) { + if (gm->gm_pfs[i].pseudoflavor == pseudoflavor) + return 1; + } + return 0; +} + +struct gss_api_mech * +gss_mech_get_by_pseudoflavor(u32 pseudoflavor) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (!mech_supports_pseudoflavor(pos, pseudoflavor)) { + module_put(pos->gm_owner); + continue; + } + if (try_module_get(pos->gm_owner)) + gm = pos; + break; + } + spin_unlock(®istered_mechs_lock); + return gm; +} + +EXPORT_SYMBOL(gss_mech_get_by_pseudoflavor); + +u32 +gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor) +{ + int i; + + for (i = 0; i < gm->gm_pf_num; i++) { + if (gm->gm_pfs[i].pseudoflavor == pseudoflavor) + return gm->gm_pfs[i].service; + } + return 0; +} + +EXPORT_SYMBOL(gss_pseudoflavor_to_service); + +char * +gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service) +{ + int i; + + for (i = 0; i < gm->gm_pf_num; i++) { + if (gm->gm_pfs[i].service == service) + return gm->gm_pfs[i].auth_domain_name; + } + return NULL; +} + +EXPORT_SYMBOL(gss_service_to_auth_domain_name); + +void +gss_mech_put(struct gss_api_mech * gm) +{ + module_put(gm->gm_owner); +} + +EXPORT_SYMBOL(gss_mech_put); + +/* The mech could probably be determined from the token instead, but it's just + * as easy for now to pass it in. */ +int +gss_import_sec_context(const void *input_token, size_t bufsize, + struct gss_api_mech *mech, + struct gss_ctx **ctx_id) +{ + if (!(*ctx_id = kmalloc(sizeof(**ctx_id), GFP_KERNEL))) + return GSS_S_FAILURE; + memset(*ctx_id, 0, sizeof(**ctx_id)); + (*ctx_id)->mech_type = gss_mech_get(mech); + + return mech->gm_ops + ->gss_import_sec_context(input_token, bufsize, *ctx_id); +} + +/* gss_get_mic: compute a mic over message and return mic_token. */ + +u32 +gss_get_mic(struct gss_ctx *context_handle, + u32 qop, + struct xdr_buf *message, + struct xdr_netobj *mic_token) +{ + return context_handle->mech_type->gm_ops + ->gss_get_mic(context_handle, + qop, + message, + mic_token); +} + +/* gss_verify_mic: check whether the provided mic_token verifies message. */ + +u32 +gss_verify_mic(struct gss_ctx *context_handle, + struct xdr_buf *message, + struct xdr_netobj *mic_token, + u32 *qstate) +{ + return context_handle->mech_type->gm_ops + ->gss_verify_mic(context_handle, + message, + mic_token, + qstate); +} + +/* gss_delete_sec_context: free all resources associated with context_handle. + * Note this differs from the RFC 2744-specified prototype in that we don't + * bother returning an output token, since it would never be used anyway. */ + +u32 +gss_delete_sec_context(struct gss_ctx **context_handle) +{ + dprintk("RPC: gss_delete_sec_context deleting %p\n", + *context_handle); + + if (!*context_handle) + return(GSS_S_NO_CONTEXT); + if ((*context_handle)->internal_ctx_id != 0) + (*context_handle)->mech_type->gm_ops + ->gss_delete_sec_context((*context_handle) + ->internal_ctx_id); + if ((*context_handle)->mech_type) + gss_mech_put((*context_handle)->mech_type); + kfree(*context_handle); + *context_handle=NULL; + return GSS_S_COMPLETE; +} diff --git a/net/sunrpc/auth_gss/gss_spkm3_mech.c b/net/sunrpc/auth_gss/gss_spkm3_mech.c new file mode 100644 index 000000000000..dad05994c3eb --- /dev/null +++ b/net/sunrpc/auth_gss/gss_spkm3_mech.c @@ -0,0 +1,300 @@ +/* + * linux/net/sunrpc/gss_spkm3_mech.c + * + * Copyright (c) 2003 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@umich.edu> + * J. Bruce Fields <bfields@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/sunrpc/auth.h> +#include <linux/in.h> +#include <linux/sunrpc/svcauth_gss.h> +#include <linux/sunrpc/gss_spkm3.h> +#include <linux/sunrpc/xdr.h> +#include <linux/crypto.h> + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static const void * +simple_get_bytes(const void *p, const void *end, void *res, int len) +{ + const void *q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + memcpy(res, p, len); + return q; +} + +static const void * +simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res) +{ + const void *q; + unsigned int len; + p = simple_get_bytes(p, end, &len, sizeof(len)); + if (IS_ERR(p)) + return p; + res->len = len; + if (len == 0) { + res->data = NULL; + return p; + } + q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + res->data = kmalloc(len, GFP_KERNEL); + if (unlikely(res->data == NULL)) + return ERR_PTR(-ENOMEM); + memcpy(res->data, p, len); + return q; +} + +static inline const void * +get_key(const void *p, const void *end, struct crypto_tfm **res, int *resalg) +{ + struct xdr_netobj key = { 0 }; + int alg_mode,setkey = 0; + char *alg_name; + + p = simple_get_bytes(p, end, resalg, sizeof(*resalg)); + if (IS_ERR(p)) + goto out_err; + p = simple_get_netobj(p, end, &key); + if (IS_ERR(p)) + goto out_err; + + switch (*resalg) { + case NID_des_cbc: + alg_name = "des"; + alg_mode = CRYPTO_TFM_MODE_CBC; + setkey = 1; + break; + case NID_md5: + if (key.len == 0) { + dprintk("RPC: SPKM3 get_key: NID_md5 zero Key length\n"); + } + alg_name = "md5"; + alg_mode = 0; + setkey = 0; + break; + default: + dprintk("RPC: SPKM3 get_key: unsupported algorithm %d", *resalg); + goto out_err_free_key; + } + if (!(*res = crypto_alloc_tfm(alg_name, alg_mode))) + goto out_err_free_key; + if (setkey) { + if (crypto_cipher_setkey(*res, key.data, key.len)) + goto out_err_free_tfm; + } + + if(key.len > 0) + kfree(key.data); + return p; + +out_err_free_tfm: + crypto_free_tfm(*res); +out_err_free_key: + if(key.len > 0) + kfree(key.data); + p = ERR_PTR(-EINVAL); +out_err: + return p; +} + +static int +gss_import_sec_context_spkm3(const void *p, size_t len, + struct gss_ctx *ctx_id) +{ + const void *end = (const void *)((const char *)p + len); + struct spkm3_ctx *ctx; + + if (!(ctx = kmalloc(sizeof(*ctx), GFP_KERNEL))) + goto out_err; + memset(ctx, 0, sizeof(*ctx)); + + p = simple_get_netobj(p, end, &ctx->ctx_id); + if (IS_ERR(p)) + goto out_err_free_ctx; + + p = simple_get_bytes(p, end, &ctx->qop, sizeof(ctx->qop)); + if (IS_ERR(p)) + goto out_err_free_ctx_id; + + p = simple_get_netobj(p, end, &ctx->mech_used); + if (IS_ERR(p)) + goto out_err_free_mech; + + p = simple_get_bytes(p, end, &ctx->ret_flags, sizeof(ctx->ret_flags)); + if (IS_ERR(p)) + goto out_err_free_mech; + + p = simple_get_bytes(p, end, &ctx->req_flags, sizeof(ctx->req_flags)); + if (IS_ERR(p)) + goto out_err_free_mech; + + p = simple_get_netobj(p, end, &ctx->share_key); + if (IS_ERR(p)) + goto out_err_free_s_key; + + p = get_key(p, end, &ctx->derived_conf_key, &ctx->conf_alg); + if (IS_ERR(p)) + goto out_err_free_s_key; + + p = get_key(p, end, &ctx->derived_integ_key, &ctx->intg_alg); + if (IS_ERR(p)) + goto out_err_free_key1; + + p = simple_get_bytes(p, end, &ctx->keyestb_alg, sizeof(ctx->keyestb_alg)); + if (IS_ERR(p)) + goto out_err_free_key2; + + p = simple_get_bytes(p, end, &ctx->owf_alg, sizeof(ctx->owf_alg)); + if (IS_ERR(p)) + goto out_err_free_key2; + + if (p != end) + goto out_err_free_key2; + + ctx_id->internal_ctx_id = ctx; + + dprintk("Succesfully imported new spkm context.\n"); + return 0; + +out_err_free_key2: + crypto_free_tfm(ctx->derived_integ_key); +out_err_free_key1: + crypto_free_tfm(ctx->derived_conf_key); +out_err_free_s_key: + kfree(ctx->share_key.data); +out_err_free_mech: + kfree(ctx->mech_used.data); +out_err_free_ctx_id: + kfree(ctx->ctx_id.data); +out_err_free_ctx: + kfree(ctx); +out_err: + return PTR_ERR(p); +} + +static void +gss_delete_sec_context_spkm3(void *internal_ctx) { + struct spkm3_ctx *sctx = internal_ctx; + + if(sctx->derived_integ_key) + crypto_free_tfm(sctx->derived_integ_key); + if(sctx->derived_conf_key) + crypto_free_tfm(sctx->derived_conf_key); + if(sctx->share_key.data) + kfree(sctx->share_key.data); + if(sctx->mech_used.data) + kfree(sctx->mech_used.data); + kfree(sctx); +} + +static u32 +gss_verify_mic_spkm3(struct gss_ctx *ctx, + struct xdr_buf *signbuf, + struct xdr_netobj *checksum, + u32 *qstate) { + u32 maj_stat = 0; + int qop_state = 0; + struct spkm3_ctx *sctx = ctx->internal_ctx_id; + + dprintk("RPC: gss_verify_mic_spkm3 calling spkm3_read_token\n"); + maj_stat = spkm3_read_token(sctx, checksum, signbuf, &qop_state, + SPKM_MIC_TOK); + + if (!maj_stat && qop_state) + *qstate = qop_state; + + dprintk("RPC: gss_verify_mic_spkm3 returning %d\n", maj_stat); + return maj_stat; +} + +static u32 +gss_get_mic_spkm3(struct gss_ctx *ctx, + u32 qop, + struct xdr_buf *message_buffer, + struct xdr_netobj *message_token) { + u32 err = 0; + struct spkm3_ctx *sctx = ctx->internal_ctx_id; + + dprintk("RPC: gss_get_mic_spkm3\n"); + + err = spkm3_make_token(sctx, qop, message_buffer, + message_token, SPKM_MIC_TOK); + return err; +} + +static struct gss_api_ops gss_spkm3_ops = { + .gss_import_sec_context = gss_import_sec_context_spkm3, + .gss_get_mic = gss_get_mic_spkm3, + .gss_verify_mic = gss_verify_mic_spkm3, + .gss_delete_sec_context = gss_delete_sec_context_spkm3, +}; + +static struct pf_desc gss_spkm3_pfs[] = { + {RPC_AUTH_GSS_SPKM, 0, RPC_GSS_SVC_NONE, "spkm3"}, + {RPC_AUTH_GSS_SPKMI, 0, RPC_GSS_SVC_INTEGRITY, "spkm3i"}, +}; + +static struct gss_api_mech gss_spkm3_mech = { + .gm_name = "spkm3", + .gm_owner = THIS_MODULE, + .gm_ops = &gss_spkm3_ops, + .gm_pf_num = ARRAY_SIZE(gss_spkm3_pfs), + .gm_pfs = gss_spkm3_pfs, +}; + +static int __init init_spkm3_module(void) +{ + int status; + + status = gss_mech_register(&gss_spkm3_mech); + if (status) + printk("Failed to register spkm3 gss mechanism!\n"); + return 0; +} + +static void __exit cleanup_spkm3_module(void) +{ + gss_mech_unregister(&gss_spkm3_mech); +} + +MODULE_LICENSE("GPL"); +module_init(init_spkm3_module); +module_exit(cleanup_spkm3_module); diff --git a/net/sunrpc/auth_gss/gss_spkm3_seal.c b/net/sunrpc/auth_gss/gss_spkm3_seal.c new file mode 100644 index 000000000000..25339868d462 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_spkm3_seal.c @@ -0,0 +1,132 @@ +/* + * linux/net/sunrpc/gss_spkm3_seal.c + * + * Copyright (c) 2003 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/jiffies.h> +#include <linux/sunrpc/gss_spkm3.h> +#include <linux/random.h> +#include <linux/crypto.h> + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +/* + * spkm3_make_token() + * + * Only SPKM_MIC_TOK with md5 intg-alg is supported + */ + +u32 +spkm3_make_token(struct spkm3_ctx *ctx, int qop_req, + struct xdr_buf * text, struct xdr_netobj * token, + int toktype) +{ + s32 checksum_type; + char tokhdrbuf[25]; + struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; + struct xdr_netobj mic_hdr = {.len = 0, .data = tokhdrbuf}; + int tmsglen, tokenlen = 0; + unsigned char *ptr; + s32 now; + int ctxelen = 0, ctxzbit = 0; + int md5elen = 0, md5zbit = 0; + + dprintk("RPC: spkm3_make_token\n"); + + now = jiffies; + if (qop_req != 0) + goto out_err; + + if (ctx->ctx_id.len != 16) { + dprintk("RPC: spkm3_make_token BAD ctx_id.len %d\n", + ctx->ctx_id.len); + goto out_err; + } + + switch (ctx->intg_alg) { + case NID_md5: + checksum_type = CKSUMTYPE_RSA_MD5; + break; + default: + dprintk("RPC: gss_spkm3_seal: ctx->signalg %d not" + " supported\n", ctx->intg_alg); + goto out_err; + } + /* XXX since we don't support WRAP, perhaps we don't care... */ + if (ctx->conf_alg != NID_cast5_cbc) { + dprintk("RPC: gss_spkm3_seal: ctx->sealalg %d not supported\n", + ctx->conf_alg); + goto out_err; + } + + if (toktype == SPKM_MIC_TOK) { + tmsglen = 0; + /* Calculate checksum over the mic-header */ + asn1_bitstring_len(&ctx->ctx_id, &ctxelen, &ctxzbit); + spkm3_mic_header(&mic_hdr.data, &mic_hdr.len, ctx->ctx_id.data, + ctxelen, ctxzbit); + + if (make_checksum(checksum_type, mic_hdr.data, mic_hdr.len, + text, &md5cksum)) + goto out_err; + + asn1_bitstring_len(&md5cksum, &md5elen, &md5zbit); + tokenlen = 10 + ctxelen + 1 + 2 + md5elen + 1; + + /* Create token header using generic routines */ + token->len = g_token_size(&ctx->mech_used, tokenlen + tmsglen); + + ptr = token->data; + g_make_token_header(&ctx->mech_used, tokenlen + tmsglen, &ptr); + + spkm3_make_mic_token(&ptr, tokenlen, &mic_hdr, &md5cksum, md5elen, md5zbit); + } else if (toktype == SPKM_WRAP_TOK) { /* Not Supported */ + dprintk("RPC: gss_spkm3_seal: SPKM_WRAP_TOK not supported\n"); + goto out_err; + } + kfree(md5cksum.data); + + /* XXX need to implement sequence numbers, and ctx->expired */ + + return GSS_S_COMPLETE; +out_err: + if (md5cksum.data) + kfree(md5cksum.data); + token->data = NULL; + token->len = 0; + return GSS_S_FAILURE; +} diff --git a/net/sunrpc/auth_gss/gss_spkm3_token.c b/net/sunrpc/auth_gss/gss_spkm3_token.c new file mode 100644 index 000000000000..46c08a0710f6 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_spkm3_token.c @@ -0,0 +1,266 @@ +/* + * linux/net/sunrpc/gss_spkm3_token.c + * + * Copyright (c) 2003 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/jiffies.h> +#include <linux/sunrpc/gss_spkm3.h> +#include <linux/random.h> +#include <linux/crypto.h> + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +/* + * asn1_bitstring_len() + * + * calculate the asn1 bitstring length of the xdr_netobject + */ +void +asn1_bitstring_len(struct xdr_netobj *in, int *enclen, int *zerobits) +{ + int i, zbit = 0,elen = in->len; + char *ptr; + + ptr = &in->data[in->len -1]; + + /* count trailing 0's */ + for(i = in->len; i > 0; i--) { + if (*ptr == 0) { + ptr--; + elen--; + } else + break; + } + + /* count number of 0 bits in final octet */ + ptr = &in->data[elen - 1]; + for(i = 0; i < 8; i++) { + short mask = 0x01; + + if (!((mask << i) & *ptr)) + zbit++; + else + break; + } + *enclen = elen; + *zerobits = zbit; +} + +/* + * decode_asn1_bitstring() + * + * decode a bitstring into a buffer of the expected length. + * enclen = bit string length + * explen = expected length (define in rfc) + */ +int +decode_asn1_bitstring(struct xdr_netobj *out, char *in, int enclen, int explen) +{ + if (!(out->data = kmalloc(explen,GFP_KERNEL))) + return 0; + out->len = explen; + memset(out->data, 0, explen); + memcpy(out->data, in, enclen); + return 1; +} + +/* + * SPKMInnerContextToken choice SPKM_MIC asn1 token layout + * + * contextid is always 16 bytes plain data. max asn1 bitstring len = 17. + * + * tokenlen = pos[0] to end of token (max pos[45] with MD5 cksum) + * + * pos value + * ---------- + * [0] a4 SPKM-MIC tag + * [1] ?? innertoken length (max 44) + * + * + * tok_hdr piece of checksum data starts here + * + * the maximum mic-header len = 9 + 17 = 26 + * mic-header + * ---------- + * [2] 30 SEQUENCE tag + * [3] ?? mic-header length: (max 23) = TokenID + ContextID + * + * TokenID - all fields constant and can be hardcoded + * ------- + * [4] 02 Type 2 + * [5] 02 Length 2 + * [6][7] 01 01 TokenID (SPKM_MIC_TOK) + * + * ContextID - encoded length not constant, calculated + * --------- + * [8] 03 Type 3 + * [9] ?? encoded length + * [10] ?? ctxzbit + * [11] contextid + * + * mic_header piece of checksum data ends here. + * + * int-cksum - encoded length not constant, calculated + * --------- + * [??] 03 Type 3 + * [??] ?? encoded length + * [??] ?? md5zbit + * [??] int-cksum (NID_md5 = 16) + * + * maximum SPKM-MIC innercontext token length = + * 10 + encoded contextid_size(17 max) + 2 + encoded + * cksum_size (17 maxfor NID_md5) = 46 + */ + +/* + * spkm3_mic_header() + * + * Prepare the SPKM_MIC_TOK mic-header for check-sum calculation + * elen: 16 byte context id asn1 bitstring encoded length + */ +void +spkm3_mic_header(unsigned char **hdrbuf, unsigned int *hdrlen, unsigned char *ctxdata, int elen, int zbit) +{ + char *hptr = *hdrbuf; + char *top = *hdrbuf; + + *(u8 *)hptr++ = 0x30; + *(u8 *)hptr++ = elen + 7; /* on the wire header length */ + + /* tokenid */ + *(u8 *)hptr++ = 0x02; + *(u8 *)hptr++ = 0x02; + *(u8 *)hptr++ = 0x01; + *(u8 *)hptr++ = 0x01; + + /* coniextid */ + *(u8 *)hptr++ = 0x03; + *(u8 *)hptr++ = elen + 1; /* add 1 to include zbit */ + *(u8 *)hptr++ = zbit; + memcpy(hptr, ctxdata, elen); + hptr += elen; + *hdrlen = hptr - top; +} + +/* + * spkm3_mic_innercontext_token() + * + * *tokp points to the beginning of the SPKM_MIC token described + * in rfc 2025, section 3.2.1: + * + */ +void +spkm3_make_mic_token(unsigned char **tokp, int toklen, struct xdr_netobj *mic_hdr, struct xdr_netobj *md5cksum, int md5elen, int md5zbit) +{ + unsigned char *ict = *tokp; + + *(u8 *)ict++ = 0xa4; + *(u8 *)ict++ = toklen - 2; + memcpy(ict, mic_hdr->data, mic_hdr->len); + ict += mic_hdr->len; + + *(u8 *)ict++ = 0x03; + *(u8 *)ict++ = md5elen + 1; /* add 1 to include zbit */ + *(u8 *)ict++ = md5zbit; + memcpy(ict, md5cksum->data, md5elen); +} + +u32 +spkm3_verify_mic_token(unsigned char **tokp, int *mic_hdrlen, unsigned char **cksum) +{ + struct xdr_netobj spkm3_ctx_id = {.len =0, .data = NULL}; + unsigned char *ptr = *tokp; + int ctxelen; + u32 ret = GSS_S_DEFECTIVE_TOKEN; + + /* spkm3 innercontext token preamble */ + if ((ptr[0] != 0xa4) || (ptr[2] != 0x30)) { + dprintk("RPC: BAD SPKM ictoken preamble\n"); + goto out; + } + + *mic_hdrlen = ptr[3]; + + /* token type */ + if ((ptr[4] != 0x02) || (ptr[5] != 0x02)) { + dprintk("RPC: BAD asn1 SPKM3 token type\n"); + goto out; + } + + /* only support SPKM_MIC_TOK */ + if((ptr[6] != 0x01) || (ptr[7] != 0x01)) { + dprintk("RPC: ERROR unsupported SPKM3 token \n"); + goto out; + } + + /* contextid */ + if (ptr[8] != 0x03) { + dprintk("RPC: BAD SPKM3 asn1 context-id type\n"); + goto out; + } + + ctxelen = ptr[9]; + if (ctxelen > 17) { /* length includes asn1 zbit octet */ + dprintk("RPC: BAD SPKM3 contextid len %d\n", ctxelen); + goto out; + } + + /* ignore ptr[10] */ + + if(!decode_asn1_bitstring(&spkm3_ctx_id, &ptr[11], ctxelen - 1, 16)) + goto out; + + /* + * in the current implementation: the optional int-alg is not present + * so the default int-alg (md5) is used the optional snd-seq field is + * also not present + */ + + if (*mic_hdrlen != 6 + ctxelen) { + dprintk("RPC: BAD SPKM_ MIC_TOK header len %d: we only support default int-alg (should be absent) and do not support snd-seq\n", *mic_hdrlen); + goto out; + } + /* checksum */ + *cksum = (&ptr[10] + ctxelen); /* ctxelen includes ptr[10] */ + + ret = GSS_S_COMPLETE; +out: + if (spkm3_ctx_id.data) + kfree(spkm3_ctx_id.data); + return ret; +} + diff --git a/net/sunrpc/auth_gss/gss_spkm3_unseal.c b/net/sunrpc/auth_gss/gss_spkm3_unseal.c new file mode 100644 index 000000000000..65ce81bf0bc4 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_spkm3_unseal.c @@ -0,0 +1,128 @@ +/* + * linux/net/sunrpc/gss_spkm3_unseal.c + * + * Copyright (c) 2003 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/jiffies.h> +#include <linux/sunrpc/gss_spkm3.h> +#include <linux/crypto.h> + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +/* + * spkm3_read_token() + * + * only SPKM_MIC_TOK with md5 intg-alg is supported + */ +u32 +spkm3_read_token(struct spkm3_ctx *ctx, + struct xdr_netobj *read_token, /* checksum */ + struct xdr_buf *message_buffer, /* signbuf */ + int *qop_state, int toktype) +{ + s32 code; + struct xdr_netobj wire_cksum = {.len =0, .data = NULL}; + struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; + unsigned char *ptr = (unsigned char *)read_token->data; + unsigned char *cksum; + int bodysize, md5elen; + int mic_hdrlen; + u32 ret = GSS_S_DEFECTIVE_TOKEN; + + dprintk("RPC: spkm3_read_token read_token->len %d\n", read_token->len); + + if (g_verify_token_header((struct xdr_netobj *) &ctx->mech_used, + &bodysize, &ptr, read_token->len)) + goto out; + + /* decode the token */ + + if (toktype == SPKM_MIC_TOK) { + + if ((ret = spkm3_verify_mic_token(&ptr, &mic_hdrlen, &cksum))) + goto out; + + if (*cksum++ != 0x03) { + dprintk("RPC: spkm3_read_token BAD checksum type\n"); + goto out; + } + md5elen = *cksum++; + cksum++; /* move past the zbit */ + + if(!decode_asn1_bitstring(&wire_cksum, cksum, md5elen - 1, 16)) + goto out; + + /* HARD CODED FOR MD5 */ + + /* compute the checksum of the message. + * ptr + 2 = start of header piece of checksum + * mic_hdrlen + 2 = length of header piece of checksum + */ + ret = GSS_S_DEFECTIVE_TOKEN; + code = make_checksum(CKSUMTYPE_RSA_MD5, ptr + 2, + mic_hdrlen + 2, + message_buffer, &md5cksum); + + if (code) + goto out; + + dprintk("RPC: spkm3_read_token: digest wire_cksum.len %d:\n", + wire_cksum.len); + dprintk(" md5cksum.data\n"); + print_hexl((u32 *) md5cksum.data, 16, 0); + dprintk(" cksum.data:\n"); + print_hexl((u32 *) wire_cksum.data, wire_cksum.len, 0); + + ret = GSS_S_BAD_SIG; + code = memcmp(md5cksum.data, wire_cksum.data, wire_cksum.len); + if (code) + goto out; + + } else { + dprintk("RPC: BAD or UNSUPPORTED SPKM3 token type: %d\n",toktype); + goto out; + } + + /* XXX: need to add expiration and sequencing */ + ret = GSS_S_COMPLETE; +out: + if (md5cksum.data) + kfree(md5cksum.data); + if (wire_cksum.data) + kfree(wire_cksum.data); + return ret; +} diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c new file mode 100644 index 000000000000..5c8fe3bfc494 --- /dev/null +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -0,0 +1,1080 @@ +/* + * Neil Brown <neilb@cse.unsw.edu.au> + * J. Bruce Fields <bfields@umich.edu> + * Andy Adamson <andros@umich.edu> + * Dug Song <dugsong@monkey.org> + * + * RPCSEC_GSS server authentication. + * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078 + * (gssapi) + * + * The RPCSEC_GSS involves three stages: + * 1/ context creation + * 2/ data exchange + * 3/ context destruction + * + * Context creation is handled largely by upcalls to user-space. + * In particular, GSS_Accept_sec_context is handled by an upcall + * Data exchange is handled entirely within the kernel + * In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel. + * Context destruction is handled in-kernel + * GSS_Delete_sec_context is in-kernel + * + * Context creation is initiated by a RPCSEC_GSS_INIT request arriving. + * The context handle and gss_token are used as a key into the rpcsec_init cache. + * The content of this cache includes some of the outputs of GSS_Accept_sec_context, + * being major_status, minor_status, context_handle, reply_token. + * These are sent back to the client. + * Sequence window management is handled by the kernel. The window size if currently + * a compile time constant. + * + * When user-space is happy that a context is established, it places an entry + * in the rpcsec_context cache. The key for this cache is the context_handle. + * The content includes: + * uid/gidlist - for determining access rights + * mechanism type + * mechanism specific information, such as a key + * + */ + +#include <linux/types.h> +#include <linux/module.h> +#include <linux/pagemap.h> + +#include <linux/sunrpc/auth_gss.h> +#include <linux/sunrpc/svcauth.h> +#include <linux/sunrpc/gss_err.h> +#include <linux/sunrpc/svcauth.h> +#include <linux/sunrpc/svcauth_gss.h> +#include <linux/sunrpc/cache.h> + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +/* The rpcsec_init cache is used for mapping RPCSEC_GSS_{,CONT_}INIT requests + * into replies. + * + * Key is context handle (\x if empty) and gss_token. + * Content is major_status minor_status (integers) context_handle, reply_token. + * + */ + +static int netobj_equal(struct xdr_netobj *a, struct xdr_netobj *b) +{ + return a->len == b->len && 0 == memcmp(a->data, b->data, a->len); +} + +#define RSI_HASHBITS 6 +#define RSI_HASHMAX (1<<RSI_HASHBITS) +#define RSI_HASHMASK (RSI_HASHMAX-1) + +struct rsi { + struct cache_head h; + struct xdr_netobj in_handle, in_token; + struct xdr_netobj out_handle, out_token; + int major_status, minor_status; +}; + +static struct cache_head *rsi_table[RSI_HASHMAX]; +static struct cache_detail rsi_cache; +static struct rsi *rsi_lookup(struct rsi *item, int set); + +static void rsi_free(struct rsi *rsii) +{ + kfree(rsii->in_handle.data); + kfree(rsii->in_token.data); + kfree(rsii->out_handle.data); + kfree(rsii->out_token.data); +} + +static void rsi_put(struct cache_head *item, struct cache_detail *cd) +{ + struct rsi *rsii = container_of(item, struct rsi, h); + if (cache_put(item, cd)) { + rsi_free(rsii); + kfree(rsii); + } +} + +static inline int rsi_hash(struct rsi *item) +{ + return hash_mem(item->in_handle.data, item->in_handle.len, RSI_HASHBITS) + ^ hash_mem(item->in_token.data, item->in_token.len, RSI_HASHBITS); +} + +static inline int rsi_match(struct rsi *item, struct rsi *tmp) +{ + return netobj_equal(&item->in_handle, &tmp->in_handle) + && netobj_equal(&item->in_token, &tmp->in_token); +} + +static int dup_to_netobj(struct xdr_netobj *dst, char *src, int len) +{ + dst->len = len; + dst->data = (len ? kmalloc(len, GFP_KERNEL) : NULL); + if (dst->data) + memcpy(dst->data, src, len); + if (len && !dst->data) + return -ENOMEM; + return 0; +} + +static inline int dup_netobj(struct xdr_netobj *dst, struct xdr_netobj *src) +{ + return dup_to_netobj(dst, src->data, src->len); +} + +static inline void rsi_init(struct rsi *new, struct rsi *item) +{ + new->out_handle.data = NULL; + new->out_handle.len = 0; + new->out_token.data = NULL; + new->out_token.len = 0; + new->in_handle.len = item->in_handle.len; + item->in_handle.len = 0; + new->in_token.len = item->in_token.len; + item->in_token.len = 0; + new->in_handle.data = item->in_handle.data; + item->in_handle.data = NULL; + new->in_token.data = item->in_token.data; + item->in_token.data = NULL; +} + +static inline void rsi_update(struct rsi *new, struct rsi *item) +{ + BUG_ON(new->out_handle.data || new->out_token.data); + new->out_handle.len = item->out_handle.len; + item->out_handle.len = 0; + new->out_token.len = item->out_token.len; + item->out_token.len = 0; + new->out_handle.data = item->out_handle.data; + item->out_handle.data = NULL; + new->out_token.data = item->out_token.data; + item->out_token.data = NULL; + + new->major_status = item->major_status; + new->minor_status = item->minor_status; +} + +static void rsi_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + struct rsi *rsii = container_of(h, struct rsi, h); + + qword_addhex(bpp, blen, rsii->in_handle.data, rsii->in_handle.len); + qword_addhex(bpp, blen, rsii->in_token.data, rsii->in_token.len); + (*bpp)[-1] = '\n'; +} + + +static int rsi_parse(struct cache_detail *cd, + char *mesg, int mlen) +{ + /* context token expiry major minor context token */ + char *buf = mesg; + char *ep; + int len; + struct rsi rsii, *rsip = NULL; + time_t expiry; + int status = -EINVAL; + + memset(&rsii, 0, sizeof(rsii)); + /* handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + status = -ENOMEM; + if (dup_to_netobj(&rsii.in_handle, buf, len)) + goto out; + + /* token */ + len = qword_get(&mesg, buf, mlen); + status = -EINVAL; + if (len < 0) + goto out; + status = -ENOMEM; + if (dup_to_netobj(&rsii.in_token, buf, len)) + goto out; + + rsii.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; + if (expiry == 0) + goto out; + + /* major/minor */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (len == 0) { + goto out; + } else { + rsii.major_status = simple_strtoul(buf, &ep, 10); + if (*ep) + goto out; + len = qword_get(&mesg, buf, mlen); + if (len <= 0) + goto out; + rsii.minor_status = simple_strtoul(buf, &ep, 10); + if (*ep) + goto out; + + /* out_handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + status = -ENOMEM; + if (dup_to_netobj(&rsii.out_handle, buf, len)) + goto out; + + /* out_token */ + len = qword_get(&mesg, buf, mlen); + status = -EINVAL; + if (len < 0) + goto out; + status = -ENOMEM; + if (dup_to_netobj(&rsii.out_token, buf, len)) + goto out; + } + rsii.h.expiry_time = expiry; + rsip = rsi_lookup(&rsii, 1); + status = 0; +out: + rsi_free(&rsii); + if (rsip) + rsi_put(&rsip->h, &rsi_cache); + return status; +} + +static struct cache_detail rsi_cache = { + .hash_size = RSI_HASHMAX, + .hash_table = rsi_table, + .name = "auth.rpcsec.init", + .cache_put = rsi_put, + .cache_request = rsi_request, + .cache_parse = rsi_parse, +}; + +static DefineSimpleCacheLookup(rsi, 0) + +/* + * The rpcsec_context cache is used to store a context that is + * used in data exchange. + * The key is a context handle. The content is: + * uid, gidlist, mechanism, service-set, mech-specific-data + */ + +#define RSC_HASHBITS 10 +#define RSC_HASHMAX (1<<RSC_HASHBITS) +#define RSC_HASHMASK (RSC_HASHMAX-1) + +#define GSS_SEQ_WIN 128 + +struct gss_svc_seq_data { + /* highest seq number seen so far: */ + int sd_max; + /* for i such that sd_max-GSS_SEQ_WIN < i <= sd_max, the i-th bit of + * sd_win is nonzero iff sequence number i has been seen already: */ + unsigned long sd_win[GSS_SEQ_WIN/BITS_PER_LONG]; + spinlock_t sd_lock; +}; + +struct rsc { + struct cache_head h; + struct xdr_netobj handle; + struct svc_cred cred; + struct gss_svc_seq_data seqdata; + struct gss_ctx *mechctx; +}; + +static struct cache_head *rsc_table[RSC_HASHMAX]; +static struct cache_detail rsc_cache; +static struct rsc *rsc_lookup(struct rsc *item, int set); + +static void rsc_free(struct rsc *rsci) +{ + kfree(rsci->handle.data); + if (rsci->mechctx) + gss_delete_sec_context(&rsci->mechctx); + if (rsci->cred.cr_group_info) + put_group_info(rsci->cred.cr_group_info); +} + +static void rsc_put(struct cache_head *item, struct cache_detail *cd) +{ + struct rsc *rsci = container_of(item, struct rsc, h); + + if (cache_put(item, cd)) { + rsc_free(rsci); + kfree(rsci); + } +} + +static inline int +rsc_hash(struct rsc *rsci) +{ + return hash_mem(rsci->handle.data, rsci->handle.len, RSC_HASHBITS); +} + +static inline int +rsc_match(struct rsc *new, struct rsc *tmp) +{ + return netobj_equal(&new->handle, &tmp->handle); +} + +static inline void +rsc_init(struct rsc *new, struct rsc *tmp) +{ + new->handle.len = tmp->handle.len; + tmp->handle.len = 0; + new->handle.data = tmp->handle.data; + tmp->handle.data = NULL; + new->mechctx = NULL; + new->cred.cr_group_info = NULL; +} + +static inline void +rsc_update(struct rsc *new, struct rsc *tmp) +{ + new->mechctx = tmp->mechctx; + tmp->mechctx = NULL; + memset(&new->seqdata, 0, sizeof(new->seqdata)); + spin_lock_init(&new->seqdata.sd_lock); + new->cred = tmp->cred; + tmp->cred.cr_group_info = NULL; +} + +static int rsc_parse(struct cache_detail *cd, + char *mesg, int mlen) +{ + /* contexthandle expiry [ uid gid N <n gids> mechname ...mechdata... ] */ + char *buf = mesg; + int len, rv; + struct rsc rsci, *rscp = NULL; + time_t expiry; + int status = -EINVAL; + + memset(&rsci, 0, sizeof(rsci)); + /* context handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) goto out; + status = -ENOMEM; + if (dup_to_netobj(&rsci.handle, buf, len)) + goto out; + + rsci.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; + if (expiry == 0) + goto out; + + /* uid, or NEGATIVE */ + rv = get_int(&mesg, &rsci.cred.cr_uid); + if (rv == -EINVAL) + goto out; + if (rv == -ENOENT) + set_bit(CACHE_NEGATIVE, &rsci.h.flags); + else { + int N, i; + struct gss_api_mech *gm; + + /* gid */ + if (get_int(&mesg, &rsci.cred.cr_gid)) + goto out; + + /* number of additional gid's */ + if (get_int(&mesg, &N)) + goto out; + status = -ENOMEM; + rsci.cred.cr_group_info = groups_alloc(N); + if (rsci.cred.cr_group_info == NULL) + goto out; + + /* gid's */ + status = -EINVAL; + for (i=0; i<N; i++) { + gid_t gid; + if (get_int(&mesg, &gid)) + goto out; + GROUP_AT(rsci.cred.cr_group_info, i) = gid; + } + + /* mech name */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + gm = gss_mech_get_by_name(buf); + status = -EOPNOTSUPP; + if (!gm) + goto out; + + status = -EINVAL; + /* mech-specific data: */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) { + gss_mech_put(gm); + goto out; + } + if (gss_import_sec_context(buf, len, gm, &rsci.mechctx)) { + gss_mech_put(gm); + goto out; + } + gss_mech_put(gm); + } + rsci.h.expiry_time = expiry; + rscp = rsc_lookup(&rsci, 1); + status = 0; +out: + rsc_free(&rsci); + if (rscp) + rsc_put(&rscp->h, &rsc_cache); + return status; +} + +static struct cache_detail rsc_cache = { + .hash_size = RSC_HASHMAX, + .hash_table = rsc_table, + .name = "auth.rpcsec.context", + .cache_put = rsc_put, + .cache_parse = rsc_parse, +}; + +static DefineSimpleCacheLookup(rsc, 0); + +static struct rsc * +gss_svc_searchbyctx(struct xdr_netobj *handle) +{ + struct rsc rsci; + struct rsc *found; + + memset(&rsci, 0, sizeof(rsci)); + if (dup_to_netobj(&rsci.handle, handle->data, handle->len)) + return NULL; + found = rsc_lookup(&rsci, 0); + rsc_free(&rsci); + if (!found) + return NULL; + if (cache_check(&rsc_cache, &found->h, NULL)) + return NULL; + return found; +} + +/* Implements sequence number algorithm as specified in RFC 2203. */ +static int +gss_check_seq_num(struct rsc *rsci, int seq_num) +{ + struct gss_svc_seq_data *sd = &rsci->seqdata; + + spin_lock(&sd->sd_lock); + if (seq_num > sd->sd_max) { + if (seq_num >= sd->sd_max + GSS_SEQ_WIN) { + memset(sd->sd_win,0,sizeof(sd->sd_win)); + sd->sd_max = seq_num; + } else while (sd->sd_max < seq_num) { + sd->sd_max++; + __clear_bit(sd->sd_max % GSS_SEQ_WIN, sd->sd_win); + } + __set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win); + goto ok; + } else if (seq_num <= sd->sd_max - GSS_SEQ_WIN) { + goto drop; + } + /* sd_max - GSS_SEQ_WIN < seq_num <= sd_max */ + if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win)) + goto drop; +ok: + spin_unlock(&sd->sd_lock); + return 1; +drop: + spin_unlock(&sd->sd_lock); + return 0; +} + +static inline u32 round_up_to_quad(u32 i) +{ + return (i + 3 ) & ~3; +} + +static inline int +svc_safe_getnetobj(struct kvec *argv, struct xdr_netobj *o) +{ + int l; + + if (argv->iov_len < 4) + return -1; + o->len = ntohl(svc_getu32(argv)); + l = round_up_to_quad(o->len); + if (argv->iov_len < l) + return -1; + o->data = argv->iov_base; + argv->iov_base += l; + argv->iov_len -= l; + return 0; +} + +static inline int +svc_safe_putnetobj(struct kvec *resv, struct xdr_netobj *o) +{ + u32 *p; + + if (resv->iov_len + 4 > PAGE_SIZE) + return -1; + svc_putu32(resv, htonl(o->len)); + p = resv->iov_base + resv->iov_len; + resv->iov_len += round_up_to_quad(o->len); + if (resv->iov_len > PAGE_SIZE) + return -1; + memcpy(p, o->data, o->len); + memset((u8 *)p + o->len, 0, round_up_to_quad(o->len) - o->len); + return 0; +} + +/* Verify the checksum on the header and return SVC_OK on success. + * Otherwise, return SVC_DROP (in the case of a bad sequence number) + * or return SVC_DENIED and indicate error in authp. + */ +static int +gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, + u32 *rpcstart, struct rpc_gss_wire_cred *gc, u32 *authp) +{ + struct gss_ctx *ctx_id = rsci->mechctx; + struct xdr_buf rpchdr; + struct xdr_netobj checksum; + u32 flavor = 0; + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec iov; + + /* data to compute the checksum over: */ + iov.iov_base = rpcstart; + iov.iov_len = (u8 *)argv->iov_base - (u8 *)rpcstart; + xdr_buf_from_iov(&iov, &rpchdr); + + *authp = rpc_autherr_badverf; + if (argv->iov_len < 4) + return SVC_DENIED; + flavor = ntohl(svc_getu32(argv)); + if (flavor != RPC_AUTH_GSS) + return SVC_DENIED; + if (svc_safe_getnetobj(argv, &checksum)) + return SVC_DENIED; + + if (rqstp->rq_deferred) /* skip verification of revisited request */ + return SVC_OK; + if (gss_verify_mic(ctx_id, &rpchdr, &checksum, NULL) + != GSS_S_COMPLETE) { + *authp = rpcsec_gsserr_credproblem; + return SVC_DENIED; + } + + if (gc->gc_seq > MAXSEQ) { + dprintk("RPC: svcauth_gss: discarding request with large sequence number %d\n", + gc->gc_seq); + *authp = rpcsec_gsserr_ctxproblem; + return SVC_DENIED; + } + if (!gss_check_seq_num(rsci, gc->gc_seq)) { + dprintk("RPC: svcauth_gss: discarding request with old sequence number %d\n", + gc->gc_seq); + return SVC_DROP; + } + return SVC_OK; +} + +static int +gss_write_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq) +{ + u32 xdr_seq; + u32 maj_stat; + struct xdr_buf verf_data; + struct xdr_netobj mic; + u32 *p; + struct kvec iov; + + svc_putu32(rqstp->rq_res.head, htonl(RPC_AUTH_GSS)); + xdr_seq = htonl(seq); + + iov.iov_base = &xdr_seq; + iov.iov_len = sizeof(xdr_seq); + xdr_buf_from_iov(&iov, &verf_data); + p = rqstp->rq_res.head->iov_base + rqstp->rq_res.head->iov_len; + mic.data = (u8 *)(p + 1); + maj_stat = gss_get_mic(ctx_id, 0, &verf_data, &mic); + if (maj_stat != GSS_S_COMPLETE) + return -1; + *p++ = htonl(mic.len); + memset((u8 *)p + mic.len, 0, round_up_to_quad(mic.len) - mic.len); + p += XDR_QUADLEN(mic.len); + if (!xdr_ressize_check(rqstp, p)) + return -1; + return 0; +} + +struct gss_domain { + struct auth_domain h; + u32 pseudoflavor; +}; + +static struct auth_domain * +find_gss_auth_domain(struct gss_ctx *ctx, u32 svc) +{ + char *name; + + name = gss_service_to_auth_domain_name(ctx->mech_type, svc); + if (!name) + return NULL; + return auth_domain_find(name); +} + +int +svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) +{ + struct gss_domain *new; + struct auth_domain *test; + int stat = -ENOMEM; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + goto out; + cache_init(&new->h.h); + new->h.name = kmalloc(strlen(name) + 1, GFP_KERNEL); + if (!new->h.name) + goto out_free_dom; + strcpy(new->h.name, name); + new->h.flavour = RPC_AUTH_GSS; + new->pseudoflavor = pseudoflavor; + new->h.h.expiry_time = NEVER; + + test = auth_domain_lookup(&new->h, 1); + if (test == &new->h) { + BUG_ON(atomic_dec_and_test(&new->h.h.refcnt)); + } else { /* XXX Duplicate registration? */ + auth_domain_put(&new->h); + goto out; + } + return 0; + +out_free_dom: + kfree(new); +out: + return stat; +} + +EXPORT_SYMBOL(svcauth_gss_register_pseudoflavor); + +static inline int +read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj) +{ + u32 raw; + int status; + + status = read_bytes_from_xdr_buf(buf, base, &raw, sizeof(*obj)); + if (status) + return status; + *obj = ntohl(raw); + return 0; +} + +/* It would be nice if this bit of code could be shared with the client. + * Obstacles: + * The client shouldn't malloc(), would have to pass in own memory. + * The server uses base of head iovec as read pointer, while the + * client uses separate pointer. */ +static int +unwrap_integ_data(struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) +{ + int stat = -EINVAL; + u32 integ_len, maj_stat; + struct xdr_netobj mic; + struct xdr_buf integ_buf; + + integ_len = ntohl(svc_getu32(&buf->head[0])); + if (integ_len & 3) + goto out; + if (integ_len > buf->len) + goto out; + if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len)) + BUG(); + /* copy out mic... */ + if (read_u32_from_xdr_buf(buf, integ_len, &mic.len)) + BUG(); + if (mic.len > RPC_MAX_AUTH_SIZE) + goto out; + mic.data = kmalloc(mic.len, GFP_KERNEL); + if (!mic.data) + goto out; + if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len)) + goto out; + maj_stat = gss_verify_mic(ctx, &integ_buf, &mic, NULL); + if (maj_stat != GSS_S_COMPLETE) + goto out; + if (ntohl(svc_getu32(&buf->head[0])) != seq) + goto out; + stat = 0; +out: + return stat; +} + +struct gss_svc_data { + /* decoded gss client cred: */ + struct rpc_gss_wire_cred clcred; + /* pointer to the beginning of the procedure-specific results, + * which may be encrypted/checksummed in svcauth_gss_release: */ + u32 *body_start; + struct rsc *rsci; +}; + +static int +svcauth_gss_set_client(struct svc_rqst *rqstp) +{ + struct gss_svc_data *svcdata = rqstp->rq_auth_data; + struct rsc *rsci = svcdata->rsci; + struct rpc_gss_wire_cred *gc = &svcdata->clcred; + + rqstp->rq_client = find_gss_auth_domain(rsci->mechctx, gc->gc_svc); + if (rqstp->rq_client == NULL) + return SVC_DENIED; + return SVC_OK; +} + +/* + * Accept an rpcsec packet. + * If context establishment, punt to user space + * If data exchange, verify/decrypt + * If context destruction, handle here + * In the context establishment and destruction case we encode + * response here and return SVC_COMPLETE. + */ +static int +svcauth_gss_accept(struct svc_rqst *rqstp, u32 *authp) +{ + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec *resv = &rqstp->rq_res.head[0]; + u32 crlen; + struct xdr_netobj tmpobj; + struct gss_svc_data *svcdata = rqstp->rq_auth_data; + struct rpc_gss_wire_cred *gc; + struct rsc *rsci = NULL; + struct rsi *rsip, rsikey; + u32 *rpcstart; + u32 *reject_stat = resv->iov_base + resv->iov_len; + int ret; + + dprintk("RPC: svcauth_gss: argv->iov_len = %zd\n",argv->iov_len); + + *authp = rpc_autherr_badcred; + if (!svcdata) + svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL); + if (!svcdata) + goto auth_err; + rqstp->rq_auth_data = svcdata; + svcdata->body_start = NULL; + svcdata->rsci = NULL; + gc = &svcdata->clcred; + + /* start of rpc packet is 7 u32's back from here: + * xid direction rpcversion prog vers proc flavour + */ + rpcstart = argv->iov_base; + rpcstart -= 7; + + /* credential is: + * version(==1), proc(0,1,2,3), seq, service (1,2,3), handle + * at least 5 u32s, and is preceeded by length, so that makes 6. + */ + + if (argv->iov_len < 5 * 4) + goto auth_err; + crlen = ntohl(svc_getu32(argv)); + if (ntohl(svc_getu32(argv)) != RPC_GSS_VERSION) + goto auth_err; + gc->gc_proc = ntohl(svc_getu32(argv)); + gc->gc_seq = ntohl(svc_getu32(argv)); + gc->gc_svc = ntohl(svc_getu32(argv)); + if (svc_safe_getnetobj(argv, &gc->gc_ctx)) + goto auth_err; + if (crlen != round_up_to_quad(gc->gc_ctx.len) + 5 * 4) + goto auth_err; + + if ((gc->gc_proc != RPC_GSS_PROC_DATA) && (rqstp->rq_proc != 0)) + goto auth_err; + + /* + * We've successfully parsed the credential. Let's check out the + * verifier. An AUTH_NULL verifier is allowed (and required) for + * INIT and CONTINUE_INIT requests. AUTH_RPCSEC_GSS is required for + * PROC_DATA and PROC_DESTROY. + * + * AUTH_NULL verifier is 0 (AUTH_NULL), 0 (length). + * AUTH_RPCSEC_GSS verifier is: + * 6 (AUTH_RPCSEC_GSS), length, checksum. + * checksum is calculated over rpcheader from xid up to here. + */ + *authp = rpc_autherr_badverf; + switch (gc->gc_proc) { + case RPC_GSS_PROC_INIT: + case RPC_GSS_PROC_CONTINUE_INIT: + if (argv->iov_len < 2 * 4) + goto auth_err; + if (ntohl(svc_getu32(argv)) != RPC_AUTH_NULL) + goto auth_err; + if (ntohl(svc_getu32(argv)) != 0) + goto auth_err; + break; + case RPC_GSS_PROC_DATA: + case RPC_GSS_PROC_DESTROY: + *authp = rpcsec_gsserr_credproblem; + rsci = gss_svc_searchbyctx(&gc->gc_ctx); + if (!rsci) + goto auth_err; + switch (gss_verify_header(rqstp, rsci, rpcstart, gc, authp)) { + case SVC_OK: + break; + case SVC_DENIED: + goto auth_err; + case SVC_DROP: + goto drop; + } + break; + default: + *authp = rpc_autherr_rejectedcred; + goto auth_err; + } + + /* now act upon the command: */ + switch (gc->gc_proc) { + case RPC_GSS_PROC_INIT: + case RPC_GSS_PROC_CONTINUE_INIT: + *authp = rpc_autherr_badcred; + if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0) + goto auth_err; + memset(&rsikey, 0, sizeof(rsikey)); + if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx)) + goto drop; + *authp = rpc_autherr_badverf; + if (svc_safe_getnetobj(argv, &tmpobj)) { + kfree(rsikey.in_handle.data); + goto auth_err; + } + if (dup_netobj(&rsikey.in_token, &tmpobj)) { + kfree(rsikey.in_handle.data); + goto drop; + } + + rsip = rsi_lookup(&rsikey, 0); + rsi_free(&rsikey); + if (!rsip) { + goto drop; + } + switch(cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle)) { + case -EAGAIN: + goto drop; + case -ENOENT: + goto drop; + case 0: + rsci = gss_svc_searchbyctx(&rsip->out_handle); + if (!rsci) { + goto drop; + } + if (gss_write_verf(rqstp, rsci->mechctx, GSS_SEQ_WIN)) + goto drop; + if (resv->iov_len + 4 > PAGE_SIZE) + goto drop; + svc_putu32(resv, rpc_success); + if (svc_safe_putnetobj(resv, &rsip->out_handle)) + goto drop; + if (resv->iov_len + 3 * 4 > PAGE_SIZE) + goto drop; + svc_putu32(resv, htonl(rsip->major_status)); + svc_putu32(resv, htonl(rsip->minor_status)); + svc_putu32(resv, htonl(GSS_SEQ_WIN)); + if (svc_safe_putnetobj(resv, &rsip->out_token)) + goto drop; + rqstp->rq_client = NULL; + } + goto complete; + case RPC_GSS_PROC_DESTROY: + set_bit(CACHE_NEGATIVE, &rsci->h.flags); + if (resv->iov_len + 4 > PAGE_SIZE) + goto drop; + svc_putu32(resv, rpc_success); + goto complete; + case RPC_GSS_PROC_DATA: + *authp = rpcsec_gsserr_ctxproblem; + if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) + goto auth_err; + rqstp->rq_cred = rsci->cred; + get_group_info(rsci->cred.cr_group_info); + *authp = rpc_autherr_badcred; + switch (gc->gc_svc) { + case RPC_GSS_SVC_NONE: + break; + case RPC_GSS_SVC_INTEGRITY: + if (unwrap_integ_data(&rqstp->rq_arg, + gc->gc_seq, rsci->mechctx)) + goto auth_err; + /* placeholders for length and seq. number: */ + svcdata->body_start = resv->iov_base + resv->iov_len; + svc_putu32(resv, 0); + svc_putu32(resv, 0); + break; + case RPC_GSS_SVC_PRIVACY: + /* currently unsupported */ + default: + goto auth_err; + } + svcdata->rsci = rsci; + cache_get(&rsci->h); + ret = SVC_OK; + goto out; + } +auth_err: + /* Restore write pointer to original value: */ + xdr_ressize_check(rqstp, reject_stat); + ret = SVC_DENIED; + goto out; +complete: + ret = SVC_COMPLETE; + goto out; +drop: + ret = SVC_DROP; +out: + if (rsci) + rsc_put(&rsci->h, &rsc_cache); + return ret; +} + +static int +svcauth_gss_release(struct svc_rqst *rqstp) +{ + struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; + struct rpc_gss_wire_cred *gc = &gsd->clcred; + struct xdr_buf *resbuf = &rqstp->rq_res; + struct xdr_buf integ_buf; + struct xdr_netobj mic; + struct kvec *resv; + u32 *p; + int integ_offset, integ_len; + int stat = -EINVAL; + + if (gc->gc_proc != RPC_GSS_PROC_DATA) + goto out; + /* Release can be called twice, but we only wrap once. */ + if (gsd->body_start == NULL) + goto out; + /* normally not set till svc_send, but we need it here: */ + resbuf->len = resbuf->head[0].iov_len + + resbuf->page_len + resbuf->tail[0].iov_len; + switch (gc->gc_svc) { + case RPC_GSS_SVC_NONE: + break; + case RPC_GSS_SVC_INTEGRITY: + p = gsd->body_start; + gsd->body_start = NULL; + /* move accept_stat to right place: */ + memcpy(p, p + 2, 4); + /* don't wrap in failure case: */ + /* Note: counting on not getting here if call was not even + * accepted! */ + if (*p != rpc_success) { + resbuf->head[0].iov_len -= 2 * 4; + goto out; + } + p++; + integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base; + integ_len = resbuf->len - integ_offset; + BUG_ON(integ_len % 4); + *p++ = htonl(integ_len); + *p++ = htonl(gc->gc_seq); + if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, + integ_len)) + BUG(); + if (resbuf->page_len == 0 + && resbuf->tail[0].iov_len + RPC_MAX_AUTH_SIZE + < PAGE_SIZE) { + BUG_ON(resbuf->tail[0].iov_len); + /* Use head for everything */ + resv = &resbuf->head[0]; + } else if (resbuf->tail[0].iov_base == NULL) { + /* copied from nfsd4_encode_read */ + svc_take_page(rqstp); + resbuf->tail[0].iov_base = page_address(rqstp + ->rq_respages[rqstp->rq_resused-1]); + rqstp->rq_restailpage = rqstp->rq_resused-1; + resbuf->tail[0].iov_len = 0; + resv = &resbuf->tail[0]; + } else { + resv = &resbuf->tail[0]; + } + mic.data = (u8 *)resv->iov_base + resv->iov_len + 4; + if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic)) + goto out_err; + svc_putu32(resv, htonl(mic.len)); + memset(mic.data + mic.len, 0, + round_up_to_quad(mic.len) - mic.len); + resv->iov_len += XDR_QUADLEN(mic.len) << 2; + /* not strictly required: */ + resbuf->len += XDR_QUADLEN(mic.len) << 2; + BUG_ON(resv->iov_len > PAGE_SIZE); + break; + case RPC_GSS_SVC_PRIVACY: + default: + goto out_err; + } + +out: + stat = 0; +out_err: + if (rqstp->rq_client) + auth_domain_put(rqstp->rq_client); + rqstp->rq_client = NULL; + if (rqstp->rq_cred.cr_group_info) + put_group_info(rqstp->rq_cred.cr_group_info); + rqstp->rq_cred.cr_group_info = NULL; + if (gsd->rsci) + rsc_put(&gsd->rsci->h, &rsc_cache); + gsd->rsci = NULL; + + return stat; +} + +static void +svcauth_gss_domain_release(struct auth_domain *dom) +{ + struct gss_domain *gd = container_of(dom, struct gss_domain, h); + + kfree(dom->name); + kfree(gd); +} + +static struct auth_ops svcauthops_gss = { + .name = "rpcsec_gss", + .owner = THIS_MODULE, + .flavour = RPC_AUTH_GSS, + .accept = svcauth_gss_accept, + .release = svcauth_gss_release, + .domain_release = svcauth_gss_domain_release, + .set_client = svcauth_gss_set_client, +}; + +int +gss_svc_init(void) +{ + int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss); + if (rv == 0) { + cache_register(&rsc_cache); + cache_register(&rsi_cache); + } + return rv; +} + +void +gss_svc_shutdown(void) +{ + cache_unregister(&rsc_cache); + cache_unregister(&rsi_cache); + svc_auth_unregister(RPC_AUTH_GSS); +} diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c new file mode 100644 index 000000000000..9b72d3abf823 --- /dev/null +++ b/net/sunrpc/auth_null.c @@ -0,0 +1,143 @@ +/* + * linux/net/sunrpc/auth_null.c + * + * AUTH_NULL authentication. Really :-) + * + * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/utsname.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sched.h> + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static struct rpc_auth null_auth; +static struct rpc_cred null_cred; + +static struct rpc_auth * +nul_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor) +{ + atomic_inc(&null_auth.au_count); + return &null_auth; +} + +static void +nul_destroy(struct rpc_auth *auth) +{ +} + +/* + * Lookup NULL creds for current process + */ +static struct rpc_cred * +nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +{ + return get_rpccred(&null_cred); +} + +/* + * Destroy cred handle. + */ +static void +nul_destroy_cred(struct rpc_cred *cred) +{ +} + +/* + * Match cred handle against current process + */ +static int +nul_match(struct auth_cred *acred, struct rpc_cred *cred, int taskflags) +{ + return 1; +} + +/* + * Marshal credential. + */ +static u32 * +nul_marshal(struct rpc_task *task, u32 *p) +{ + *p++ = htonl(RPC_AUTH_NULL); + *p++ = 0; + *p++ = htonl(RPC_AUTH_NULL); + *p++ = 0; + + return p; +} + +/* + * Refresh credential. This is a no-op for AUTH_NULL + */ +static int +nul_refresh(struct rpc_task *task) +{ + task->tk_msg.rpc_cred->cr_flags |= RPCAUTH_CRED_UPTODATE; + return 0; +} + +static u32 * +nul_validate(struct rpc_task *task, u32 *p) +{ + rpc_authflavor_t flavor; + u32 size; + + flavor = ntohl(*p++); + if (flavor != RPC_AUTH_NULL) { + printk("RPC: bad verf flavor: %u\n", flavor); + return NULL; + } + + size = ntohl(*p++); + if (size != 0) { + printk("RPC: bad verf size: %u\n", size); + return NULL; + } + + return p; +} + +struct rpc_authops authnull_ops = { + .owner = THIS_MODULE, + .au_flavor = RPC_AUTH_NULL, +#ifdef RPC_DEBUG + .au_name = "NULL", +#endif + .create = nul_create, + .destroy = nul_destroy, + .lookup_cred = nul_lookup_cred, +}; + +static +struct rpc_auth null_auth = { + .au_cslack = 4, + .au_rslack = 2, + .au_ops = &authnull_ops, +}; + +static +struct rpc_credops null_credops = { + .cr_name = "AUTH_NULL", + .crdestroy = nul_destroy_cred, + .crmatch = nul_match, + .crmarshal = nul_marshal, + .crrefresh = nul_refresh, + .crvalidate = nul_validate, +}; + +static +struct rpc_cred null_cred = { + .cr_ops = &null_credops, + .cr_count = ATOMIC_INIT(1), + .cr_flags = RPCAUTH_CRED_UPTODATE, +#ifdef RPC_DEBUG + .cr_magic = RPCAUTH_CRED_MAGIC, +#endif +}; diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c new file mode 100644 index 000000000000..4ff297a9b15b --- /dev/null +++ b/net/sunrpc/auth_unix.c @@ -0,0 +1,242 @@ +/* + * linux/net/sunrpc/auth_unix.c + * + * UNIX-style authentication; no AUTH_SHORT support + * + * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/auth.h> + +#define NFS_NGROUPS 16 + +struct unx_cred { + struct rpc_cred uc_base; + gid_t uc_gid; + gid_t uc_gids[NFS_NGROUPS]; +}; +#define uc_uid uc_base.cr_uid +#define uc_count uc_base.cr_count +#define uc_flags uc_base.cr_flags +#define uc_expire uc_base.cr_expire + +#define UNX_CRED_EXPIRE (60 * HZ) + +#define UNX_WRITESLACK (21 + (UNX_MAXNODENAME >> 2)) + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static struct rpc_auth unix_auth; +static struct rpc_cred_cache unix_cred_cache; +static struct rpc_credops unix_credops; + +static struct rpc_auth * +unx_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor) +{ + dprintk("RPC: creating UNIX authenticator for client %p\n", clnt); + if (atomic_inc_return(&unix_auth.au_count) == 0) + unix_cred_cache.nextgc = jiffies + (unix_cred_cache.expire >> 1); + return &unix_auth; +} + +static void +unx_destroy(struct rpc_auth *auth) +{ + dprintk("RPC: destroying UNIX authenticator %p\n", auth); + rpcauth_free_credcache(auth); +} + +/* + * Lookup AUTH_UNIX creds for current process + */ +static struct rpc_cred * +unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +{ + return rpcauth_lookup_credcache(auth, acred, flags); +} + +static struct rpc_cred * +unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +{ + struct unx_cred *cred; + int i; + + dprintk("RPC: allocating UNIX cred for uid %d gid %d\n", + acred->uid, acred->gid); + + if (!(cred = (struct unx_cred *) kmalloc(sizeof(*cred), GFP_KERNEL))) + return ERR_PTR(-ENOMEM); + + atomic_set(&cred->uc_count, 1); + cred->uc_flags = RPCAUTH_CRED_UPTODATE; + if (flags & RPC_TASK_ROOTCREDS) { + cred->uc_uid = 0; + cred->uc_gid = 0; + cred->uc_gids[0] = NOGROUP; + } else { + int groups = acred->group_info->ngroups; + if (groups > NFS_NGROUPS) + groups = NFS_NGROUPS; + + cred->uc_uid = acred->uid; + cred->uc_gid = acred->gid; + for (i = 0; i < groups; i++) + cred->uc_gids[i] = GROUP_AT(acred->group_info, i); + if (i < NFS_NGROUPS) + cred->uc_gids[i] = NOGROUP; + } + cred->uc_base.cr_ops = &unix_credops; + + return (struct rpc_cred *) cred; +} + +static void +unx_destroy_cred(struct rpc_cred *cred) +{ + kfree(cred); +} + +/* + * Match credentials against current process creds. + * The root_override argument takes care of cases where the caller may + * request root creds (e.g. for NFS swapping). + */ +static int +unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int taskflags) +{ + struct unx_cred *cred = (struct unx_cred *) rcred; + int i; + + if (!(taskflags & RPC_TASK_ROOTCREDS)) { + int groups; + + if (cred->uc_uid != acred->uid + || cred->uc_gid != acred->gid) + return 0; + + groups = acred->group_info->ngroups; + if (groups > NFS_NGROUPS) + groups = NFS_NGROUPS; + for (i = 0; i < groups ; i++) + if (cred->uc_gids[i] != GROUP_AT(acred->group_info, i)) + return 0; + return 1; + } + return (cred->uc_uid == 0 + && cred->uc_gid == 0 + && cred->uc_gids[0] == (gid_t) NOGROUP); +} + +/* + * Marshal credentials. + * Maybe we should keep a cached credential for performance reasons. + */ +static u32 * +unx_marshal(struct rpc_task *task, u32 *p) +{ + struct rpc_clnt *clnt = task->tk_client; + struct unx_cred *cred = (struct unx_cred *) task->tk_msg.rpc_cred; + u32 *base, *hold; + int i; + + *p++ = htonl(RPC_AUTH_UNIX); + base = p++; + *p++ = htonl(jiffies/HZ); + + /* + * Copy the UTS nodename captured when the client was created. + */ + p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen); + + *p++ = htonl((u32) cred->uc_uid); + *p++ = htonl((u32) cred->uc_gid); + hold = p++; + for (i = 0; i < 16 && cred->uc_gids[i] != (gid_t) NOGROUP; i++) + *p++ = htonl((u32) cred->uc_gids[i]); + *hold = htonl(p - hold - 1); /* gid array length */ + *base = htonl((p - base - 1) << 2); /* cred length */ + + *p++ = htonl(RPC_AUTH_NULL); + *p++ = htonl(0); + + return p; +} + +/* + * Refresh credentials. This is a no-op for AUTH_UNIX + */ +static int +unx_refresh(struct rpc_task *task) +{ + task->tk_msg.rpc_cred->cr_flags |= RPCAUTH_CRED_UPTODATE; + return 0; +} + +static u32 * +unx_validate(struct rpc_task *task, u32 *p) +{ + rpc_authflavor_t flavor; + u32 size; + + flavor = ntohl(*p++); + if (flavor != RPC_AUTH_NULL && + flavor != RPC_AUTH_UNIX && + flavor != RPC_AUTH_SHORT) { + printk("RPC: bad verf flavor: %u\n", flavor); + return NULL; + } + + size = ntohl(*p++); + if (size > RPC_MAX_AUTH_SIZE) { + printk("RPC: giant verf size: %u\n", size); + return NULL; + } + task->tk_auth->au_rslack = (size >> 2) + 2; + p += (size >> 2); + + return p; +} + +struct rpc_authops authunix_ops = { + .owner = THIS_MODULE, + .au_flavor = RPC_AUTH_UNIX, +#ifdef RPC_DEBUG + .au_name = "UNIX", +#endif + .create = unx_create, + .destroy = unx_destroy, + .lookup_cred = unx_lookup_cred, + .crcreate = unx_create_cred, +}; + +static +struct rpc_cred_cache unix_cred_cache = { + .expire = UNX_CRED_EXPIRE, +}; + +static +struct rpc_auth unix_auth = { + .au_cslack = UNX_WRITESLACK, + .au_rslack = 2, /* assume AUTH_NULL verf */ + .au_ops = &authunix_ops, + .au_count = ATOMIC_INIT(0), + .au_credcache = &unix_cred_cache, +}; + +static +struct rpc_credops unix_credops = { + .cr_name = "AUTH_UNIX", + .crdestroy = unx_destroy_cred, + .crmatch = unx_match, + .crmarshal = unx_marshal, + .crrefresh = unx_refresh, + .crvalidate = unx_validate, +}; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c new file mode 100644 index 000000000000..900f5bc7e336 --- /dev/null +++ b/net/sunrpc/cache.c @@ -0,0 +1,1189 @@ +/* + * net/sunrpc/cache.c + * + * Generic code for various authentication-related caches + * used by sunrpc clients and servers. + * + * Copyright (C) 2002 Neil Brown <neilb@cse.unsw.edu.au> + * + * Released under terms in GPL version 2. See COPYING. + * + */ + +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kmod.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <asm/uaccess.h> +#include <linux/poll.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include <linux/net.h> +#include <linux/workqueue.h> +#include <asm/ioctls.h> +#include <linux/sunrpc/types.h> +#include <linux/sunrpc/cache.h> +#include <linux/sunrpc/stats.h> + +#define RPCDBG_FACILITY RPCDBG_CACHE + +static void cache_defer_req(struct cache_req *req, struct cache_head *item); +static void cache_revisit_request(struct cache_head *item); + +void cache_init(struct cache_head *h) +{ + time_t now = get_seconds(); + h->next = NULL; + h->flags = 0; + atomic_set(&h->refcnt, 1); + h->expiry_time = now + CACHE_NEW_EXPIRY; + h->last_refresh = now; +} + + +static int cache_make_upcall(struct cache_detail *detail, struct cache_head *h); +/* + * This is the generic cache management routine for all + * the authentication caches. + * It checks the currency of a cache item and will (later) + * initiate an upcall to fill it if needed. + * + * + * Returns 0 if the cache_head can be used, or cache_puts it and returns + * -EAGAIN if upcall is pending, + * -ENOENT if cache entry was negative + */ +int cache_check(struct cache_detail *detail, + struct cache_head *h, struct cache_req *rqstp) +{ + int rv; + long refresh_age, age; + + /* First decide return status as best we can */ + if (!test_bit(CACHE_VALID, &h->flags) || + h->expiry_time < get_seconds()) + rv = -EAGAIN; + else if (detail->flush_time > h->last_refresh) + rv = -EAGAIN; + else { + /* entry is valid */ + if (test_bit(CACHE_NEGATIVE, &h->flags)) + rv = -ENOENT; + else rv = 0; + } + + /* now see if we want to start an upcall */ + refresh_age = (h->expiry_time - h->last_refresh); + age = get_seconds() - h->last_refresh; + + if (rqstp == NULL) { + if (rv == -EAGAIN) + rv = -ENOENT; + } else if (rv == -EAGAIN || age > refresh_age/2) { + dprintk("Want update, refage=%ld, age=%ld\n", refresh_age, age); + if (!test_and_set_bit(CACHE_PENDING, &h->flags)) { + switch (cache_make_upcall(detail, h)) { + case -EINVAL: + clear_bit(CACHE_PENDING, &h->flags); + if (rv == -EAGAIN) { + set_bit(CACHE_NEGATIVE, &h->flags); + cache_fresh(detail, h, get_seconds()+CACHE_NEW_EXPIRY); + rv = -ENOENT; + } + break; + + case -EAGAIN: + clear_bit(CACHE_PENDING, &h->flags); + cache_revisit_request(h); + break; + } + } + } + + if (rv == -EAGAIN) + cache_defer_req(rqstp, h); + + if (rv && h) + detail->cache_put(h, detail); + return rv; +} + +static void queue_loose(struct cache_detail *detail, struct cache_head *ch); + +void cache_fresh(struct cache_detail *detail, + struct cache_head *head, time_t expiry) +{ + + head->expiry_time = expiry; + head->last_refresh = get_seconds(); + if (!test_and_set_bit(CACHE_VALID, &head->flags)) + cache_revisit_request(head); + if (test_and_clear_bit(CACHE_PENDING, &head->flags)) + queue_loose(detail, head); +} + +/* + * caches need to be periodically cleaned. + * For this we maintain a list of cache_detail and + * a current pointer into that list and into the table + * for that entry. + * + * Each time clean_cache is called it finds the next non-empty entry + * in the current table and walks the list in that entry + * looking for entries that can be removed. + * + * An entry gets removed if: + * - The expiry is before current time + * - The last_refresh time is before the flush_time for that cache + * + * later we might drop old entries with non-NEVER expiry if that table + * is getting 'full' for some definition of 'full' + * + * The question of "how often to scan a table" is an interesting one + * and is answered in part by the use of the "nextcheck" field in the + * cache_detail. + * When a scan of a table begins, the nextcheck field is set to a time + * that is well into the future. + * While scanning, if an expiry time is found that is earlier than the + * current nextcheck time, nextcheck is set to that expiry time. + * If the flush_time is ever set to a time earlier than the nextcheck + * time, the nextcheck time is then set to that flush_time. + * + * A table is then only scanned if the current time is at least + * the nextcheck time. + * + */ + +static LIST_HEAD(cache_list); +static DEFINE_SPINLOCK(cache_list_lock); +static struct cache_detail *current_detail; +static int current_index; + +static struct file_operations cache_file_operations; +static struct file_operations content_file_operations; +static struct file_operations cache_flush_operations; + +static void do_cache_clean(void *data); +static DECLARE_WORK(cache_cleaner, do_cache_clean, NULL); + +void cache_register(struct cache_detail *cd) +{ + cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc); + if (cd->proc_ent) { + struct proc_dir_entry *p; + cd->proc_ent->owner = THIS_MODULE; + cd->channel_ent = cd->content_ent = NULL; + + p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, + cd->proc_ent); + cd->flush_ent = p; + if (p) { + p->proc_fops = &cache_flush_operations; + p->owner = THIS_MODULE; + p->data = cd; + } + + if (cd->cache_request || cd->cache_parse) { + p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR, + cd->proc_ent); + cd->channel_ent = p; + if (p) { + p->proc_fops = &cache_file_operations; + p->owner = THIS_MODULE; + p->data = cd; + } + } + if (cd->cache_show) { + p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR, + cd->proc_ent); + cd->content_ent = p; + if (p) { + p->proc_fops = &content_file_operations; + p->owner = THIS_MODULE; + p->data = cd; + } + } + } + rwlock_init(&cd->hash_lock); + INIT_LIST_HEAD(&cd->queue); + spin_lock(&cache_list_lock); + cd->nextcheck = 0; + cd->entries = 0; + atomic_set(&cd->readers, 0); + cd->last_close = 0; + cd->last_warn = -1; + list_add(&cd->others, &cache_list); + spin_unlock(&cache_list_lock); + + /* start the cleaning process */ + schedule_work(&cache_cleaner); +} + +int cache_unregister(struct cache_detail *cd) +{ + cache_purge(cd); + spin_lock(&cache_list_lock); + write_lock(&cd->hash_lock); + if (cd->entries || atomic_read(&cd->inuse)) { + write_unlock(&cd->hash_lock); + spin_unlock(&cache_list_lock); + return -EBUSY; + } + if (current_detail == cd) + current_detail = NULL; + list_del_init(&cd->others); + write_unlock(&cd->hash_lock); + spin_unlock(&cache_list_lock); + if (cd->proc_ent) { + if (cd->flush_ent) + remove_proc_entry("flush", cd->proc_ent); + if (cd->channel_ent) + remove_proc_entry("channel", cd->proc_ent); + if (cd->content_ent) + remove_proc_entry("content", cd->proc_ent); + + cd->proc_ent = NULL; + remove_proc_entry(cd->name, proc_net_rpc); + } + if (list_empty(&cache_list)) { + /* module must be being unloaded so its safe to kill the worker */ + cancel_delayed_work(&cache_cleaner); + flush_scheduled_work(); + } + return 0; +} + +/* clean cache tries to find something to clean + * and cleans it. + * It returns 1 if it cleaned something, + * 0 if it didn't find anything this time + * -1 if it fell off the end of the list. + */ +static int cache_clean(void) +{ + int rv = 0; + struct list_head *next; + + spin_lock(&cache_list_lock); + + /* find a suitable table if we don't already have one */ + while (current_detail == NULL || + current_index >= current_detail->hash_size) { + if (current_detail) + next = current_detail->others.next; + else + next = cache_list.next; + if (next == &cache_list) { + current_detail = NULL; + spin_unlock(&cache_list_lock); + return -1; + } + current_detail = list_entry(next, struct cache_detail, others); + if (current_detail->nextcheck > get_seconds()) + current_index = current_detail->hash_size; + else { + current_index = 0; + current_detail->nextcheck = get_seconds()+30*60; + } + } + + /* find a non-empty bucket in the table */ + while (current_detail && + current_index < current_detail->hash_size && + current_detail->hash_table[current_index] == NULL) + current_index++; + + /* find a cleanable entry in the bucket and clean it, or set to next bucket */ + + if (current_detail && current_index < current_detail->hash_size) { + struct cache_head *ch, **cp; + struct cache_detail *d; + + write_lock(¤t_detail->hash_lock); + + /* Ok, now to clean this strand */ + + cp = & current_detail->hash_table[current_index]; + ch = *cp; + for (; ch; cp= & ch->next, ch= *cp) { + if (current_detail->nextcheck > ch->expiry_time) + current_detail->nextcheck = ch->expiry_time+1; + if (ch->expiry_time >= get_seconds() + && ch->last_refresh >= current_detail->flush_time + ) + continue; + if (test_and_clear_bit(CACHE_PENDING, &ch->flags)) + queue_loose(current_detail, ch); + + if (atomic_read(&ch->refcnt) == 1) + break; + } + if (ch) { + *cp = ch->next; + ch->next = NULL; + current_detail->entries--; + rv = 1; + } + write_unlock(¤t_detail->hash_lock); + d = current_detail; + if (!ch) + current_index ++; + spin_unlock(&cache_list_lock); + if (ch) + d->cache_put(ch, d); + } else + spin_unlock(&cache_list_lock); + + return rv; +} + +/* + * We want to regularly clean the cache, so we need to schedule some work ... + */ +static void do_cache_clean(void *data) +{ + int delay = 5; + if (cache_clean() == -1) + delay = 30*HZ; + + if (list_empty(&cache_list)) + delay = 0; + + if (delay) + schedule_delayed_work(&cache_cleaner, delay); +} + + +/* + * Clean all caches promptly. This just calls cache_clean + * repeatedly until we are sure that every cache has had a chance to + * be fully cleaned + */ +void cache_flush(void) +{ + while (cache_clean() != -1) + cond_resched(); + while (cache_clean() != -1) + cond_resched(); +} + +void cache_purge(struct cache_detail *detail) +{ + detail->flush_time = LONG_MAX; + detail->nextcheck = get_seconds(); + cache_flush(); + detail->flush_time = 1; +} + + + +/* + * Deferral and Revisiting of Requests. + * + * If a cache lookup finds a pending entry, we + * need to defer the request and revisit it later. + * All deferred requests are stored in a hash table, + * indexed by "struct cache_head *". + * As it may be wasteful to store a whole request + * structure, we allow the request to provide a + * deferred form, which must contain a + * 'struct cache_deferred_req' + * This cache_deferred_req contains a method to allow + * it to be revisited when cache info is available + */ + +#define DFR_HASHSIZE (PAGE_SIZE/sizeof(struct list_head)) +#define DFR_HASH(item) ((((long)item)>>4 ^ (((long)item)>>13)) % DFR_HASHSIZE) + +#define DFR_MAX 300 /* ??? */ + +static DEFINE_SPINLOCK(cache_defer_lock); +static LIST_HEAD(cache_defer_list); +static struct list_head cache_defer_hash[DFR_HASHSIZE]; +static int cache_defer_cnt; + +static void cache_defer_req(struct cache_req *req, struct cache_head *item) +{ + struct cache_deferred_req *dreq; + int hash = DFR_HASH(item); + + dreq = req->defer(req); + if (dreq == NULL) + return; + + dreq->item = item; + dreq->recv_time = get_seconds(); + + spin_lock(&cache_defer_lock); + + list_add(&dreq->recent, &cache_defer_list); + + if (cache_defer_hash[hash].next == NULL) + INIT_LIST_HEAD(&cache_defer_hash[hash]); + list_add(&dreq->hash, &cache_defer_hash[hash]); + + /* it is in, now maybe clean up */ + dreq = NULL; + if (++cache_defer_cnt > DFR_MAX) { + /* too much in the cache, randomly drop + * first or last + */ + if (net_random()&1) + dreq = list_entry(cache_defer_list.next, + struct cache_deferred_req, + recent); + else + dreq = list_entry(cache_defer_list.prev, + struct cache_deferred_req, + recent); + list_del(&dreq->recent); + list_del(&dreq->hash); + cache_defer_cnt--; + } + spin_unlock(&cache_defer_lock); + + if (dreq) { + /* there was one too many */ + dreq->revisit(dreq, 1); + } + if (test_bit(CACHE_VALID, &item->flags)) { + /* must have just been validated... */ + cache_revisit_request(item); + } +} + +static void cache_revisit_request(struct cache_head *item) +{ + struct cache_deferred_req *dreq; + struct list_head pending; + + struct list_head *lp; + int hash = DFR_HASH(item); + + INIT_LIST_HEAD(&pending); + spin_lock(&cache_defer_lock); + + lp = cache_defer_hash[hash].next; + if (lp) { + while (lp != &cache_defer_hash[hash]) { + dreq = list_entry(lp, struct cache_deferred_req, hash); + lp = lp->next; + if (dreq->item == item) { + list_del(&dreq->hash); + list_move(&dreq->recent, &pending); + cache_defer_cnt--; + } + } + } + spin_unlock(&cache_defer_lock); + + while (!list_empty(&pending)) { + dreq = list_entry(pending.next, struct cache_deferred_req, recent); + list_del_init(&dreq->recent); + dreq->revisit(dreq, 0); + } +} + +void cache_clean_deferred(void *owner) +{ + struct cache_deferred_req *dreq, *tmp; + struct list_head pending; + + + INIT_LIST_HEAD(&pending); + spin_lock(&cache_defer_lock); + + list_for_each_entry_safe(dreq, tmp, &cache_defer_list, recent) { + if (dreq->owner == owner) { + list_del(&dreq->hash); + list_move(&dreq->recent, &pending); + cache_defer_cnt--; + } + } + spin_unlock(&cache_defer_lock); + + while (!list_empty(&pending)) { + dreq = list_entry(pending.next, struct cache_deferred_req, recent); + list_del_init(&dreq->recent); + dreq->revisit(dreq, 1); + } +} + +/* + * communicate with user-space + * + * We have a magic /proc file - /proc/sunrpc/cache + * On read, you get a full request, or block + * On write, an update request is processed + * Poll works if anything to read, and always allows write + * + * Implemented by linked list of requests. Each open file has + * a ->private that also exists in this list. New request are added + * to the end and may wakeup and preceding readers. + * New readers are added to the head. If, on read, an item is found with + * CACHE_UPCALLING clear, we free it from the list. + * + */ + +static DEFINE_SPINLOCK(queue_lock); +static DECLARE_MUTEX(queue_io_sem); + +struct cache_queue { + struct list_head list; + int reader; /* if 0, then request */ +}; +struct cache_request { + struct cache_queue q; + struct cache_head *item; + char * buf; + int len; + int readers; +}; +struct cache_reader { + struct cache_queue q; + int offset; /* if non-0, we have a refcnt on next request */ +}; + +static ssize_t +cache_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) +{ + struct cache_reader *rp = filp->private_data; + struct cache_request *rq; + struct cache_detail *cd = PDE(filp->f_dentry->d_inode)->data; + int err; + + if (count == 0) + return 0; + + down(&queue_io_sem); /* protect against multiple concurrent + * readers on this file */ + again: + spin_lock(&queue_lock); + /* need to find next request */ + while (rp->q.list.next != &cd->queue && + list_entry(rp->q.list.next, struct cache_queue, list) + ->reader) { + struct list_head *next = rp->q.list.next; + list_move(&rp->q.list, next); + } + if (rp->q.list.next == &cd->queue) { + spin_unlock(&queue_lock); + up(&queue_io_sem); + if (rp->offset) + BUG(); + return 0; + } + rq = container_of(rp->q.list.next, struct cache_request, q.list); + if (rq->q.reader) BUG(); + if (rp->offset == 0) + rq->readers++; + spin_unlock(&queue_lock); + + if (rp->offset == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) { + err = -EAGAIN; + spin_lock(&queue_lock); + list_move(&rp->q.list, &rq->q.list); + spin_unlock(&queue_lock); + } else { + if (rp->offset + count > rq->len) + count = rq->len - rp->offset; + err = -EFAULT; + if (copy_to_user(buf, rq->buf + rp->offset, count)) + goto out; + rp->offset += count; + if (rp->offset >= rq->len) { + rp->offset = 0; + spin_lock(&queue_lock); + list_move(&rp->q.list, &rq->q.list); + spin_unlock(&queue_lock); + } + err = 0; + } + out: + if (rp->offset == 0) { + /* need to release rq */ + spin_lock(&queue_lock); + rq->readers--; + if (rq->readers == 0 && + !test_bit(CACHE_PENDING, &rq->item->flags)) { + list_del(&rq->q.list); + spin_unlock(&queue_lock); + cd->cache_put(rq->item, cd); + kfree(rq->buf); + kfree(rq); + } else + spin_unlock(&queue_lock); + } + if (err == -EAGAIN) + goto again; + up(&queue_io_sem); + return err ? err : count; +} + +static char write_buf[8192]; /* protected by queue_io_sem */ + +static ssize_t +cache_write(struct file *filp, const char __user *buf, size_t count, + loff_t *ppos) +{ + int err; + struct cache_detail *cd = PDE(filp->f_dentry->d_inode)->data; + + if (count == 0) + return 0; + if (count >= sizeof(write_buf)) + return -EINVAL; + + down(&queue_io_sem); + + if (copy_from_user(write_buf, buf, count)) { + up(&queue_io_sem); + return -EFAULT; + } + write_buf[count] = '\0'; + if (cd->cache_parse) + err = cd->cache_parse(cd, write_buf, count); + else + err = -EINVAL; + + up(&queue_io_sem); + return err ? err : count; +} + +static DECLARE_WAIT_QUEUE_HEAD(queue_wait); + +static unsigned int +cache_poll(struct file *filp, poll_table *wait) +{ + unsigned int mask; + struct cache_reader *rp = filp->private_data; + struct cache_queue *cq; + struct cache_detail *cd = PDE(filp->f_dentry->d_inode)->data; + + poll_wait(filp, &queue_wait, wait); + + /* alway allow write */ + mask = POLL_OUT | POLLWRNORM; + + if (!rp) + return mask; + + spin_lock(&queue_lock); + + for (cq= &rp->q; &cq->list != &cd->queue; + cq = list_entry(cq->list.next, struct cache_queue, list)) + if (!cq->reader) { + mask |= POLLIN | POLLRDNORM; + break; + } + spin_unlock(&queue_lock); + return mask; +} + +static int +cache_ioctl(struct inode *ino, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + int len = 0; + struct cache_reader *rp = filp->private_data; + struct cache_queue *cq; + struct cache_detail *cd = PDE(ino)->data; + + if (cmd != FIONREAD || !rp) + return -EINVAL; + + spin_lock(&queue_lock); + + /* only find the length remaining in current request, + * or the length of the next request + */ + for (cq= &rp->q; &cq->list != &cd->queue; + cq = list_entry(cq->list.next, struct cache_queue, list)) + if (!cq->reader) { + struct cache_request *cr = + container_of(cq, struct cache_request, q); + len = cr->len - rp->offset; + break; + } + spin_unlock(&queue_lock); + + return put_user(len, (int __user *)arg); +} + +static int +cache_open(struct inode *inode, struct file *filp) +{ + struct cache_reader *rp = NULL; + + nonseekable_open(inode, filp); + if (filp->f_mode & FMODE_READ) { + struct cache_detail *cd = PDE(inode)->data; + + rp = kmalloc(sizeof(*rp), GFP_KERNEL); + if (!rp) + return -ENOMEM; + rp->offset = 0; + rp->q.reader = 1; + atomic_inc(&cd->readers); + spin_lock(&queue_lock); + list_add(&rp->q.list, &cd->queue); + spin_unlock(&queue_lock); + } + filp->private_data = rp; + return 0; +} + +static int +cache_release(struct inode *inode, struct file *filp) +{ + struct cache_reader *rp = filp->private_data; + struct cache_detail *cd = PDE(inode)->data; + + if (rp) { + spin_lock(&queue_lock); + if (rp->offset) { + struct cache_queue *cq; + for (cq= &rp->q; &cq->list != &cd->queue; + cq = list_entry(cq->list.next, struct cache_queue, list)) + if (!cq->reader) { + container_of(cq, struct cache_request, q) + ->readers--; + break; + } + rp->offset = 0; + } + list_del(&rp->q.list); + spin_unlock(&queue_lock); + + filp->private_data = NULL; + kfree(rp); + + cd->last_close = get_seconds(); + atomic_dec(&cd->readers); + } + return 0; +} + + + +static struct file_operations cache_file_operations = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = cache_read, + .write = cache_write, + .poll = cache_poll, + .ioctl = cache_ioctl, /* for FIONREAD */ + .open = cache_open, + .release = cache_release, +}; + + +static void queue_loose(struct cache_detail *detail, struct cache_head *ch) +{ + struct cache_queue *cq; + spin_lock(&queue_lock); + list_for_each_entry(cq, &detail->queue, list) + if (!cq->reader) { + struct cache_request *cr = container_of(cq, struct cache_request, q); + if (cr->item != ch) + continue; + if (cr->readers != 0) + break; + list_del(&cr->q.list); + spin_unlock(&queue_lock); + detail->cache_put(cr->item, detail); + kfree(cr->buf); + kfree(cr); + return; + } + spin_unlock(&queue_lock); +} + +/* + * Support routines for text-based upcalls. + * Fields are separated by spaces. + * Fields are either mangled to quote space tab newline slosh with slosh + * or a hexified with a leading \x + * Record is terminated with newline. + * + */ + +void qword_add(char **bpp, int *lp, char *str) +{ + char *bp = *bpp; + int len = *lp; + char c; + + if (len < 0) return; + + while ((c=*str++) && len) + switch(c) { + case ' ': + case '\t': + case '\n': + case '\\': + if (len >= 4) { + *bp++ = '\\'; + *bp++ = '0' + ((c & 0300)>>6); + *bp++ = '0' + ((c & 0070)>>3); + *bp++ = '0' + ((c & 0007)>>0); + } + len -= 4; + break; + default: + *bp++ = c; + len--; + } + if (c || len <1) len = -1; + else { + *bp++ = ' '; + len--; + } + *bpp = bp; + *lp = len; +} + +void qword_addhex(char **bpp, int *lp, char *buf, int blen) +{ + char *bp = *bpp; + int len = *lp; + + if (len < 0) return; + + if (len > 2) { + *bp++ = '\\'; + *bp++ = 'x'; + len -= 2; + while (blen && len >= 2) { + unsigned char c = *buf++; + *bp++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1); + *bp++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1); + len -= 2; + blen--; + } + } + if (blen || len<1) len = -1; + else { + *bp++ = ' '; + len--; + } + *bpp = bp; + *lp = len; +} + +static void warn_no_listener(struct cache_detail *detail) +{ + if (detail->last_warn != detail->last_close) { + detail->last_warn = detail->last_close; + if (detail->warn_no_listener) + detail->warn_no_listener(detail); + } +} + +/* + * register an upcall request to user-space. + * Each request is at most one page long. + */ +static int cache_make_upcall(struct cache_detail *detail, struct cache_head *h) +{ + + char *buf; + struct cache_request *crq; + char *bp; + int len; + + if (detail->cache_request == NULL) + return -EINVAL; + + if (atomic_read(&detail->readers) == 0 && + detail->last_close < get_seconds() - 30) { + warn_no_listener(detail); + return -EINVAL; + } + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + return -EAGAIN; + + crq = kmalloc(sizeof (*crq), GFP_KERNEL); + if (!crq) { + kfree(buf); + return -EAGAIN; + } + + bp = buf; len = PAGE_SIZE; + + detail->cache_request(detail, h, &bp, &len); + + if (len < 0) { + kfree(buf); + kfree(crq); + return -EAGAIN; + } + crq->q.reader = 0; + crq->item = cache_get(h); + crq->buf = buf; + crq->len = PAGE_SIZE - len; + crq->readers = 0; + spin_lock(&queue_lock); + list_add_tail(&crq->q.list, &detail->queue); + spin_unlock(&queue_lock); + wake_up(&queue_wait); + return 0; +} + +/* + * parse a message from user-space and pass it + * to an appropriate cache + * Messages are, like requests, separated into fields by + * spaces and dequotes as \xHEXSTRING or embedded \nnn octal + * + * Message is + * reply cachename expiry key ... content.... + * + * key and content are both parsed by cache + */ + +#define isodigit(c) (isdigit(c) && c <= '7') +int qword_get(char **bpp, char *dest, int bufsize) +{ + /* return bytes copied, or -1 on error */ + char *bp = *bpp; + int len = 0; + + while (*bp == ' ') bp++; + + if (bp[0] == '\\' && bp[1] == 'x') { + /* HEX STRING */ + bp += 2; + while (isxdigit(bp[0]) && isxdigit(bp[1]) && len < bufsize) { + int byte = isdigit(*bp) ? *bp-'0' : toupper(*bp)-'A'+10; + bp++; + byte <<= 4; + byte |= isdigit(*bp) ? *bp-'0' : toupper(*bp)-'A'+10; + *dest++ = byte; + bp++; + len++; + } + } else { + /* text with \nnn octal quoting */ + while (*bp != ' ' && *bp != '\n' && *bp && len < bufsize-1) { + if (*bp == '\\' && + isodigit(bp[1]) && (bp[1] <= '3') && + isodigit(bp[2]) && + isodigit(bp[3])) { + int byte = (*++bp -'0'); + bp++; + byte = (byte << 3) | (*bp++ - '0'); + byte = (byte << 3) | (*bp++ - '0'); + *dest++ = byte; + len++; + } else { + *dest++ = *bp++; + len++; + } + } + } + + if (*bp != ' ' && *bp != '\n' && *bp != '\0') + return -1; + while (*bp == ' ') bp++; + *bpp = bp; + *dest = '\0'; + return len; +} + + +/* + * support /proc/sunrpc/cache/$CACHENAME/content + * as a seqfile. + * We call ->cache_show passing NULL for the item to + * get a header, then pass each real item in the cache + */ + +struct handle { + struct cache_detail *cd; +}; + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + unsigned hash, entry; + struct cache_head *ch; + struct cache_detail *cd = ((struct handle*)m->private)->cd; + + + read_lock(&cd->hash_lock); + if (!n--) + return SEQ_START_TOKEN; + hash = n >> 32; + entry = n & ((1LL<<32) - 1); + + for (ch=cd->hash_table[hash]; ch; ch=ch->next) + if (!entry--) + return ch; + n &= ~((1LL<<32) - 1); + do { + hash++; + n += 1LL<<32; + } while(hash < cd->hash_size && + cd->hash_table[hash]==NULL); + if (hash >= cd->hash_size) + return NULL; + *pos = n+1; + return cd->hash_table[hash]; +} + +static void *c_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct cache_head *ch = p; + int hash = (*pos >> 32); + struct cache_detail *cd = ((struct handle*)m->private)->cd; + + if (p == SEQ_START_TOKEN) + hash = 0; + else if (ch->next == NULL) { + hash++; + *pos += 1LL<<32; + } else { + ++*pos; + return ch->next; + } + *pos &= ~((1LL<<32) - 1); + while (hash < cd->hash_size && + cd->hash_table[hash] == NULL) { + hash++; + *pos += 1LL<<32; + } + if (hash >= cd->hash_size) + return NULL; + ++*pos; + return cd->hash_table[hash]; +} + +static void c_stop(struct seq_file *m, void *p) +{ + struct cache_detail *cd = ((struct handle*)m->private)->cd; + read_unlock(&cd->hash_lock); +} + +static int c_show(struct seq_file *m, void *p) +{ + struct cache_head *cp = p; + struct cache_detail *cd = ((struct handle*)m->private)->cd; + + if (p == SEQ_START_TOKEN) + return cd->cache_show(m, cd, NULL); + + ifdebug(CACHE) + seq_printf(m, "# expiry=%ld refcnt=%d\n", + cp->expiry_time, atomic_read(&cp->refcnt)); + cache_get(cp); + if (cache_check(cd, cp, NULL)) + /* cache_check does a cache_put on failure */ + seq_printf(m, "# "); + else + cache_put(cp, cd); + + return cd->cache_show(m, cd, cp); +} + +static struct seq_operations cache_content_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = c_show, +}; + +static int content_open(struct inode *inode, struct file *file) +{ + int res; + struct handle *han; + struct cache_detail *cd = PDE(inode)->data; + + han = kmalloc(sizeof(*han), GFP_KERNEL); + if (han == NULL) + return -ENOMEM; + + han->cd = cd; + + res = seq_open(file, &cache_content_op); + if (res) + kfree(han); + else + ((struct seq_file *)file->private_data)->private = han; + + return res; +} +static int content_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct handle *han = m->private; + kfree(han); + m->private = NULL; + return seq_release(inode, file); +} + +static struct file_operations content_file_operations = { + .open = content_open, + .read = seq_read, + .llseek = seq_lseek, + .release = content_release, +}; + +static ssize_t read_flush(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct cache_detail *cd = PDE(file->f_dentry->d_inode)->data; + char tbuf[20]; + unsigned long p = *ppos; + int len; + + sprintf(tbuf, "%lu\n", cd->flush_time); + len = strlen(tbuf); + if (p >= len) + return 0; + len -= p; + if (len > count) len = count; + if (copy_to_user(buf, (void*)(tbuf+p), len)) + len = -EFAULT; + else + *ppos += len; + return len; +} + +static ssize_t write_flush(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + struct cache_detail *cd = PDE(file->f_dentry->d_inode)->data; + char tbuf[20]; + char *ep; + long flushtime; + if (*ppos || count > sizeof(tbuf)-1) + return -EINVAL; + if (copy_from_user(tbuf, buf, count)) + return -EFAULT; + tbuf[count] = 0; + flushtime = simple_strtoul(tbuf, &ep, 0); + if (*ep && *ep != '\n') + return -EINVAL; + + cd->flush_time = flushtime; + cd->nextcheck = get_seconds(); + cache_flush(); + + *ppos += count; + return count; +} + +static struct file_operations cache_flush_operations = { + .open = nonseekable_open, + .read = read_flush, + .write = write_flush, +}; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c new file mode 100644 index 000000000000..02bc029d46fe --- /dev/null +++ b/net/sunrpc/clnt.c @@ -0,0 +1,1085 @@ +/* + * linux/net/sunrpc/rpcclnt.c + * + * This file contains the high-level RPC interface. + * It is modeled as a finite state machine to support both synchronous + * and asynchronous requests. + * + * - RPC header generation and argument serialization. + * - Credential refresh. + * - TCP connect handling. + * - Retry of operation when it is suspected the operation failed because + * of uid squashing on the server, or when the credentials were stale + * and need to be refreshed, or when a packet was damaged in transit. + * This may be have to be moved to the VFS layer. + * + * NB: BSD uses a more intelligent approach to guessing when a request + * or reply has been lost by keeping the RTO estimate for each procedure. + * We currently make do with a constant timeout value. + * + * Copyright (C) 1992,1993 Rick Sladkey <jrs@world.std.com> + * Copyright (C) 1995,1996 Olaf Kirch <okir@monad.swb.de> + */ + +#include <asm/system.h> + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/in.h> +#include <linux/utsname.h> + +#include <linux/sunrpc/clnt.h> +#include <linux/workqueue.h> +#include <linux/sunrpc/rpc_pipe_fs.h> + +#include <linux/nfs.h> + + +#define RPC_SLACK_SPACE (1024) /* total overkill */ + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_CALL +#endif + +static DECLARE_WAIT_QUEUE_HEAD(destroy_wait); + + +static void call_start(struct rpc_task *task); +static void call_reserve(struct rpc_task *task); +static void call_reserveresult(struct rpc_task *task); +static void call_allocate(struct rpc_task *task); +static void call_encode(struct rpc_task *task); +static void call_decode(struct rpc_task *task); +static void call_bind(struct rpc_task *task); +static void call_transmit(struct rpc_task *task); +static void call_status(struct rpc_task *task); +static void call_refresh(struct rpc_task *task); +static void call_refreshresult(struct rpc_task *task); +static void call_timeout(struct rpc_task *task); +static void call_connect(struct rpc_task *task); +static void call_connect_status(struct rpc_task *task); +static u32 * call_header(struct rpc_task *task); +static u32 * call_verify(struct rpc_task *task); + + +static int +rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name) +{ + static uint32_t clntid; + int error; + + if (dir_name == NULL) + return 0; + for (;;) { + snprintf(clnt->cl_pathname, sizeof(clnt->cl_pathname), + "%s/clnt%x", dir_name, + (unsigned int)clntid++); + clnt->cl_pathname[sizeof(clnt->cl_pathname) - 1] = '\0'; + clnt->cl_dentry = rpc_mkdir(clnt->cl_pathname, clnt); + if (!IS_ERR(clnt->cl_dentry)) + return 0; + error = PTR_ERR(clnt->cl_dentry); + if (error != -EEXIST) { + printk(KERN_INFO "RPC: Couldn't create pipefs entry %s, error %d\n", + clnt->cl_pathname, error); + return error; + } + } +} + +/* + * Create an RPC client + * FIXME: This should also take a flags argument (as in task->tk_flags). + * It's called (among others) from pmap_create_client, which may in + * turn be called by an async task. In this case, rpciod should not be + * made to sleep too long. + */ +struct rpc_clnt * +rpc_create_client(struct rpc_xprt *xprt, char *servname, + struct rpc_program *program, u32 vers, + rpc_authflavor_t flavor) +{ + struct rpc_version *version; + struct rpc_clnt *clnt = NULL; + int err; + int len; + + dprintk("RPC: creating %s client for %s (xprt %p)\n", + program->name, servname, xprt); + + err = -EINVAL; + if (!xprt) + goto out_err; + if (vers >= program->nrvers || !(version = program->version[vers])) + goto out_err; + + err = -ENOMEM; + clnt = (struct rpc_clnt *) kmalloc(sizeof(*clnt), GFP_KERNEL); + if (!clnt) + goto out_err; + memset(clnt, 0, sizeof(*clnt)); + atomic_set(&clnt->cl_users, 0); + atomic_set(&clnt->cl_count, 1); + clnt->cl_parent = clnt; + + clnt->cl_server = clnt->cl_inline_name; + len = strlen(servname) + 1; + if (len > sizeof(clnt->cl_inline_name)) { + char *buf = kmalloc(len, GFP_KERNEL); + if (buf != 0) + clnt->cl_server = buf; + else + len = sizeof(clnt->cl_inline_name); + } + strlcpy(clnt->cl_server, servname, len); + + clnt->cl_xprt = xprt; + clnt->cl_procinfo = version->procs; + clnt->cl_maxproc = version->nrprocs; + clnt->cl_protname = program->name; + clnt->cl_pmap = &clnt->cl_pmap_default; + clnt->cl_port = xprt->addr.sin_port; + clnt->cl_prog = program->number; + clnt->cl_vers = version->number; + clnt->cl_prot = xprt->prot; + clnt->cl_stats = program->stats; + rpc_init_wait_queue(&clnt->cl_pmap_default.pm_bindwait, "bindwait"); + + if (!clnt->cl_port) + clnt->cl_autobind = 1; + + clnt->cl_rtt = &clnt->cl_rtt_default; + rpc_init_rtt(&clnt->cl_rtt_default, xprt->timeout.to_initval); + + err = rpc_setup_pipedir(clnt, program->pipe_dir_name); + if (err < 0) + goto out_no_path; + + err = -ENOMEM; + if (!rpcauth_create(flavor, clnt)) { + printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n", + flavor); + goto out_no_auth; + } + + /* save the nodename */ + clnt->cl_nodelen = strlen(system_utsname.nodename); + if (clnt->cl_nodelen > UNX_MAXNODENAME) + clnt->cl_nodelen = UNX_MAXNODENAME; + memcpy(clnt->cl_nodename, system_utsname.nodename, clnt->cl_nodelen); + return clnt; + +out_no_auth: + rpc_rmdir(clnt->cl_pathname); +out_no_path: + if (clnt->cl_server != clnt->cl_inline_name) + kfree(clnt->cl_server); + kfree(clnt); +out_err: + return ERR_PTR(err); +} + +/* + * This function clones the RPC client structure. It allows us to share the + * same transport while varying parameters such as the authentication + * flavour. + */ +struct rpc_clnt * +rpc_clone_client(struct rpc_clnt *clnt) +{ + struct rpc_clnt *new; + + new = (struct rpc_clnt *)kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + goto out_no_clnt; + memcpy(new, clnt, sizeof(*new)); + atomic_set(&new->cl_count, 1); + atomic_set(&new->cl_users, 0); + new->cl_parent = clnt; + atomic_inc(&clnt->cl_count); + /* Duplicate portmapper */ + rpc_init_wait_queue(&new->cl_pmap_default.pm_bindwait, "bindwait"); + /* Turn off autobind on clones */ + new->cl_autobind = 0; + new->cl_oneshot = 0; + new->cl_dead = 0; + rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval); + if (new->cl_auth) + atomic_inc(&new->cl_auth->au_count); + return new; +out_no_clnt: + printk(KERN_INFO "RPC: out of memory in %s\n", __FUNCTION__); + return ERR_PTR(-ENOMEM); +} + +/* + * Properly shut down an RPC client, terminating all outstanding + * requests. Note that we must be certain that cl_oneshot and + * cl_dead are cleared, or else the client would be destroyed + * when the last task releases it. + */ +int +rpc_shutdown_client(struct rpc_clnt *clnt) +{ + dprintk("RPC: shutting down %s client for %s, tasks=%d\n", + clnt->cl_protname, clnt->cl_server, + atomic_read(&clnt->cl_users)); + + while (atomic_read(&clnt->cl_users) > 0) { + /* Don't let rpc_release_client destroy us */ + clnt->cl_oneshot = 0; + clnt->cl_dead = 0; + rpc_killall_tasks(clnt); + sleep_on_timeout(&destroy_wait, 1*HZ); + } + + if (atomic_read(&clnt->cl_users) < 0) { + printk(KERN_ERR "RPC: rpc_shutdown_client clnt %p tasks=%d\n", + clnt, atomic_read(&clnt->cl_users)); +#ifdef RPC_DEBUG + rpc_show_tasks(); +#endif + BUG(); + } + + return rpc_destroy_client(clnt); +} + +/* + * Delete an RPC client + */ +int +rpc_destroy_client(struct rpc_clnt *clnt) +{ + if (!atomic_dec_and_test(&clnt->cl_count)) + return 1; + BUG_ON(atomic_read(&clnt->cl_users) != 0); + + dprintk("RPC: destroying %s client for %s\n", + clnt->cl_protname, clnt->cl_server); + if (clnt->cl_auth) { + rpcauth_destroy(clnt->cl_auth); + clnt->cl_auth = NULL; + } + if (clnt->cl_parent != clnt) { + rpc_destroy_client(clnt->cl_parent); + goto out_free; + } + if (clnt->cl_pathname[0]) + rpc_rmdir(clnt->cl_pathname); + if (clnt->cl_xprt) { + xprt_destroy(clnt->cl_xprt); + clnt->cl_xprt = NULL; + } + if (clnt->cl_server != clnt->cl_inline_name) + kfree(clnt->cl_server); +out_free: + kfree(clnt); + return 0; +} + +/* + * Release an RPC client + */ +void +rpc_release_client(struct rpc_clnt *clnt) +{ + dprintk("RPC: rpc_release_client(%p, %d)\n", + clnt, atomic_read(&clnt->cl_users)); + + if (!atomic_dec_and_test(&clnt->cl_users)) + return; + wake_up(&destroy_wait); + if (clnt->cl_oneshot || clnt->cl_dead) + rpc_destroy_client(clnt); +} + +/* + * Default callback for async RPC calls + */ +static void +rpc_default_callback(struct rpc_task *task) +{ +} + +/* + * Export the signal mask handling for aysnchronous code that + * sleeps on RPC calls + */ + +void rpc_clnt_sigmask(struct rpc_clnt *clnt, sigset_t *oldset) +{ + unsigned long sigallow = sigmask(SIGKILL); + unsigned long irqflags; + + /* Turn off various signals */ + if (clnt->cl_intr) { + struct k_sigaction *action = current->sighand->action; + if (action[SIGINT-1].sa.sa_handler == SIG_DFL) + sigallow |= sigmask(SIGINT); + if (action[SIGQUIT-1].sa.sa_handler == SIG_DFL) + sigallow |= sigmask(SIGQUIT); + } + spin_lock_irqsave(¤t->sighand->siglock, irqflags); + *oldset = current->blocked; + siginitsetinv(¤t->blocked, sigallow & ~oldset->sig[0]); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); +} + +void rpc_clnt_sigunmask(struct rpc_clnt *clnt, sigset_t *oldset) +{ + unsigned long irqflags; + + spin_lock_irqsave(¤t->sighand->siglock, irqflags); + current->blocked = *oldset; + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); +} + +/* + * New rpc_call implementation + */ +int rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) +{ + struct rpc_task *task; + sigset_t oldset; + int status; + + /* If this client is slain all further I/O fails */ + if (clnt->cl_dead) + return -EIO; + + BUG_ON(flags & RPC_TASK_ASYNC); + + rpc_clnt_sigmask(clnt, &oldset); + + status = -ENOMEM; + task = rpc_new_task(clnt, NULL, flags); + if (task == NULL) + goto out; + + rpc_call_setup(task, msg, 0); + + /* Set up the call info struct and execute the task */ + if (task->tk_status == 0) + status = rpc_execute(task); + else { + status = task->tk_status; + rpc_release_task(task); + } + +out: + rpc_clnt_sigunmask(clnt, &oldset); + + return status; +} + +/* + * New rpc_call implementation + */ +int +rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags, + rpc_action callback, void *data) +{ + struct rpc_task *task; + sigset_t oldset; + int status; + + /* If this client is slain all further I/O fails */ + if (clnt->cl_dead) + return -EIO; + + flags |= RPC_TASK_ASYNC; + + rpc_clnt_sigmask(clnt, &oldset); + + /* Create/initialize a new RPC task */ + if (!callback) + callback = rpc_default_callback; + status = -ENOMEM; + if (!(task = rpc_new_task(clnt, callback, flags))) + goto out; + task->tk_calldata = data; + + rpc_call_setup(task, msg, 0); + + /* Set up the call info struct and execute the task */ + status = task->tk_status; + if (status == 0) + rpc_execute(task); + else + rpc_release_task(task); + +out: + rpc_clnt_sigunmask(clnt, &oldset); + + return status; +} + + +void +rpc_call_setup(struct rpc_task *task, struct rpc_message *msg, int flags) +{ + task->tk_msg = *msg; + task->tk_flags |= flags; + /* Bind the user cred */ + if (task->tk_msg.rpc_cred != NULL) + rpcauth_holdcred(task); + else + rpcauth_bindcred(task); + + if (task->tk_status == 0) + task->tk_action = call_start; + else + task->tk_action = NULL; +} + +void +rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize) +{ + struct rpc_xprt *xprt = clnt->cl_xprt; + + xprt->sndsize = 0; + if (sndsize) + xprt->sndsize = sndsize + RPC_SLACK_SPACE; + xprt->rcvsize = 0; + if (rcvsize) + xprt->rcvsize = rcvsize + RPC_SLACK_SPACE; + if (xprt_connected(xprt)) + xprt_sock_setbufsize(xprt); +} + +/* + * Return size of largest payload RPC client can support, in bytes + * + * For stream transports, this is one RPC record fragment (see RFC + * 1831), as we don't support multi-record requests yet. For datagram + * transports, this is the size of an IP packet minus the IP, UDP, and + * RPC header sizes. + */ +size_t rpc_max_payload(struct rpc_clnt *clnt) +{ + return clnt->cl_xprt->max_payload; +} +EXPORT_SYMBOL(rpc_max_payload); + +/* + * Restart an (async) RPC call. Usually called from within the + * exit handler. + */ +void +rpc_restart_call(struct rpc_task *task) +{ + if (RPC_ASSASSINATED(task)) + return; + + task->tk_action = call_start; +} + +/* + * 0. Initial state + * + * Other FSM states can be visited zero or more times, but + * this state is visited exactly once for each RPC. + */ +static void +call_start(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + + dprintk("RPC: %4d call_start %s%d proc %d (%s)\n", task->tk_pid, + clnt->cl_protname, clnt->cl_vers, task->tk_msg.rpc_proc->p_proc, + (RPC_IS_ASYNC(task) ? "async" : "sync")); + + /* Increment call count */ + task->tk_msg.rpc_proc->p_count++; + clnt->cl_stats->rpccnt++; + task->tk_action = call_reserve; +} + +/* + * 1. Reserve an RPC call slot + */ +static void +call_reserve(struct rpc_task *task) +{ + dprintk("RPC: %4d call_reserve\n", task->tk_pid); + + if (!rpcauth_uptodatecred(task)) { + task->tk_action = call_refresh; + return; + } + + task->tk_status = 0; + task->tk_action = call_reserveresult; + xprt_reserve(task); +} + +/* + * 1b. Grok the result of xprt_reserve() + */ +static void +call_reserveresult(struct rpc_task *task) +{ + int status = task->tk_status; + + dprintk("RPC: %4d call_reserveresult (status %d)\n", + task->tk_pid, task->tk_status); + + /* + * After a call to xprt_reserve(), we must have either + * a request slot or else an error status. + */ + task->tk_status = 0; + if (status >= 0) { + if (task->tk_rqstp) { + task->tk_action = call_allocate; + return; + } + + printk(KERN_ERR "%s: status=%d, but no request slot, exiting\n", + __FUNCTION__, status); + rpc_exit(task, -EIO); + return; + } + + /* + * Even though there was an error, we may have acquired + * a request slot somehow. Make sure not to leak it. + */ + if (task->tk_rqstp) { + printk(KERN_ERR "%s: status=%d, request allocated anyway\n", + __FUNCTION__, status); + xprt_release(task); + } + + switch (status) { + case -EAGAIN: /* woken up; retry */ + task->tk_action = call_reserve; + return; + case -EIO: /* probably a shutdown */ + break; + default: + printk(KERN_ERR "%s: unrecognized error %d, exiting\n", + __FUNCTION__, status); + break; + } + rpc_exit(task, status); +} + +/* + * 2. Allocate the buffer. For details, see sched.c:rpc_malloc. + * (Note: buffer memory is freed in rpc_task_release). + */ +static void +call_allocate(struct rpc_task *task) +{ + unsigned int bufsiz; + + dprintk("RPC: %4d call_allocate (status %d)\n", + task->tk_pid, task->tk_status); + task->tk_action = call_bind; + if (task->tk_buffer) + return; + + /* FIXME: compute buffer requirements more exactly using + * auth->au_wslack */ + bufsiz = task->tk_msg.rpc_proc->p_bufsiz + RPC_SLACK_SPACE; + + if (rpc_malloc(task, bufsiz << 1) != NULL) + return; + printk(KERN_INFO "RPC: buffer allocation failed for task %p\n", task); + + if (RPC_IS_ASYNC(task) || !(task->tk_client->cl_intr && signalled())) { + xprt_release(task); + task->tk_action = call_reserve; + rpc_delay(task, HZ>>4); + return; + } + + rpc_exit(task, -ERESTARTSYS); +} + +/* + * 3. Encode arguments of an RPC call + */ +static void +call_encode(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_rqst *req = task->tk_rqstp; + struct xdr_buf *sndbuf = &req->rq_snd_buf; + struct xdr_buf *rcvbuf = &req->rq_rcv_buf; + unsigned int bufsiz; + kxdrproc_t encode; + int status; + u32 *p; + + dprintk("RPC: %4d call_encode (status %d)\n", + task->tk_pid, task->tk_status); + + /* Default buffer setup */ + bufsiz = task->tk_bufsize >> 1; + sndbuf->head[0].iov_base = (void *)task->tk_buffer; + sndbuf->head[0].iov_len = bufsiz; + sndbuf->tail[0].iov_len = 0; + sndbuf->page_len = 0; + sndbuf->len = 0; + sndbuf->buflen = bufsiz; + rcvbuf->head[0].iov_base = (void *)((char *)task->tk_buffer + bufsiz); + rcvbuf->head[0].iov_len = bufsiz; + rcvbuf->tail[0].iov_len = 0; + rcvbuf->page_len = 0; + rcvbuf->len = 0; + rcvbuf->buflen = bufsiz; + + /* Encode header and provided arguments */ + encode = task->tk_msg.rpc_proc->p_encode; + if (!(p = call_header(task))) { + printk(KERN_INFO "RPC: call_header failed, exit EIO\n"); + rpc_exit(task, -EIO); + return; + } + if (encode && (status = rpcauth_wrap_req(task, encode, req, p, + task->tk_msg.rpc_argp)) < 0) { + printk(KERN_WARNING "%s: can't encode arguments: %d\n", + clnt->cl_protname, -status); + rpc_exit(task, status); + } +} + +/* + * 4. Get the server port number if not yet set + */ +static void +call_bind(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_xprt *xprt = clnt->cl_xprt; + + dprintk("RPC: %4d call_bind xprt %p %s connected\n", task->tk_pid, + xprt, (xprt_connected(xprt) ? "is" : "is not")); + + task->tk_action = (xprt_connected(xprt)) ? call_transmit : call_connect; + + if (!clnt->cl_port) { + task->tk_action = call_connect; + task->tk_timeout = RPC_CONNECT_TIMEOUT; + rpc_getport(task, clnt); + } +} + +/* + * 4a. Connect to the RPC server (TCP case) + */ +static void +call_connect(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + + dprintk("RPC: %4d call_connect status %d\n", + task->tk_pid, task->tk_status); + + if (xprt_connected(clnt->cl_xprt)) { + task->tk_action = call_transmit; + return; + } + task->tk_action = call_connect_status; + if (task->tk_status < 0) + return; + xprt_connect(task); +} + +/* + * 4b. Sort out connect result + */ +static void +call_connect_status(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + int status = task->tk_status; + + task->tk_status = 0; + if (status >= 0) { + clnt->cl_stats->netreconn++; + task->tk_action = call_transmit; + return; + } + + /* Something failed: we may have to rebind */ + if (clnt->cl_autobind) + clnt->cl_port = 0; + switch (status) { + case -ENOTCONN: + case -ETIMEDOUT: + case -EAGAIN: + task->tk_action = (clnt->cl_port == 0) ? call_bind : call_connect; + break; + default: + rpc_exit(task, -EIO); + } +} + +/* + * 5. Transmit the RPC request, and wait for reply + */ +static void +call_transmit(struct rpc_task *task) +{ + dprintk("RPC: %4d call_transmit (status %d)\n", + task->tk_pid, task->tk_status); + + task->tk_action = call_status; + if (task->tk_status < 0) + return; + task->tk_status = xprt_prepare_transmit(task); + if (task->tk_status != 0) + return; + /* Encode here so that rpcsec_gss can use correct sequence number. */ + if (!task->tk_rqstp->rq_bytes_sent) + call_encode(task); + if (task->tk_status < 0) + return; + xprt_transmit(task); + if (task->tk_status < 0) + return; + if (!task->tk_msg.rpc_proc->p_decode) { + task->tk_action = NULL; + rpc_wake_up_task(task); + } +} + +/* + * 6. Sort out the RPC call status + */ +static void +call_status(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_rqst *req = task->tk_rqstp; + int status; + + if (req->rq_received > 0 && !req->rq_bytes_sent) + task->tk_status = req->rq_received; + + dprintk("RPC: %4d call_status (status %d)\n", + task->tk_pid, task->tk_status); + + status = task->tk_status; + if (status >= 0) { + task->tk_action = call_decode; + return; + } + + task->tk_status = 0; + switch(status) { + case -ETIMEDOUT: + task->tk_action = call_timeout; + break; + case -ECONNREFUSED: + case -ENOTCONN: + req->rq_bytes_sent = 0; + if (clnt->cl_autobind) + clnt->cl_port = 0; + task->tk_action = call_bind; + break; + case -EAGAIN: + task->tk_action = call_transmit; + break; + case -EIO: + /* shutdown or soft timeout */ + rpc_exit(task, status); + break; + default: + if (clnt->cl_chatty) + printk("%s: RPC call returned error %d\n", + clnt->cl_protname, -status); + rpc_exit(task, status); + break; + } +} + +/* + * 6a. Handle RPC timeout + * We do not release the request slot, so we keep using the + * same XID for all retransmits. + */ +static void +call_timeout(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + + if (xprt_adjust_timeout(task->tk_rqstp) == 0) { + dprintk("RPC: %4d call_timeout (minor)\n", task->tk_pid); + goto retry; + } + + dprintk("RPC: %4d call_timeout (major)\n", task->tk_pid); + if (RPC_IS_SOFT(task)) { + if (clnt->cl_chatty) + printk(KERN_NOTICE "%s: server %s not responding, timed out\n", + clnt->cl_protname, clnt->cl_server); + rpc_exit(task, -EIO); + return; + } + + if (clnt->cl_chatty && !(task->tk_flags & RPC_CALL_MAJORSEEN)) { + task->tk_flags |= RPC_CALL_MAJORSEEN; + printk(KERN_NOTICE "%s: server %s not responding, still trying\n", + clnt->cl_protname, clnt->cl_server); + } + if (clnt->cl_autobind) + clnt->cl_port = 0; + +retry: + clnt->cl_stats->rpcretrans++; + task->tk_action = call_bind; + task->tk_status = 0; +} + +/* + * 7. Decode the RPC reply + */ +static void +call_decode(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_rqst *req = task->tk_rqstp; + kxdrproc_t decode = task->tk_msg.rpc_proc->p_decode; + u32 *p; + + dprintk("RPC: %4d call_decode (status %d)\n", + task->tk_pid, task->tk_status); + + if (clnt->cl_chatty && (task->tk_flags & RPC_CALL_MAJORSEEN)) { + printk(KERN_NOTICE "%s: server %s OK\n", + clnt->cl_protname, clnt->cl_server); + task->tk_flags &= ~RPC_CALL_MAJORSEEN; + } + + if (task->tk_status < 12) { + if (!RPC_IS_SOFT(task)) { + task->tk_action = call_bind; + clnt->cl_stats->rpcretrans++; + goto out_retry; + } + printk(KERN_WARNING "%s: too small RPC reply size (%d bytes)\n", + clnt->cl_protname, task->tk_status); + rpc_exit(task, -EIO); + return; + } + + req->rq_rcv_buf.len = req->rq_private_buf.len; + + /* Check that the softirq receive buffer is valid */ + WARN_ON(memcmp(&req->rq_rcv_buf, &req->rq_private_buf, + sizeof(req->rq_rcv_buf)) != 0); + + /* Verify the RPC header */ + if (!(p = call_verify(task))) { + if (task->tk_action == NULL) + return; + goto out_retry; + } + + task->tk_action = NULL; + + if (decode) + task->tk_status = rpcauth_unwrap_resp(task, decode, req, p, + task->tk_msg.rpc_resp); + dprintk("RPC: %4d call_decode result %d\n", task->tk_pid, + task->tk_status); + return; +out_retry: + req->rq_received = req->rq_private_buf.len = 0; + task->tk_status = 0; +} + +/* + * 8. Refresh the credentials if rejected by the server + */ +static void +call_refresh(struct rpc_task *task) +{ + dprintk("RPC: %4d call_refresh\n", task->tk_pid); + + xprt_release(task); /* Must do to obtain new XID */ + task->tk_action = call_refreshresult; + task->tk_status = 0; + task->tk_client->cl_stats->rpcauthrefresh++; + rpcauth_refreshcred(task); +} + +/* + * 8a. Process the results of a credential refresh + */ +static void +call_refreshresult(struct rpc_task *task) +{ + int status = task->tk_status; + dprintk("RPC: %4d call_refreshresult (status %d)\n", + task->tk_pid, task->tk_status); + + task->tk_status = 0; + task->tk_action = call_reserve; + if (status >= 0 && rpcauth_uptodatecred(task)) + return; + if (status == -EACCES) { + rpc_exit(task, -EACCES); + return; + } + task->tk_action = call_refresh; + if (status != -ETIMEDOUT) + rpc_delay(task, 3*HZ); + return; +} + +/* + * Call header serialization + */ +static u32 * +call_header(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_xprt *xprt = clnt->cl_xprt; + struct rpc_rqst *req = task->tk_rqstp; + u32 *p = req->rq_svec[0].iov_base; + + /* FIXME: check buffer size? */ + if (xprt->stream) + *p++ = 0; /* fill in later */ + *p++ = req->rq_xid; /* XID */ + *p++ = htonl(RPC_CALL); /* CALL */ + *p++ = htonl(RPC_VERSION); /* RPC version */ + *p++ = htonl(clnt->cl_prog); /* program number */ + *p++ = htonl(clnt->cl_vers); /* program version */ + *p++ = htonl(task->tk_msg.rpc_proc->p_proc); /* procedure */ + return rpcauth_marshcred(task, p); +} + +/* + * Reply header verification + */ +static u32 * +call_verify(struct rpc_task *task) +{ + struct kvec *iov = &task->tk_rqstp->rq_rcv_buf.head[0]; + int len = task->tk_rqstp->rq_rcv_buf.len >> 2; + u32 *p = iov->iov_base, n; + int error = -EACCES; + + if ((len -= 3) < 0) + goto out_overflow; + p += 1; /* skip XID */ + + if ((n = ntohl(*p++)) != RPC_REPLY) { + printk(KERN_WARNING "call_verify: not an RPC reply: %x\n", n); + goto out_retry; + } + if ((n = ntohl(*p++)) != RPC_MSG_ACCEPTED) { + if (--len < 0) + goto out_overflow; + switch ((n = ntohl(*p++))) { + case RPC_AUTH_ERROR: + break; + case RPC_MISMATCH: + printk(KERN_WARNING "%s: RPC call version mismatch!\n", __FUNCTION__); + goto out_eio; + default: + printk(KERN_WARNING "%s: RPC call rejected, unknown error: %x\n", __FUNCTION__, n); + goto out_eio; + } + if (--len < 0) + goto out_overflow; + switch ((n = ntohl(*p++))) { + case RPC_AUTH_REJECTEDCRED: + case RPC_AUTH_REJECTEDVERF: + case RPCSEC_GSS_CREDPROBLEM: + case RPCSEC_GSS_CTXPROBLEM: + if (!task->tk_cred_retry) + break; + task->tk_cred_retry--; + dprintk("RPC: %4d call_verify: retry stale creds\n", + task->tk_pid); + rpcauth_invalcred(task); + task->tk_action = call_refresh; + return NULL; + case RPC_AUTH_BADCRED: + case RPC_AUTH_BADVERF: + /* possibly garbled cred/verf? */ + if (!task->tk_garb_retry) + break; + task->tk_garb_retry--; + dprintk("RPC: %4d call_verify: retry garbled creds\n", + task->tk_pid); + task->tk_action = call_bind; + return NULL; + case RPC_AUTH_TOOWEAK: + printk(KERN_NOTICE "call_verify: server requires stronger " + "authentication.\n"); + break; + default: + printk(KERN_WARNING "call_verify: unknown auth error: %x\n", n); + error = -EIO; + } + dprintk("RPC: %4d call_verify: call rejected %d\n", + task->tk_pid, n); + goto out_err; + } + if (!(p = rpcauth_checkverf(task, p))) { + printk(KERN_WARNING "call_verify: auth check failed\n"); + goto out_retry; /* bad verifier, retry */ + } + len = p - (u32 *)iov->iov_base - 1; + if (len < 0) + goto out_overflow; + switch ((n = ntohl(*p++))) { + case RPC_SUCCESS: + return p; + case RPC_PROG_UNAVAIL: + printk(KERN_WARNING "RPC: call_verify: program %u is unsupported by server %s\n", + (unsigned int)task->tk_client->cl_prog, + task->tk_client->cl_server); + goto out_eio; + case RPC_PROG_MISMATCH: + printk(KERN_WARNING "RPC: call_verify: program %u, version %u unsupported by server %s\n", + (unsigned int)task->tk_client->cl_prog, + (unsigned int)task->tk_client->cl_vers, + task->tk_client->cl_server); + goto out_eio; + case RPC_PROC_UNAVAIL: + printk(KERN_WARNING "RPC: call_verify: proc %p unsupported by program %u, version %u on server %s\n", + task->tk_msg.rpc_proc, + task->tk_client->cl_prog, + task->tk_client->cl_vers, + task->tk_client->cl_server); + goto out_eio; + case RPC_GARBAGE_ARGS: + dprintk("RPC: %4d %s: server saw garbage\n", task->tk_pid, __FUNCTION__); + break; /* retry */ + default: + printk(KERN_WARNING "call_verify: server accept status: %x\n", n); + /* Also retry */ + } + +out_retry: + task->tk_client->cl_stats->rpcgarbage++; + if (task->tk_garb_retry) { + task->tk_garb_retry--; + dprintk(KERN_WARNING "RPC %s: retrying %4d\n", __FUNCTION__, task->tk_pid); + task->tk_action = call_bind; + return NULL; + } + printk(KERN_WARNING "RPC %s: retry failed, exit EIO\n", __FUNCTION__); +out_eio: + error = -EIO; +out_err: + rpc_exit(task, error); + return NULL; +out_overflow: + printk(KERN_WARNING "RPC %s: server reply was truncated.\n", __FUNCTION__); + goto out_retry; +} diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c new file mode 100644 index 000000000000..d0b1d2c34a4d --- /dev/null +++ b/net/sunrpc/pmap_clnt.c @@ -0,0 +1,298 @@ +/* + * linux/net/sunrpc/pmap.c + * + * Portmapper client. + * + * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/uio.h> +#include <linux/in.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/xprt.h> +#include <linux/sunrpc/sched.h> + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_PMAP +#endif + +#define PMAP_SET 1 +#define PMAP_UNSET 2 +#define PMAP_GETPORT 3 + +static struct rpc_procinfo pmap_procedures[]; +static struct rpc_clnt * pmap_create(char *, struct sockaddr_in *, int); +static void pmap_getport_done(struct rpc_task *); +static struct rpc_program pmap_program; +static DEFINE_SPINLOCK(pmap_lock); + +/* + * Obtain the port for a given RPC service on a given host. This one can + * be called for an ongoing RPC request. + */ +void +rpc_getport(struct rpc_task *task, struct rpc_clnt *clnt) +{ + struct rpc_portmap *map = clnt->cl_pmap; + struct sockaddr_in *sap = &clnt->cl_xprt->addr; + struct rpc_message msg = { + .rpc_proc = &pmap_procedures[PMAP_GETPORT], + .rpc_argp = map, + .rpc_resp = &clnt->cl_port, + .rpc_cred = NULL + }; + struct rpc_clnt *pmap_clnt; + struct rpc_task *child; + + dprintk("RPC: %4d rpc_getport(%s, %d, %d, %d)\n", + task->tk_pid, clnt->cl_server, + map->pm_prog, map->pm_vers, map->pm_prot); + + spin_lock(&pmap_lock); + if (map->pm_binding) { + rpc_sleep_on(&map->pm_bindwait, task, NULL, NULL); + spin_unlock(&pmap_lock); + return; + } + map->pm_binding = 1; + spin_unlock(&pmap_lock); + + pmap_clnt = pmap_create(clnt->cl_server, sap, map->pm_prot); + if (IS_ERR(pmap_clnt)) { + task->tk_status = PTR_ERR(pmap_clnt); + goto bailout; + } + task->tk_status = 0; + + /* + * Note: rpc_new_child will release client after a failure. + */ + if (!(child = rpc_new_child(pmap_clnt, task))) + goto bailout; + + /* Setup the call info struct */ + rpc_call_setup(child, &msg, 0); + + /* ... and run the child task */ + rpc_run_child(task, child, pmap_getport_done); + return; + +bailout: + spin_lock(&pmap_lock); + map->pm_binding = 0; + rpc_wake_up(&map->pm_bindwait); + spin_unlock(&pmap_lock); + task->tk_status = -EIO; + task->tk_action = NULL; +} + +#ifdef CONFIG_ROOT_NFS +int +rpc_getport_external(struct sockaddr_in *sin, __u32 prog, __u32 vers, int prot) +{ + struct rpc_portmap map = { + .pm_prog = prog, + .pm_vers = vers, + .pm_prot = prot, + .pm_port = 0 + }; + struct rpc_clnt *pmap_clnt; + char hostname[32]; + int status; + + dprintk("RPC: rpc_getport_external(%u.%u.%u.%u, %d, %d, %d)\n", + NIPQUAD(sin->sin_addr.s_addr), prog, vers, prot); + + sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(sin->sin_addr.s_addr)); + pmap_clnt = pmap_create(hostname, sin, prot); + if (IS_ERR(pmap_clnt)) + return PTR_ERR(pmap_clnt); + + /* Setup the call info struct */ + status = rpc_call(pmap_clnt, PMAP_GETPORT, &map, &map.pm_port, 0); + + if (status >= 0) { + if (map.pm_port != 0) + return map.pm_port; + status = -EACCES; + } + return status; +} +#endif + +static void +pmap_getport_done(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_portmap *map = clnt->cl_pmap; + + dprintk("RPC: %4d pmap_getport_done(status %d, port %d)\n", + task->tk_pid, task->tk_status, clnt->cl_port); + if (task->tk_status < 0) { + /* Make the calling task exit with an error */ + task->tk_action = NULL; + } else if (clnt->cl_port == 0) { + /* Program not registered */ + task->tk_status = -EACCES; + task->tk_action = NULL; + } else { + /* byte-swap port number first */ + clnt->cl_port = htons(clnt->cl_port); + clnt->cl_xprt->addr.sin_port = clnt->cl_port; + } + spin_lock(&pmap_lock); + map->pm_binding = 0; + rpc_wake_up(&map->pm_bindwait); + spin_unlock(&pmap_lock); +} + +/* + * Set or unset a port registration with the local portmapper. + * port == 0 means unregister, port != 0 means register. + */ +int +rpc_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) +{ + struct sockaddr_in sin; + struct rpc_portmap map; + struct rpc_clnt *pmap_clnt; + int error = 0; + + dprintk("RPC: registering (%d, %d, %d, %d) with portmapper.\n", + prog, vers, prot, port); + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + pmap_clnt = pmap_create("localhost", &sin, IPPROTO_UDP); + if (IS_ERR(pmap_clnt)) { + error = PTR_ERR(pmap_clnt); + dprintk("RPC: couldn't create pmap client. Error = %d\n", error); + return error; + } + + map.pm_prog = prog; + map.pm_vers = vers; + map.pm_prot = prot; + map.pm_port = port; + + error = rpc_call(pmap_clnt, port? PMAP_SET : PMAP_UNSET, + &map, okay, 0); + + if (error < 0) { + printk(KERN_WARNING + "RPC: failed to contact portmap (errno %d).\n", + error); + } + dprintk("RPC: registration status %d/%d\n", error, *okay); + + /* Client deleted automatically because cl_oneshot == 1 */ + return error; +} + +static struct rpc_clnt * +pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto) +{ + struct rpc_xprt *xprt; + struct rpc_clnt *clnt; + + /* printk("pmap: create xprt\n"); */ + xprt = xprt_create_proto(proto, srvaddr, NULL); + if (IS_ERR(xprt)) + return (struct rpc_clnt *)xprt; + xprt->addr.sin_port = htons(RPC_PMAP_PORT); + + /* printk("pmap: create clnt\n"); */ + clnt = rpc_create_client(xprt, hostname, + &pmap_program, RPC_PMAP_VERSION, + RPC_AUTH_UNIX); + if (IS_ERR(clnt)) { + xprt_destroy(xprt); + } else { + clnt->cl_softrtry = 1; + clnt->cl_chatty = 1; + clnt->cl_oneshot = 1; + } + return clnt; +} + +/* + * XDR encode/decode functions for PMAP + */ +static int +xdr_encode_mapping(struct rpc_rqst *req, u32 *p, struct rpc_portmap *map) +{ + dprintk("RPC: xdr_encode_mapping(%d, %d, %d, %d)\n", + map->pm_prog, map->pm_vers, map->pm_prot, map->pm_port); + *p++ = htonl(map->pm_prog); + *p++ = htonl(map->pm_vers); + *p++ = htonl(map->pm_prot); + *p++ = htonl(map->pm_port); + + req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + return 0; +} + +static int +xdr_decode_port(struct rpc_rqst *req, u32 *p, unsigned short *portp) +{ + *portp = (unsigned short) ntohl(*p++); + return 0; +} + +static int +xdr_decode_bool(struct rpc_rqst *req, u32 *p, unsigned int *boolp) +{ + *boolp = (unsigned int) ntohl(*p++); + return 0; +} + +static struct rpc_procinfo pmap_procedures[] = { +[PMAP_SET] = { + .p_proc = PMAP_SET, + .p_encode = (kxdrproc_t) xdr_encode_mapping, + .p_decode = (kxdrproc_t) xdr_decode_bool, + .p_bufsiz = 4, + .p_count = 1, + }, +[PMAP_UNSET] = { + .p_proc = PMAP_UNSET, + .p_encode = (kxdrproc_t) xdr_encode_mapping, + .p_decode = (kxdrproc_t) xdr_decode_bool, + .p_bufsiz = 4, + .p_count = 1, + }, +[PMAP_GETPORT] = { + .p_proc = PMAP_GETPORT, + .p_encode = (kxdrproc_t) xdr_encode_mapping, + .p_decode = (kxdrproc_t) xdr_decode_port, + .p_bufsiz = 4, + .p_count = 1, + }, +}; + +static struct rpc_version pmap_version2 = { + .number = 2, + .nrprocs = 4, + .procs = pmap_procedures +}; + +static struct rpc_version * pmap_version[] = { + NULL, + NULL, + &pmap_version2 +}; + +static struct rpc_stat pmap_stats; + +static struct rpc_program pmap_program = { + .name = "portmap", + .number = RPC_PMAP_PROGRAM, + .nrvers = ARRAY_SIZE(pmap_version), + .version = pmap_version, + .stats = &pmap_stats, +}; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c new file mode 100644 index 000000000000..554f224c0445 --- /dev/null +++ b/net/sunrpc/rpc_pipe.c @@ -0,0 +1,838 @@ +/* + * net/sunrpc/rpc_pipe.c + * + * Userland/kernel interface for rpcauth_gss. + * Code shamelessly plagiarized from fs/nfsd/nfsctl.c + * and fs/driverfs/inode.c + * + * Copyright (c) 2002, Trond Myklebust <trond.myklebust@fys.uio.no> + * + */ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/pagemap.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/dnotify.h> +#include <linux/kernel.h> + +#include <asm/ioctls.h> +#include <linux/fs.h> +#include <linux/poll.h> +#include <linux/wait.h> +#include <linux/seq_file.h> + +#include <linux/sunrpc/clnt.h> +#include <linux/workqueue.h> +#include <linux/sunrpc/rpc_pipe_fs.h> + +static struct vfsmount *rpc_mount; +static int rpc_mount_count; + +static struct file_system_type rpc_pipe_fs_type; + + +static kmem_cache_t *rpc_inode_cachep; + +#define RPC_UPCALL_TIMEOUT (30*HZ) + +static void +__rpc_purge_upcall(struct inode *inode, int err) +{ + struct rpc_inode *rpci = RPC_I(inode); + struct rpc_pipe_msg *msg; + + while (!list_empty(&rpci->pipe)) { + msg = list_entry(rpci->pipe.next, struct rpc_pipe_msg, list); + list_del_init(&msg->list); + msg->errno = err; + rpci->ops->destroy_msg(msg); + } + while (!list_empty(&rpci->in_upcall)) { + msg = list_entry(rpci->pipe.next, struct rpc_pipe_msg, list); + list_del_init(&msg->list); + msg->errno = err; + rpci->ops->destroy_msg(msg); + } + rpci->pipelen = 0; + wake_up(&rpci->waitq); +} + +static void +rpc_timeout_upcall_queue(void *data) +{ + struct rpc_inode *rpci = (struct rpc_inode *)data; + struct inode *inode = &rpci->vfs_inode; + + down(&inode->i_sem); + if (rpci->nreaders == 0 && !list_empty(&rpci->pipe)) + __rpc_purge_upcall(inode, -ETIMEDOUT); + up(&inode->i_sem); +} + +int +rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg) +{ + struct rpc_inode *rpci = RPC_I(inode); + int res = 0; + + down(&inode->i_sem); + if (rpci->nreaders) { + list_add_tail(&msg->list, &rpci->pipe); + rpci->pipelen += msg->len; + } else if (rpci->flags & RPC_PIPE_WAIT_FOR_OPEN) { + if (list_empty(&rpci->pipe)) + schedule_delayed_work(&rpci->queue_timeout, + RPC_UPCALL_TIMEOUT); + list_add_tail(&msg->list, &rpci->pipe); + rpci->pipelen += msg->len; + } else + res = -EPIPE; + up(&inode->i_sem); + wake_up(&rpci->waitq); + return res; +} + +static void +rpc_close_pipes(struct inode *inode) +{ + struct rpc_inode *rpci = RPC_I(inode); + + cancel_delayed_work(&rpci->queue_timeout); + flush_scheduled_work(); + down(&inode->i_sem); + if (rpci->ops != NULL) { + rpci->nreaders = 0; + __rpc_purge_upcall(inode, -EPIPE); + rpci->nwriters = 0; + if (rpci->ops->release_pipe) + rpci->ops->release_pipe(inode); + rpci->ops = NULL; + } + up(&inode->i_sem); +} + +static inline void +rpc_inode_setowner(struct inode *inode, void *private) +{ + RPC_I(inode)->private = private; +} + +static struct inode * +rpc_alloc_inode(struct super_block *sb) +{ + struct rpc_inode *rpci; + rpci = (struct rpc_inode *)kmem_cache_alloc(rpc_inode_cachep, SLAB_KERNEL); + if (!rpci) + return NULL; + return &rpci->vfs_inode; +} + +static void +rpc_destroy_inode(struct inode *inode) +{ + kmem_cache_free(rpc_inode_cachep, RPC_I(inode)); +} + +static int +rpc_pipe_open(struct inode *inode, struct file *filp) +{ + struct rpc_inode *rpci = RPC_I(inode); + int res = -ENXIO; + + down(&inode->i_sem); + if (rpci->ops != NULL) { + if (filp->f_mode & FMODE_READ) + rpci->nreaders ++; + if (filp->f_mode & FMODE_WRITE) + rpci->nwriters ++; + res = 0; + } + up(&inode->i_sem); + return res; +} + +static int +rpc_pipe_release(struct inode *inode, struct file *filp) +{ + struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); + struct rpc_pipe_msg *msg; + + down(&inode->i_sem); + if (rpci->ops == NULL) + goto out; + msg = (struct rpc_pipe_msg *)filp->private_data; + if (msg != NULL) { + msg->errno = -EPIPE; + list_del_init(&msg->list); + rpci->ops->destroy_msg(msg); + } + if (filp->f_mode & FMODE_WRITE) + rpci->nwriters --; + if (filp->f_mode & FMODE_READ) + rpci->nreaders --; + if (!rpci->nreaders) + __rpc_purge_upcall(inode, -EPIPE); + if (rpci->ops->release_pipe) + rpci->ops->release_pipe(inode); +out: + up(&inode->i_sem); + return 0; +} + +static ssize_t +rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct rpc_inode *rpci = RPC_I(inode); + struct rpc_pipe_msg *msg; + int res = 0; + + down(&inode->i_sem); + if (rpci->ops == NULL) { + res = -EPIPE; + goto out_unlock; + } + msg = filp->private_data; + if (msg == NULL) { + if (!list_empty(&rpci->pipe)) { + msg = list_entry(rpci->pipe.next, + struct rpc_pipe_msg, + list); + list_move(&msg->list, &rpci->in_upcall); + rpci->pipelen -= msg->len; + filp->private_data = msg; + msg->copied = 0; + } + if (msg == NULL) + goto out_unlock; + } + /* NOTE: it is up to the callback to update msg->copied */ + res = rpci->ops->upcall(filp, msg, buf, len); + if (res < 0 || msg->len == msg->copied) { + filp->private_data = NULL; + list_del_init(&msg->list); + rpci->ops->destroy_msg(msg); + } +out_unlock: + up(&inode->i_sem); + return res; +} + +static ssize_t +rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *offset) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct rpc_inode *rpci = RPC_I(inode); + int res; + + down(&inode->i_sem); + res = -EPIPE; + if (rpci->ops != NULL) + res = rpci->ops->downcall(filp, buf, len); + up(&inode->i_sem); + return res; +} + +static unsigned int +rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait) +{ + struct rpc_inode *rpci; + unsigned int mask = 0; + + rpci = RPC_I(filp->f_dentry->d_inode); + poll_wait(filp, &rpci->waitq, wait); + + mask = POLLOUT | POLLWRNORM; + if (rpci->ops == NULL) + mask |= POLLERR | POLLHUP; + if (!list_empty(&rpci->pipe)) + mask |= POLLIN | POLLRDNORM; + return mask; +} + +static int +rpc_pipe_ioctl(struct inode *ino, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); + int len; + + switch (cmd) { + case FIONREAD: + if (rpci->ops == NULL) + return -EPIPE; + len = rpci->pipelen; + if (filp->private_data) { + struct rpc_pipe_msg *msg; + msg = (struct rpc_pipe_msg *)filp->private_data; + len += msg->len - msg->copied; + } + return put_user(len, (int __user *)arg); + default: + return -EINVAL; + } +} + +static struct file_operations rpc_pipe_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = rpc_pipe_read, + .write = rpc_pipe_write, + .poll = rpc_pipe_poll, + .ioctl = rpc_pipe_ioctl, + .open = rpc_pipe_open, + .release = rpc_pipe_release, +}; + +static int +rpc_show_info(struct seq_file *m, void *v) +{ + struct rpc_clnt *clnt = m->private; + + seq_printf(m, "RPC server: %s\n", clnt->cl_server); + seq_printf(m, "service: %s (%d) version %d\n", clnt->cl_protname, + clnt->cl_prog, clnt->cl_vers); + seq_printf(m, "address: %u.%u.%u.%u\n", + NIPQUAD(clnt->cl_xprt->addr.sin_addr.s_addr)); + seq_printf(m, "protocol: %s\n", + clnt->cl_xprt->prot == IPPROTO_UDP ? "udp" : "tcp"); + return 0; +} + +static int +rpc_info_open(struct inode *inode, struct file *file) +{ + struct rpc_clnt *clnt; + int ret = single_open(file, rpc_show_info, NULL); + + if (!ret) { + struct seq_file *m = file->private_data; + down(&inode->i_sem); + clnt = RPC_I(inode)->private; + if (clnt) { + atomic_inc(&clnt->cl_users); + m->private = clnt; + } else { + single_release(inode, file); + ret = -EINVAL; + } + up(&inode->i_sem); + } + return ret; +} + +static int +rpc_info_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct rpc_clnt *clnt = (struct rpc_clnt *)m->private; + + if (clnt) + rpc_release_client(clnt); + return single_release(inode, file); +} + +static struct file_operations rpc_info_operations = { + .owner = THIS_MODULE, + .open = rpc_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = rpc_info_release, +}; + + +/* + * We have a single directory with 1 node in it. + */ +enum { + RPCAUTH_Root = 1, + RPCAUTH_lockd, + RPCAUTH_mount, + RPCAUTH_nfs, + RPCAUTH_portmap, + RPCAUTH_statd, + RPCAUTH_RootEOF +}; + +/* + * Description of fs contents. + */ +struct rpc_filelist { + char *name; + struct file_operations *i_fop; + int mode; +}; + +static struct rpc_filelist files[] = { + [RPCAUTH_lockd] = { + .name = "lockd", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, + [RPCAUTH_mount] = { + .name = "mount", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, + [RPCAUTH_nfs] = { + .name = "nfs", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, + [RPCAUTH_portmap] = { + .name = "portmap", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, + [RPCAUTH_statd] = { + .name = "statd", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, +}; + +enum { + RPCAUTH_info = 2, + RPCAUTH_EOF +}; + +static struct rpc_filelist authfiles[] = { + [RPCAUTH_info] = { + .name = "info", + .i_fop = &rpc_info_operations, + .mode = S_IFREG | S_IRUSR, + }, +}; + +static int +rpc_get_mount(void) +{ + return simple_pin_fs("rpc_pipefs", &rpc_mount, &rpc_mount_count); +} + +static void +rpc_put_mount(void) +{ + simple_release_fs(&rpc_mount, &rpc_mount_count); +} + +static int +rpc_lookup_parent(char *path, struct nameidata *nd) +{ + if (path[0] == '\0') + return -ENOENT; + if (rpc_get_mount()) { + printk(KERN_WARNING "%s: %s failed to mount " + "pseudofilesystem \n", __FILE__, __FUNCTION__); + return -ENODEV; + } + nd->mnt = mntget(rpc_mount); + nd->dentry = dget(rpc_mount->mnt_root); + nd->last_type = LAST_ROOT; + nd->flags = LOOKUP_PARENT; + nd->depth = 0; + + if (path_walk(path, nd)) { + printk(KERN_WARNING "%s: %s failed to find path %s\n", + __FILE__, __FUNCTION__, path); + rpc_put_mount(); + return -ENOENT; + } + return 0; +} + +static void +rpc_release_path(struct nameidata *nd) +{ + path_release(nd); + rpc_put_mount(); +} + +static struct inode * +rpc_get_inode(struct super_block *sb, int mode) +{ + struct inode *inode = new_inode(sb); + if (!inode) + return NULL; + inode->i_mode = mode; + inode->i_uid = inode->i_gid = 0; + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_blocks = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch(mode & S_IFMT) { + case S_IFDIR: + inode->i_fop = &simple_dir_operations; + inode->i_op = &simple_dir_inode_operations; + inode->i_nlink++; + default: + break; + } + return inode; +} + +/* + * FIXME: This probably has races. + */ +static void +rpc_depopulate(struct dentry *parent) +{ + struct inode *dir = parent->d_inode; + struct list_head *pos, *next; + struct dentry *dentry, *dvec[10]; + int n = 0; + + down(&dir->i_sem); +repeat: + spin_lock(&dcache_lock); + list_for_each_safe(pos, next, &parent->d_subdirs) { + dentry = list_entry(pos, struct dentry, d_child); + spin_lock(&dentry->d_lock); + if (!d_unhashed(dentry)) { + dget_locked(dentry); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + dvec[n++] = dentry; + if (n == ARRAY_SIZE(dvec)) + break; + } else + spin_unlock(&dentry->d_lock); + } + spin_unlock(&dcache_lock); + if (n) { + do { + dentry = dvec[--n]; + if (dentry->d_inode) { + rpc_close_pipes(dentry->d_inode); + rpc_inode_setowner(dentry->d_inode, NULL); + simple_unlink(dir, dentry); + } + dput(dentry); + } while (n); + goto repeat; + } + up(&dir->i_sem); +} + +static int +rpc_populate(struct dentry *parent, + struct rpc_filelist *files, + int start, int eof) +{ + struct inode *inode, *dir = parent->d_inode; + void *private = RPC_I(dir)->private; + struct dentry *dentry; + int mode, i; + + down(&dir->i_sem); + for (i = start; i < eof; i++) { + dentry = d_alloc_name(parent, files[i].name); + if (!dentry) + goto out_bad; + mode = files[i].mode; + inode = rpc_get_inode(dir->i_sb, mode); + if (!inode) { + dput(dentry); + goto out_bad; + } + inode->i_ino = i; + if (files[i].i_fop) + inode->i_fop = files[i].i_fop; + if (private) + rpc_inode_setowner(inode, private); + if (S_ISDIR(mode)) + dir->i_nlink++; + d_add(dentry, inode); + } + up(&dir->i_sem); + return 0; +out_bad: + up(&dir->i_sem); + printk(KERN_WARNING "%s: %s failed to populate directory %s\n", + __FILE__, __FUNCTION__, parent->d_name.name); + return -ENOMEM; +} + +static int +__rpc_mkdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode; + + inode = rpc_get_inode(dir->i_sb, S_IFDIR | S_IRUSR | S_IXUSR); + if (!inode) + goto out_err; + inode->i_ino = iunique(dir->i_sb, 100); + d_instantiate(dentry, inode); + dir->i_nlink++; + inode_dir_notify(dir, DN_CREATE); + rpc_get_mount(); + return 0; +out_err: + printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %s\n", + __FILE__, __FUNCTION__, dentry->d_name.name); + return -ENOMEM; +} + +static int +__rpc_rmdir(struct inode *dir, struct dentry *dentry) +{ + int error; + + shrink_dcache_parent(dentry); + if (dentry->d_inode) { + rpc_close_pipes(dentry->d_inode); + rpc_inode_setowner(dentry->d_inode, NULL); + } + if ((error = simple_rmdir(dir, dentry)) != 0) + return error; + if (!error) { + inode_dir_notify(dir, DN_DELETE); + d_drop(dentry); + rpc_put_mount(); + } + return 0; +} + +static struct dentry * +rpc_lookup_negative(char *path, struct nameidata *nd) +{ + struct dentry *dentry; + struct inode *dir; + int error; + + if ((error = rpc_lookup_parent(path, nd)) != 0) + return ERR_PTR(error); + dir = nd->dentry->d_inode; + down(&dir->i_sem); + dentry = lookup_hash(&nd->last, nd->dentry); + if (IS_ERR(dentry)) + goto out_err; + if (dentry->d_inode) { + dput(dentry); + dentry = ERR_PTR(-EEXIST); + goto out_err; + } + return dentry; +out_err: + up(&dir->i_sem); + rpc_release_path(nd); + return dentry; +} + + +struct dentry * +rpc_mkdir(char *path, struct rpc_clnt *rpc_client) +{ + struct nameidata nd; + struct dentry *dentry; + struct inode *dir; + int error; + + dentry = rpc_lookup_negative(path, &nd); + if (IS_ERR(dentry)) + return dentry; + dir = nd.dentry->d_inode; + if ((error = __rpc_mkdir(dir, dentry)) != 0) + goto err_dput; + RPC_I(dentry->d_inode)->private = rpc_client; + error = rpc_populate(dentry, authfiles, + RPCAUTH_info, RPCAUTH_EOF); + if (error) + goto err_depopulate; +out: + up(&dir->i_sem); + rpc_release_path(&nd); + return dentry; +err_depopulate: + rpc_depopulate(dentry); + __rpc_rmdir(dir, dentry); +err_dput: + dput(dentry); + printk(KERN_WARNING "%s: %s() failed to create directory %s (errno = %d)\n", + __FILE__, __FUNCTION__, path, error); + dentry = ERR_PTR(error); + goto out; +} + +int +rpc_rmdir(char *path) +{ + struct nameidata nd; + struct dentry *dentry; + struct inode *dir; + int error; + + if ((error = rpc_lookup_parent(path, &nd)) != 0) + return error; + dir = nd.dentry->d_inode; + down(&dir->i_sem); + dentry = lookup_hash(&nd.last, nd.dentry); + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); + goto out_release; + } + rpc_depopulate(dentry); + error = __rpc_rmdir(dir, dentry); + dput(dentry); +out_release: + up(&dir->i_sem); + rpc_release_path(&nd); + return error; +} + +struct dentry * +rpc_mkpipe(char *path, void *private, struct rpc_pipe_ops *ops, int flags) +{ + struct nameidata nd; + struct dentry *dentry; + struct inode *dir, *inode; + struct rpc_inode *rpci; + + dentry = rpc_lookup_negative(path, &nd); + if (IS_ERR(dentry)) + return dentry; + dir = nd.dentry->d_inode; + inode = rpc_get_inode(dir->i_sb, S_IFSOCK | S_IRUSR | S_IWUSR); + if (!inode) + goto err_dput; + inode->i_ino = iunique(dir->i_sb, 100); + inode->i_fop = &rpc_pipe_fops; + d_instantiate(dentry, inode); + rpci = RPC_I(inode); + rpci->private = private; + rpci->flags = flags; + rpci->ops = ops; + inode_dir_notify(dir, DN_CREATE); +out: + up(&dir->i_sem); + rpc_release_path(&nd); + return dentry; +err_dput: + dput(dentry); + dentry = ERR_PTR(-ENOMEM); + printk(KERN_WARNING "%s: %s() failed to create pipe %s (errno = %d)\n", + __FILE__, __FUNCTION__, path, -ENOMEM); + goto out; +} + +int +rpc_unlink(char *path) +{ + struct nameidata nd; + struct dentry *dentry; + struct inode *dir; + int error; + + if ((error = rpc_lookup_parent(path, &nd)) != 0) + return error; + dir = nd.dentry->d_inode; + down(&dir->i_sem); + dentry = lookup_hash(&nd.last, nd.dentry); + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); + goto out_release; + } + d_drop(dentry); + if (dentry->d_inode) { + rpc_close_pipes(dentry->d_inode); + rpc_inode_setowner(dentry->d_inode, NULL); + error = simple_unlink(dir, dentry); + } + dput(dentry); + inode_dir_notify(dir, DN_DELETE); +out_release: + up(&dir->i_sem); + rpc_release_path(&nd); + return error; +} + +/* + * populate the filesystem + */ +static struct super_operations s_ops = { + .alloc_inode = rpc_alloc_inode, + .destroy_inode = rpc_destroy_inode, + .statfs = simple_statfs, +}; + +#define RPCAUTH_GSSMAGIC 0x67596969 + +static int +rpc_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode; + struct dentry *root; + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = RPCAUTH_GSSMAGIC; + sb->s_op = &s_ops; + sb->s_time_gran = 1; + + inode = rpc_get_inode(sb, S_IFDIR | 0755); + if (!inode) + return -ENOMEM; + root = d_alloc_root(inode); + if (!root) { + iput(inode); + return -ENOMEM; + } + if (rpc_populate(root, files, RPCAUTH_Root + 1, RPCAUTH_RootEOF)) + goto out; + sb->s_root = root; + return 0; +out: + d_genocide(root); + dput(root); + return -ENOMEM; +} + +static struct super_block * +rpc_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return get_sb_single(fs_type, flags, data, rpc_fill_super); +} + +static struct file_system_type rpc_pipe_fs_type = { + .owner = THIS_MODULE, + .name = "rpc_pipefs", + .get_sb = rpc_get_sb, + .kill_sb = kill_litter_super, +}; + +static void +init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) +{ + struct rpc_inode *rpci = (struct rpc_inode *) foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + inode_init_once(&rpci->vfs_inode); + rpci->private = NULL; + rpci->nreaders = 0; + rpci->nwriters = 0; + INIT_LIST_HEAD(&rpci->in_upcall); + INIT_LIST_HEAD(&rpci->pipe); + rpci->pipelen = 0; + init_waitqueue_head(&rpci->waitq); + INIT_WORK(&rpci->queue_timeout, rpc_timeout_upcall_queue, rpci); + rpci->ops = NULL; + } +} + +int register_rpc_pipefs(void) +{ + rpc_inode_cachep = kmem_cache_create("rpc_inode_cache", + sizeof(struct rpc_inode), + 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, + init_once, NULL); + if (!rpc_inode_cachep) + return -ENOMEM; + register_filesystem(&rpc_pipe_fs_type); + return 0; +} + +void unregister_rpc_pipefs(void) +{ + if (kmem_cache_destroy(rpc_inode_cachep)) + printk(KERN_WARNING "RPC: unable to free inode cache\n"); + unregister_filesystem(&rpc_pipe_fs_type); +} diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c new file mode 100644 index 000000000000..c06614d0e31d --- /dev/null +++ b/net/sunrpc/sched.c @@ -0,0 +1,1119 @@ +/* + * linux/net/sunrpc/sched.c + * + * Scheduling for synchronous and asynchronous RPC requests. + * + * Copyright (C) 1996 Olaf Kirch, <okir@monad.swb.de> + * + * TCP NFS related read + write fixes + * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie> + */ + +#include <linux/module.h> + +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/mempool.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/spinlock.h> + +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/xprt.h> + +#ifdef RPC_DEBUG +#define RPCDBG_FACILITY RPCDBG_SCHED +#define RPC_TASK_MAGIC_ID 0xf00baa +static int rpc_task_id; +#endif + +/* + * RPC slabs and memory pools + */ +#define RPC_BUFFER_MAXSIZE (2048) +#define RPC_BUFFER_POOLSIZE (8) +#define RPC_TASK_POOLSIZE (8) +static kmem_cache_t *rpc_task_slabp; +static kmem_cache_t *rpc_buffer_slabp; +static mempool_t *rpc_task_mempool; +static mempool_t *rpc_buffer_mempool; + +static void __rpc_default_timer(struct rpc_task *task); +static void rpciod_killall(void); +static void rpc_free(struct rpc_task *task); + +static void rpc_async_schedule(void *); + +/* + * RPC tasks that create another task (e.g. for contacting the portmapper) + * will wait on this queue for their child's completion + */ +static RPC_WAITQ(childq, "childq"); + +/* + * RPC tasks sit here while waiting for conditions to improve. + */ +static RPC_WAITQ(delay_queue, "delayq"); + +/* + * All RPC tasks are linked into this list + */ +static LIST_HEAD(all_tasks); + +/* + * rpciod-related stuff + */ +static DECLARE_MUTEX(rpciod_sema); +static unsigned int rpciod_users; +static struct workqueue_struct *rpciod_workqueue; + +/* + * Spinlock for other critical sections of code. + */ +static DEFINE_SPINLOCK(rpc_sched_lock); + +/* + * Disable the timer for a given RPC task. Should be called with + * queue->lock and bh_disabled in order to avoid races within + * rpc_run_timer(). + */ +static inline void +__rpc_disable_timer(struct rpc_task *task) +{ + dprintk("RPC: %4d disabling timer\n", task->tk_pid); + task->tk_timeout_fn = NULL; + task->tk_timeout = 0; +} + +/* + * Run a timeout function. + * We use the callback in order to allow __rpc_wake_up_task() + * and friends to disable the timer synchronously on SMP systems + * without calling del_timer_sync(). The latter could cause a + * deadlock if called while we're holding spinlocks... + */ +static void rpc_run_timer(struct rpc_task *task) +{ + void (*callback)(struct rpc_task *); + + callback = task->tk_timeout_fn; + task->tk_timeout_fn = NULL; + if (callback && RPC_IS_QUEUED(task)) { + dprintk("RPC: %4d running timer\n", task->tk_pid); + callback(task); + } + smp_mb__before_clear_bit(); + clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate); + smp_mb__after_clear_bit(); +} + +/* + * Set up a timer for the current task. + */ +static inline void +__rpc_add_timer(struct rpc_task *task, rpc_action timer) +{ + if (!task->tk_timeout) + return; + + dprintk("RPC: %4d setting alarm for %lu ms\n", + task->tk_pid, task->tk_timeout * 1000 / HZ); + + if (timer) + task->tk_timeout_fn = timer; + else + task->tk_timeout_fn = __rpc_default_timer; + set_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate); + mod_timer(&task->tk_timer, jiffies + task->tk_timeout); +} + +/* + * Delete any timer for the current task. Because we use del_timer_sync(), + * this function should never be called while holding queue->lock. + */ +static void +rpc_delete_timer(struct rpc_task *task) +{ + if (RPC_IS_QUEUED(task)) + return; + if (test_and_clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate)) { + del_singleshot_timer_sync(&task->tk_timer); + dprintk("RPC: %4d deleting timer\n", task->tk_pid); + } +} + +/* + * Add new request to a priority queue. + */ +static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, struct rpc_task *task) +{ + struct list_head *q; + struct rpc_task *t; + + INIT_LIST_HEAD(&task->u.tk_wait.links); + q = &queue->tasks[task->tk_priority]; + if (unlikely(task->tk_priority > queue->maxpriority)) + q = &queue->tasks[queue->maxpriority]; + list_for_each_entry(t, q, u.tk_wait.list) { + if (t->tk_cookie == task->tk_cookie) { + list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links); + return; + } + } + list_add_tail(&task->u.tk_wait.list, q); +} + +/* + * Add new request to wait queue. + * + * Swapper tasks always get inserted at the head of the queue. + * This should avoid many nasty memory deadlocks and hopefully + * improve overall performance. + * Everyone else gets appended to the queue to ensure proper FIFO behavior. + */ +static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) +{ + BUG_ON (RPC_IS_QUEUED(task)); + + if (RPC_IS_PRIORITY(queue)) + __rpc_add_wait_queue_priority(queue, task); + else if (RPC_IS_SWAPPER(task)) + list_add(&task->u.tk_wait.list, &queue->tasks[0]); + else + list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]); + task->u.tk_wait.rpc_waitq = queue; + rpc_set_queued(task); + + dprintk("RPC: %4d added to queue %p \"%s\"\n", + task->tk_pid, queue, rpc_qname(queue)); +} + +/* + * Remove request from a priority queue. + */ +static void __rpc_remove_wait_queue_priority(struct rpc_task *task) +{ + struct rpc_task *t; + + if (!list_empty(&task->u.tk_wait.links)) { + t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list); + list_move(&t->u.tk_wait.list, &task->u.tk_wait.list); + list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links); + } + list_del(&task->u.tk_wait.list); +} + +/* + * Remove request from queue. + * Note: must be called with spin lock held. + */ +static void __rpc_remove_wait_queue(struct rpc_task *task) +{ + struct rpc_wait_queue *queue; + queue = task->u.tk_wait.rpc_waitq; + + if (RPC_IS_PRIORITY(queue)) + __rpc_remove_wait_queue_priority(task); + else + list_del(&task->u.tk_wait.list); + dprintk("RPC: %4d removed from queue %p \"%s\"\n", + task->tk_pid, queue, rpc_qname(queue)); +} + +static inline void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority) +{ + queue->priority = priority; + queue->count = 1 << (priority * 2); +} + +static inline void rpc_set_waitqueue_cookie(struct rpc_wait_queue *queue, unsigned long cookie) +{ + queue->cookie = cookie; + queue->nr = RPC_BATCH_COUNT; +} + +static inline void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue) +{ + rpc_set_waitqueue_priority(queue, queue->maxpriority); + rpc_set_waitqueue_cookie(queue, 0); +} + +static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname, int maxprio) +{ + int i; + + spin_lock_init(&queue->lock); + for (i = 0; i < ARRAY_SIZE(queue->tasks); i++) + INIT_LIST_HEAD(&queue->tasks[i]); + queue->maxpriority = maxprio; + rpc_reset_waitqueue_priority(queue); +#ifdef RPC_DEBUG + queue->name = qname; +#endif +} + +void rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname) +{ + __rpc_init_priority_wait_queue(queue, qname, RPC_PRIORITY_HIGH); +} + +void rpc_init_wait_queue(struct rpc_wait_queue *queue, const char *qname) +{ + __rpc_init_priority_wait_queue(queue, qname, 0); +} +EXPORT_SYMBOL(rpc_init_wait_queue); + +/* + * Make an RPC task runnable. + * + * Note: If the task is ASYNC, this must be called with + * the spinlock held to protect the wait queue operation. + */ +static void rpc_make_runnable(struct rpc_task *task) +{ + int do_ret; + + BUG_ON(task->tk_timeout_fn); + do_ret = rpc_test_and_set_running(task); + rpc_clear_queued(task); + if (do_ret) + return; + if (RPC_IS_ASYNC(task)) { + int status; + + INIT_WORK(&task->u.tk_work, rpc_async_schedule, (void *)task); + status = queue_work(task->tk_workqueue, &task->u.tk_work); + if (status < 0) { + printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status); + task->tk_status = status; + return; + } + } else + wake_up(&task->u.tk_wait.waitq); +} + +/* + * Place a newly initialized task on the workqueue. + */ +static inline void +rpc_schedule_run(struct rpc_task *task) +{ + /* Don't run a child twice! */ + if (RPC_IS_ACTIVATED(task)) + return; + task->tk_active = 1; + rpc_make_runnable(task); +} + +/* + * Prepare for sleeping on a wait queue. + * By always appending tasks to the list we ensure FIFO behavior. + * NB: An RPC task will only receive interrupt-driven events as long + * as it's on a wait queue. + */ +static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, + rpc_action action, rpc_action timer) +{ + dprintk("RPC: %4d sleep_on(queue \"%s\" time %ld)\n", task->tk_pid, + rpc_qname(q), jiffies); + + if (!RPC_IS_ASYNC(task) && !RPC_IS_ACTIVATED(task)) { + printk(KERN_ERR "RPC: Inactive synchronous task put to sleep!\n"); + return; + } + + /* Mark the task as being activated if so needed */ + if (!RPC_IS_ACTIVATED(task)) + task->tk_active = 1; + + __rpc_add_wait_queue(q, task); + + BUG_ON(task->tk_callback != NULL); + task->tk_callback = action; + __rpc_add_timer(task, timer); +} + +void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, + rpc_action action, rpc_action timer) +{ + /* + * Protect the queue operations. + */ + spin_lock_bh(&q->lock); + __rpc_sleep_on(q, task, action, timer); + spin_unlock_bh(&q->lock); +} + +/** + * __rpc_do_wake_up_task - wake up a single rpc_task + * @task: task to be woken up + * + * Caller must hold queue->lock, and have cleared the task queued flag. + */ +static void __rpc_do_wake_up_task(struct rpc_task *task) +{ + dprintk("RPC: %4d __rpc_wake_up_task (now %ld)\n", task->tk_pid, jiffies); + +#ifdef RPC_DEBUG + BUG_ON(task->tk_magic != RPC_TASK_MAGIC_ID); +#endif + /* Has the task been executed yet? If not, we cannot wake it up! */ + if (!RPC_IS_ACTIVATED(task)) { + printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task); + return; + } + + __rpc_disable_timer(task); + __rpc_remove_wait_queue(task); + + rpc_make_runnable(task); + + dprintk("RPC: __rpc_wake_up_task done\n"); +} + +/* + * Wake up the specified task + */ +static void __rpc_wake_up_task(struct rpc_task *task) +{ + if (rpc_start_wakeup(task)) { + if (RPC_IS_QUEUED(task)) + __rpc_do_wake_up_task(task); + rpc_finish_wakeup(task); + } +} + +/* + * Default timeout handler if none specified by user + */ +static void +__rpc_default_timer(struct rpc_task *task) +{ + dprintk("RPC: %d timeout (default timer)\n", task->tk_pid); + task->tk_status = -ETIMEDOUT; + rpc_wake_up_task(task); +} + +/* + * Wake up the specified task + */ +void rpc_wake_up_task(struct rpc_task *task) +{ + if (rpc_start_wakeup(task)) { + if (RPC_IS_QUEUED(task)) { + struct rpc_wait_queue *queue = task->u.tk_wait.rpc_waitq; + + spin_lock_bh(&queue->lock); + __rpc_do_wake_up_task(task); + spin_unlock_bh(&queue->lock); + } + rpc_finish_wakeup(task); + } +} + +/* + * Wake up the next task on a priority queue. + */ +static struct rpc_task * __rpc_wake_up_next_priority(struct rpc_wait_queue *queue) +{ + struct list_head *q; + struct rpc_task *task; + + /* + * Service a batch of tasks from a single cookie. + */ + q = &queue->tasks[queue->priority]; + if (!list_empty(q)) { + task = list_entry(q->next, struct rpc_task, u.tk_wait.list); + if (queue->cookie == task->tk_cookie) { + if (--queue->nr) + goto out; + list_move_tail(&task->u.tk_wait.list, q); + } + /* + * Check if we need to switch queues. + */ + if (--queue->count) + goto new_cookie; + } + + /* + * Service the next queue. + */ + do { + if (q == &queue->tasks[0]) + q = &queue->tasks[queue->maxpriority]; + else + q = q - 1; + if (!list_empty(q)) { + task = list_entry(q->next, struct rpc_task, u.tk_wait.list); + goto new_queue; + } + } while (q != &queue->tasks[queue->priority]); + + rpc_reset_waitqueue_priority(queue); + return NULL; + +new_queue: + rpc_set_waitqueue_priority(queue, (unsigned int)(q - &queue->tasks[0])); +new_cookie: + rpc_set_waitqueue_cookie(queue, task->tk_cookie); +out: + __rpc_wake_up_task(task); + return task; +} + +/* + * Wake up the next task on the wait queue. + */ +struct rpc_task * rpc_wake_up_next(struct rpc_wait_queue *queue) +{ + struct rpc_task *task = NULL; + + dprintk("RPC: wake_up_next(%p \"%s\")\n", queue, rpc_qname(queue)); + spin_lock_bh(&queue->lock); + if (RPC_IS_PRIORITY(queue)) + task = __rpc_wake_up_next_priority(queue); + else { + task_for_first(task, &queue->tasks[0]) + __rpc_wake_up_task(task); + } + spin_unlock_bh(&queue->lock); + + return task; +} + +/** + * rpc_wake_up - wake up all rpc_tasks + * @queue: rpc_wait_queue on which the tasks are sleeping + * + * Grabs queue->lock + */ +void rpc_wake_up(struct rpc_wait_queue *queue) +{ + struct rpc_task *task; + + struct list_head *head; + spin_lock_bh(&queue->lock); + head = &queue->tasks[queue->maxpriority]; + for (;;) { + while (!list_empty(head)) { + task = list_entry(head->next, struct rpc_task, u.tk_wait.list); + __rpc_wake_up_task(task); + } + if (head == &queue->tasks[0]) + break; + head--; + } + spin_unlock_bh(&queue->lock); +} + +/** + * rpc_wake_up_status - wake up all rpc_tasks and set their status value. + * @queue: rpc_wait_queue on which the tasks are sleeping + * @status: status value to set + * + * Grabs queue->lock + */ +void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) +{ + struct list_head *head; + struct rpc_task *task; + + spin_lock_bh(&queue->lock); + head = &queue->tasks[queue->maxpriority]; + for (;;) { + while (!list_empty(head)) { + task = list_entry(head->next, struct rpc_task, u.tk_wait.list); + task->tk_status = status; + __rpc_wake_up_task(task); + } + if (head == &queue->tasks[0]) + break; + head--; + } + spin_unlock_bh(&queue->lock); +} + +/* + * Run a task at a later time + */ +static void __rpc_atrun(struct rpc_task *); +void +rpc_delay(struct rpc_task *task, unsigned long delay) +{ + task->tk_timeout = delay; + rpc_sleep_on(&delay_queue, task, NULL, __rpc_atrun); +} + +static void +__rpc_atrun(struct rpc_task *task) +{ + task->tk_status = 0; + rpc_wake_up_task(task); +} + +/* + * This is the RPC `scheduler' (or rather, the finite state machine). + */ +static int __rpc_execute(struct rpc_task *task) +{ + int status = 0; + + dprintk("RPC: %4d rpc_execute flgs %x\n", + task->tk_pid, task->tk_flags); + + BUG_ON(RPC_IS_QUEUED(task)); + + restarted: + while (1) { + /* + * Garbage collection of pending timers... + */ + rpc_delete_timer(task); + + /* + * Execute any pending callback. + */ + if (RPC_DO_CALLBACK(task)) { + /* Define a callback save pointer */ + void (*save_callback)(struct rpc_task *); + + /* + * If a callback exists, save it, reset it, + * call it. + * The save is needed to stop from resetting + * another callback set within the callback handler + * - Dave + */ + save_callback=task->tk_callback; + task->tk_callback=NULL; + lock_kernel(); + save_callback(task); + unlock_kernel(); + } + + /* + * Perform the next FSM step. + * tk_action may be NULL when the task has been killed + * by someone else. + */ + if (!RPC_IS_QUEUED(task)) { + if (!task->tk_action) + break; + lock_kernel(); + task->tk_action(task); + unlock_kernel(); + } + + /* + * Lockless check for whether task is sleeping or not. + */ + if (!RPC_IS_QUEUED(task)) + continue; + rpc_clear_running(task); + if (RPC_IS_ASYNC(task)) { + /* Careful! we may have raced... */ + if (RPC_IS_QUEUED(task)) + return 0; + if (rpc_test_and_set_running(task)) + return 0; + continue; + } + + /* sync task: sleep here */ + dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid); + if (RPC_TASK_UNINTERRUPTIBLE(task)) { + __wait_event(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task)); + } else { + __wait_event_interruptible(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task), status); + /* + * When a sync task receives a signal, it exits with + * -ERESTARTSYS. In order to catch any callbacks that + * clean up after sleeping on some queue, we don't + * break the loop here, but go around once more. + */ + if (status == -ERESTARTSYS) { + dprintk("RPC: %4d got signal\n", task->tk_pid); + task->tk_flags |= RPC_TASK_KILLED; + rpc_exit(task, -ERESTARTSYS); + rpc_wake_up_task(task); + } + } + rpc_set_running(task); + dprintk("RPC: %4d sync task resuming\n", task->tk_pid); + } + + if (task->tk_exit) { + lock_kernel(); + task->tk_exit(task); + unlock_kernel(); + /* If tk_action is non-null, the user wants us to restart */ + if (task->tk_action) { + if (!RPC_ASSASSINATED(task)) { + /* Release RPC slot and buffer memory */ + if (task->tk_rqstp) + xprt_release(task); + rpc_free(task); + goto restarted; + } + printk(KERN_ERR "RPC: dead task tries to walk away.\n"); + } + } + + dprintk("RPC: %4d exit() = %d\n", task->tk_pid, task->tk_status); + status = task->tk_status; + + /* Release all resources associated with the task */ + rpc_release_task(task); + return status; +} + +/* + * User-visible entry point to the scheduler. + * + * This may be called recursively if e.g. an async NFS task updates + * the attributes and finds that dirty pages must be flushed. + * NOTE: Upon exit of this function the task is guaranteed to be + * released. In particular note that tk_release() will have + * been called, so your task memory may have been freed. + */ +int +rpc_execute(struct rpc_task *task) +{ + BUG_ON(task->tk_active); + + task->tk_active = 1; + rpc_set_running(task); + return __rpc_execute(task); +} + +static void rpc_async_schedule(void *arg) +{ + __rpc_execute((struct rpc_task *)arg); +} + +/* + * Allocate memory for RPC purposes. + * + * We try to ensure that some NFS reads and writes can always proceed + * by using a mempool when allocating 'small' buffers. + * In order to avoid memory starvation triggering more writebacks of + * NFS requests, we use GFP_NOFS rather than GFP_KERNEL. + */ +void * +rpc_malloc(struct rpc_task *task, size_t size) +{ + int gfp; + + if (task->tk_flags & RPC_TASK_SWAPPER) + gfp = GFP_ATOMIC; + else + gfp = GFP_NOFS; + + if (size > RPC_BUFFER_MAXSIZE) { + task->tk_buffer = kmalloc(size, gfp); + if (task->tk_buffer) + task->tk_bufsize = size; + } else { + task->tk_buffer = mempool_alloc(rpc_buffer_mempool, gfp); + if (task->tk_buffer) + task->tk_bufsize = RPC_BUFFER_MAXSIZE; + } + return task->tk_buffer; +} + +static void +rpc_free(struct rpc_task *task) +{ + if (task->tk_buffer) { + if (task->tk_bufsize == RPC_BUFFER_MAXSIZE) + mempool_free(task->tk_buffer, rpc_buffer_mempool); + else + kfree(task->tk_buffer); + task->tk_buffer = NULL; + task->tk_bufsize = 0; + } +} + +/* + * Creation and deletion of RPC task structures + */ +void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, rpc_action callback, int flags) +{ + memset(task, 0, sizeof(*task)); + init_timer(&task->tk_timer); + task->tk_timer.data = (unsigned long) task; + task->tk_timer.function = (void (*)(unsigned long)) rpc_run_timer; + task->tk_client = clnt; + task->tk_flags = flags; + task->tk_exit = callback; + + /* Initialize retry counters */ + task->tk_garb_retry = 2; + task->tk_cred_retry = 2; + + task->tk_priority = RPC_PRIORITY_NORMAL; + task->tk_cookie = (unsigned long)current; + + /* Initialize workqueue for async tasks */ + task->tk_workqueue = rpciod_workqueue; + if (!RPC_IS_ASYNC(task)) + init_waitqueue_head(&task->u.tk_wait.waitq); + + if (clnt) { + atomic_inc(&clnt->cl_users); + if (clnt->cl_softrtry) + task->tk_flags |= RPC_TASK_SOFT; + if (!clnt->cl_intr) + task->tk_flags |= RPC_TASK_NOINTR; + } + +#ifdef RPC_DEBUG + task->tk_magic = RPC_TASK_MAGIC_ID; + task->tk_pid = rpc_task_id++; +#endif + /* Add to global list of all tasks */ + spin_lock(&rpc_sched_lock); + list_add_tail(&task->tk_task, &all_tasks); + spin_unlock(&rpc_sched_lock); + + dprintk("RPC: %4d new task procpid %d\n", task->tk_pid, + current->pid); +} + +static struct rpc_task * +rpc_alloc_task(void) +{ + return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS); +} + +static void +rpc_default_free_task(struct rpc_task *task) +{ + dprintk("RPC: %4d freeing task\n", task->tk_pid); + mempool_free(task, rpc_task_mempool); +} + +/* + * Create a new task for the specified client. We have to + * clean up after an allocation failure, as the client may + * have specified "oneshot". + */ +struct rpc_task * +rpc_new_task(struct rpc_clnt *clnt, rpc_action callback, int flags) +{ + struct rpc_task *task; + + task = rpc_alloc_task(); + if (!task) + goto cleanup; + + rpc_init_task(task, clnt, callback, flags); + + /* Replace tk_release */ + task->tk_release = rpc_default_free_task; + + dprintk("RPC: %4d allocated task\n", task->tk_pid); + task->tk_flags |= RPC_TASK_DYNAMIC; +out: + return task; + +cleanup: + /* Check whether to release the client */ + if (clnt) { + printk("rpc_new_task: failed, users=%d, oneshot=%d\n", + atomic_read(&clnt->cl_users), clnt->cl_oneshot); + atomic_inc(&clnt->cl_users); /* pretend we were used ... */ + rpc_release_client(clnt); + } + goto out; +} + +void rpc_release_task(struct rpc_task *task) +{ + dprintk("RPC: %4d release task\n", task->tk_pid); + +#ifdef RPC_DEBUG + BUG_ON(task->tk_magic != RPC_TASK_MAGIC_ID); +#endif + + /* Remove from global task list */ + spin_lock(&rpc_sched_lock); + list_del(&task->tk_task); + spin_unlock(&rpc_sched_lock); + + BUG_ON (RPC_IS_QUEUED(task)); + task->tk_active = 0; + + /* Synchronously delete any running timer */ + rpc_delete_timer(task); + + /* Release resources */ + if (task->tk_rqstp) + xprt_release(task); + if (task->tk_msg.rpc_cred) + rpcauth_unbindcred(task); + rpc_free(task); + if (task->tk_client) { + rpc_release_client(task->tk_client); + task->tk_client = NULL; + } + +#ifdef RPC_DEBUG + task->tk_magic = 0; +#endif + if (task->tk_release) + task->tk_release(task); +} + +/** + * rpc_find_parent - find the parent of a child task. + * @child: child task + * + * Checks that the parent task is still sleeping on the + * queue 'childq'. If so returns a pointer to the parent. + * Upon failure returns NULL. + * + * Caller must hold childq.lock + */ +static inline struct rpc_task *rpc_find_parent(struct rpc_task *child) +{ + struct rpc_task *task, *parent; + struct list_head *le; + + parent = (struct rpc_task *) child->tk_calldata; + task_for_each(task, le, &childq.tasks[0]) + if (task == parent) + return parent; + + return NULL; +} + +static void rpc_child_exit(struct rpc_task *child) +{ + struct rpc_task *parent; + + spin_lock_bh(&childq.lock); + if ((parent = rpc_find_parent(child)) != NULL) { + parent->tk_status = child->tk_status; + __rpc_wake_up_task(parent); + } + spin_unlock_bh(&childq.lock); +} + +/* + * Note: rpc_new_task releases the client after a failure. + */ +struct rpc_task * +rpc_new_child(struct rpc_clnt *clnt, struct rpc_task *parent) +{ + struct rpc_task *task; + + task = rpc_new_task(clnt, NULL, RPC_TASK_ASYNC | RPC_TASK_CHILD); + if (!task) + goto fail; + task->tk_exit = rpc_child_exit; + task->tk_calldata = parent; + return task; + +fail: + parent->tk_status = -ENOMEM; + return NULL; +} + +void rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func) +{ + spin_lock_bh(&childq.lock); + /* N.B. Is it possible for the child to have already finished? */ + __rpc_sleep_on(&childq, task, func, NULL); + rpc_schedule_run(child); + spin_unlock_bh(&childq.lock); +} + +/* + * Kill all tasks for the given client. + * XXX: kill their descendants as well? + */ +void rpc_killall_tasks(struct rpc_clnt *clnt) +{ + struct rpc_task *rovr; + struct list_head *le; + + dprintk("RPC: killing all tasks for client %p\n", clnt); + + /* + * Spin lock all_tasks to prevent changes... + */ + spin_lock(&rpc_sched_lock); + alltask_for_each(rovr, le, &all_tasks) { + if (! RPC_IS_ACTIVATED(rovr)) + continue; + if (!clnt || rovr->tk_client == clnt) { + rovr->tk_flags |= RPC_TASK_KILLED; + rpc_exit(rovr, -EIO); + rpc_wake_up_task(rovr); + } + } + spin_unlock(&rpc_sched_lock); +} + +static DECLARE_MUTEX_LOCKED(rpciod_running); + +static void rpciod_killall(void) +{ + unsigned long flags; + + while (!list_empty(&all_tasks)) { + clear_thread_flag(TIF_SIGPENDING); + rpc_killall_tasks(NULL); + flush_workqueue(rpciod_workqueue); + if (!list_empty(&all_tasks)) { + dprintk("rpciod_killall: waiting for tasks to exit\n"); + yield(); + } + } + + spin_lock_irqsave(¤t->sighand->siglock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); +} + +/* + * Start up the rpciod process if it's not already running. + */ +int +rpciod_up(void) +{ + struct workqueue_struct *wq; + int error = 0; + + down(&rpciod_sema); + dprintk("rpciod_up: users %d\n", rpciod_users); + rpciod_users++; + if (rpciod_workqueue) + goto out; + /* + * If there's no pid, we should be the first user. + */ + if (rpciod_users > 1) + printk(KERN_WARNING "rpciod_up: no workqueue, %d users??\n", rpciod_users); + /* + * Create the rpciod thread and wait for it to start. + */ + error = -ENOMEM; + wq = create_workqueue("rpciod"); + if (wq == NULL) { + printk(KERN_WARNING "rpciod_up: create workqueue failed, error=%d\n", error); + rpciod_users--; + goto out; + } + rpciod_workqueue = wq; + error = 0; +out: + up(&rpciod_sema); + return error; +} + +void +rpciod_down(void) +{ + down(&rpciod_sema); + dprintk("rpciod_down sema %d\n", rpciod_users); + if (rpciod_users) { + if (--rpciod_users) + goto out; + } else + printk(KERN_WARNING "rpciod_down: no users??\n"); + + if (!rpciod_workqueue) { + dprintk("rpciod_down: Nothing to do!\n"); + goto out; + } + rpciod_killall(); + + destroy_workqueue(rpciod_workqueue); + rpciod_workqueue = NULL; + out: + up(&rpciod_sema); +} + +#ifdef RPC_DEBUG +void rpc_show_tasks(void) +{ + struct list_head *le; + struct rpc_task *t; + + spin_lock(&rpc_sched_lock); + if (list_empty(&all_tasks)) { + spin_unlock(&rpc_sched_lock); + return; + } + printk("-pid- proc flgs status -client- -prog- --rqstp- -timeout " + "-rpcwait -action- --exit--\n"); + alltask_for_each(t, le, &all_tasks) { + const char *rpc_waitq = "none"; + + if (RPC_IS_QUEUED(t)) + rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq); + + printk("%05d %04d %04x %06d %8p %6d %8p %08ld %8s %8p %8p\n", + t->tk_pid, + (t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1), + t->tk_flags, t->tk_status, + t->tk_client, + (t->tk_client ? t->tk_client->cl_prog : 0), + t->tk_rqstp, t->tk_timeout, + rpc_waitq, + t->tk_action, t->tk_exit); + } + spin_unlock(&rpc_sched_lock); +} +#endif + +void +rpc_destroy_mempool(void) +{ + if (rpc_buffer_mempool) + mempool_destroy(rpc_buffer_mempool); + if (rpc_task_mempool) + mempool_destroy(rpc_task_mempool); + if (rpc_task_slabp && kmem_cache_destroy(rpc_task_slabp)) + printk(KERN_INFO "rpc_task: not all structures were freed\n"); + if (rpc_buffer_slabp && kmem_cache_destroy(rpc_buffer_slabp)) + printk(KERN_INFO "rpc_buffers: not all structures were freed\n"); +} + +int +rpc_init_mempool(void) +{ + rpc_task_slabp = kmem_cache_create("rpc_tasks", + sizeof(struct rpc_task), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!rpc_task_slabp) + goto err_nomem; + rpc_buffer_slabp = kmem_cache_create("rpc_buffers", + RPC_BUFFER_MAXSIZE, + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!rpc_buffer_slabp) + goto err_nomem; + rpc_task_mempool = mempool_create(RPC_TASK_POOLSIZE, + mempool_alloc_slab, + mempool_free_slab, + rpc_task_slabp); + if (!rpc_task_mempool) + goto err_nomem; + rpc_buffer_mempool = mempool_create(RPC_BUFFER_POOLSIZE, + mempool_alloc_slab, + mempool_free_slab, + rpc_buffer_slabp); + if (!rpc_buffer_mempool) + goto err_nomem; + return 0; +err_nomem: + rpc_destroy_mempool(); + return -ENOMEM; +} diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c new file mode 100644 index 000000000000..9b67dc19944c --- /dev/null +++ b/net/sunrpc/stats.c @@ -0,0 +1,175 @@ +/* + * linux/net/sunrpc/stats.c + * + * procfs-based user access to generic RPC statistics. The stats files + * reside in /proc/net/rpc. + * + * The read routines assume that the buffer passed in is just big enough. + * If you implement an RPC service that has its own stats routine which + * appends the generic RPC stats, make sure you don't exceed the PAGE_SIZE + * limit. + * + * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/module.h> + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/svcsock.h> + +#define RPCDBG_FACILITY RPCDBG_MISC + +struct proc_dir_entry *proc_net_rpc = NULL; + +/* + * Get RPC client stats + */ +static int rpc_proc_show(struct seq_file *seq, void *v) { + const struct rpc_stat *statp = seq->private; + const struct rpc_program *prog = statp->program; + int i, j; + + seq_printf(seq, + "net %d %d %d %d\n", + statp->netcnt, + statp->netudpcnt, + statp->nettcpcnt, + statp->nettcpconn); + seq_printf(seq, + "rpc %d %d %d\n", + statp->rpccnt, + statp->rpcretrans, + statp->rpcauthrefresh); + + for (i = 0; i < prog->nrvers; i++) { + const struct rpc_version *vers = prog->version[i]; + if (!vers) + continue; + seq_printf(seq, "proc%d %d", + vers->number, vers->nrprocs); + for (j = 0; j < vers->nrprocs; j++) + seq_printf(seq, " %d", + vers->procs[j].p_count); + seq_putc(seq, '\n'); + } + return 0; +} + +static int rpc_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, rpc_proc_show, PDE(inode)->data); +} + +static struct file_operations rpc_proc_fops = { + .owner = THIS_MODULE, + .open = rpc_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * Get RPC server stats + */ +void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) { + const struct svc_program *prog = statp->program; + const struct svc_procedure *proc; + const struct svc_version *vers; + int i, j; + + seq_printf(seq, + "net %d %d %d %d\n", + statp->netcnt, + statp->netudpcnt, + statp->nettcpcnt, + statp->nettcpconn); + seq_printf(seq, + "rpc %d %d %d %d %d\n", + statp->rpccnt, + statp->rpcbadfmt+statp->rpcbadauth+statp->rpcbadclnt, + statp->rpcbadfmt, + statp->rpcbadauth, + statp->rpcbadclnt); + + for (i = 0; i < prog->pg_nvers; i++) { + if (!(vers = prog->pg_vers[i]) || !(proc = vers->vs_proc)) + continue; + seq_printf(seq, "proc%d %d", i, vers->vs_nproc); + for (j = 0; j < vers->vs_nproc; j++, proc++) + seq_printf(seq, " %d", proc->pc_count); + seq_putc(seq, '\n'); + } +} + +/* + * Register/unregister RPC proc files + */ +static inline struct proc_dir_entry * +do_register(const char *name, void *data, struct file_operations *fops) +{ + struct proc_dir_entry *ent; + + rpc_proc_init(); + dprintk("RPC: registering /proc/net/rpc/%s\n", name); + + ent = create_proc_entry(name, 0, proc_net_rpc); + if (ent) { + ent->proc_fops = fops; + ent->data = data; + } + return ent; +} + +struct proc_dir_entry * +rpc_proc_register(struct rpc_stat *statp) +{ + return do_register(statp->program->name, statp, &rpc_proc_fops); +} + +void +rpc_proc_unregister(const char *name) +{ + remove_proc_entry(name, proc_net_rpc); +} + +struct proc_dir_entry * +svc_proc_register(struct svc_stat *statp, struct file_operations *fops) +{ + return do_register(statp->program->pg_name, statp, fops); +} + +void +svc_proc_unregister(const char *name) +{ + remove_proc_entry(name, proc_net_rpc); +} + +void +rpc_proc_init(void) +{ + dprintk("RPC: registering /proc/net/rpc\n"); + if (!proc_net_rpc) { + struct proc_dir_entry *ent; + ent = proc_mkdir("rpc", proc_net); + if (ent) { + ent->owner = THIS_MODULE; + proc_net_rpc = ent; + } + } +} + +void +rpc_proc_exit(void) +{ + dprintk("RPC: unregistering /proc/net/rpc\n"); + if (proc_net_rpc) { + proc_net_rpc = NULL; + remove_proc_entry("net/rpc", NULL); + } +} + diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c new file mode 100644 index 000000000000..d4f26bf9e732 --- /dev/null +++ b/net/sunrpc/sunrpc_syms.c @@ -0,0 +1,185 @@ +/* + * linux/net/sunrpc/sunrpc_syms.c + * + * Symbols exported by the sunrpc module. + * + * Copyright (C) 1997 Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/config.h> +#include <linux/module.h> + +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sched.h> +#include <linux/uio.h> +#include <linux/unistd.h> +#include <linux/init.h> + +#include <linux/sunrpc/sched.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/svc.h> +#include <linux/sunrpc/svcsock.h> +#include <linux/sunrpc/auth.h> +#include <linux/workqueue.h> +#include <linux/sunrpc/rpc_pipe_fs.h> + + +/* RPC scheduler */ +EXPORT_SYMBOL(rpc_execute); +EXPORT_SYMBOL(rpc_init_task); +EXPORT_SYMBOL(rpc_sleep_on); +EXPORT_SYMBOL(rpc_wake_up_next); +EXPORT_SYMBOL(rpc_wake_up_task); +EXPORT_SYMBOL(rpc_new_child); +EXPORT_SYMBOL(rpc_run_child); +EXPORT_SYMBOL(rpciod_down); +EXPORT_SYMBOL(rpciod_up); +EXPORT_SYMBOL(rpc_new_task); +EXPORT_SYMBOL(rpc_wake_up_status); +EXPORT_SYMBOL(rpc_release_task); + +/* RPC client functions */ +EXPORT_SYMBOL(rpc_create_client); +EXPORT_SYMBOL(rpc_clone_client); +EXPORT_SYMBOL(rpc_destroy_client); +EXPORT_SYMBOL(rpc_shutdown_client); +EXPORT_SYMBOL(rpc_release_client); +EXPORT_SYMBOL(rpc_killall_tasks); +EXPORT_SYMBOL(rpc_call_sync); +EXPORT_SYMBOL(rpc_call_async); +EXPORT_SYMBOL(rpc_call_setup); +EXPORT_SYMBOL(rpc_clnt_sigmask); +EXPORT_SYMBOL(rpc_clnt_sigunmask); +EXPORT_SYMBOL(rpc_delay); +EXPORT_SYMBOL(rpc_restart_call); +EXPORT_SYMBOL(rpc_setbufsize); +EXPORT_SYMBOL(rpc_unlink); +EXPORT_SYMBOL(rpc_wake_up); +EXPORT_SYMBOL(rpc_queue_upcall); +EXPORT_SYMBOL(rpc_mkpipe); + +/* Client transport */ +EXPORT_SYMBOL(xprt_create_proto); +EXPORT_SYMBOL(xprt_destroy); +EXPORT_SYMBOL(xprt_set_timeout); +EXPORT_SYMBOL(xprt_udp_slot_table_entries); +EXPORT_SYMBOL(xprt_tcp_slot_table_entries); + +/* Client credential cache */ +EXPORT_SYMBOL(rpcauth_register); +EXPORT_SYMBOL(rpcauth_unregister); +EXPORT_SYMBOL(rpcauth_create); +EXPORT_SYMBOL(rpcauth_lookupcred); +EXPORT_SYMBOL(rpcauth_lookup_credcache); +EXPORT_SYMBOL(rpcauth_free_credcache); +EXPORT_SYMBOL(rpcauth_init_credcache); +EXPORT_SYMBOL(put_rpccred); + +/* RPC server stuff */ +EXPORT_SYMBOL(svc_create); +EXPORT_SYMBOL(svc_create_thread); +EXPORT_SYMBOL(svc_exit_thread); +EXPORT_SYMBOL(svc_destroy); +EXPORT_SYMBOL(svc_drop); +EXPORT_SYMBOL(svc_process); +EXPORT_SYMBOL(svc_recv); +EXPORT_SYMBOL(svc_wake_up); +EXPORT_SYMBOL(svc_makesock); +EXPORT_SYMBOL(svc_reserve); +EXPORT_SYMBOL(svc_auth_register); +EXPORT_SYMBOL(auth_domain_lookup); +EXPORT_SYMBOL(svc_authenticate); +EXPORT_SYMBOL(svc_set_client); + +/* RPC statistics */ +#ifdef CONFIG_PROC_FS +EXPORT_SYMBOL(rpc_proc_register); +EXPORT_SYMBOL(rpc_proc_unregister); +EXPORT_SYMBOL(svc_proc_register); +EXPORT_SYMBOL(svc_proc_unregister); +EXPORT_SYMBOL(svc_seq_show); +#endif + +/* caching... */ +EXPORT_SYMBOL(auth_domain_find); +EXPORT_SYMBOL(auth_domain_put); +EXPORT_SYMBOL(auth_unix_add_addr); +EXPORT_SYMBOL(auth_unix_forget_old); +EXPORT_SYMBOL(auth_unix_lookup); +EXPORT_SYMBOL(cache_check); +EXPORT_SYMBOL(cache_flush); +EXPORT_SYMBOL(cache_purge); +EXPORT_SYMBOL(cache_fresh); +EXPORT_SYMBOL(cache_init); +EXPORT_SYMBOL(cache_register); +EXPORT_SYMBOL(cache_unregister); +EXPORT_SYMBOL(qword_add); +EXPORT_SYMBOL(qword_addhex); +EXPORT_SYMBOL(qword_get); +EXPORT_SYMBOL(svcauth_unix_purge); +EXPORT_SYMBOL(unix_domain_find); + +/* Generic XDR */ +EXPORT_SYMBOL(xdr_encode_string); +EXPORT_SYMBOL(xdr_decode_string); +EXPORT_SYMBOL(xdr_decode_string_inplace); +EXPORT_SYMBOL(xdr_decode_netobj); +EXPORT_SYMBOL(xdr_encode_netobj); +EXPORT_SYMBOL(xdr_encode_pages); +EXPORT_SYMBOL(xdr_inline_pages); +EXPORT_SYMBOL(xdr_shift_buf); +EXPORT_SYMBOL(xdr_buf_from_iov); +EXPORT_SYMBOL(xdr_buf_subsegment); +EXPORT_SYMBOL(xdr_buf_read_netobj); +EXPORT_SYMBOL(read_bytes_from_xdr_buf); + +/* Debugging symbols */ +#ifdef RPC_DEBUG +EXPORT_SYMBOL(rpc_debug); +EXPORT_SYMBOL(nfs_debug); +EXPORT_SYMBOL(nfsd_debug); +EXPORT_SYMBOL(nlm_debug); +#endif + +extern int register_rpc_pipefs(void); +extern void unregister_rpc_pipefs(void); + +static int __init +init_sunrpc(void) +{ + int err = register_rpc_pipefs(); + if (err) + goto out; + err = rpc_init_mempool() != 0; + if (err) + goto out; +#ifdef RPC_DEBUG + rpc_register_sysctl(); +#endif +#ifdef CONFIG_PROC_FS + rpc_proc_init(); +#endif + cache_register(&auth_domain_cache); + cache_register(&ip_map_cache); +out: + return err; +} + +static void __exit +cleanup_sunrpc(void) +{ + unregister_rpc_pipefs(); + rpc_destroy_mempool(); + cache_unregister(&auth_domain_cache); + cache_unregister(&ip_map_cache); +#ifdef RPC_DEBUG + rpc_unregister_sysctl(); +#endif +#ifdef CONFIG_PROC_FS + rpc_proc_exit(); +#endif +} +MODULE_LICENSE("GPL"); +module_init(init_sunrpc); +module_exit(cleanup_sunrpc); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c new file mode 100644 index 000000000000..bb2d99f33315 --- /dev/null +++ b/net/sunrpc/svc.c @@ -0,0 +1,490 @@ +/* + * linux/net/sunrpc/svc.c + * + * High-level RPC service routines + * + * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/linkage.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/net.h> +#include <linux/in.h> +#include <linux/mm.h> + +#include <linux/sunrpc/types.h> +#include <linux/sunrpc/xdr.h> +#include <linux/sunrpc/stats.h> +#include <linux/sunrpc/svcsock.h> +#include <linux/sunrpc/clnt.h> + +#define RPCDBG_FACILITY RPCDBG_SVCDSP +#define RPC_PARANOIA 1 + +/* + * Create an RPC service + */ +struct svc_serv * +svc_create(struct svc_program *prog, unsigned int bufsize) +{ + struct svc_serv *serv; + int vers; + unsigned int xdrsize; + + if (!(serv = (struct svc_serv *) kmalloc(sizeof(*serv), GFP_KERNEL))) + return NULL; + memset(serv, 0, sizeof(*serv)); + serv->sv_program = prog; + serv->sv_nrthreads = 1; + serv->sv_stats = prog->pg_stats; + serv->sv_bufsz = bufsize? bufsize : 4096; + prog->pg_lovers = prog->pg_nvers-1; + xdrsize = 0; + for (vers=0; vers<prog->pg_nvers ; vers++) + if (prog->pg_vers[vers]) { + prog->pg_hivers = vers; + if (prog->pg_lovers > vers) + prog->pg_lovers = vers; + if (prog->pg_vers[vers]->vs_xdrsize > xdrsize) + xdrsize = prog->pg_vers[vers]->vs_xdrsize; + } + serv->sv_xdrsize = xdrsize; + INIT_LIST_HEAD(&serv->sv_threads); + INIT_LIST_HEAD(&serv->sv_sockets); + INIT_LIST_HEAD(&serv->sv_tempsocks); + INIT_LIST_HEAD(&serv->sv_permsocks); + spin_lock_init(&serv->sv_lock); + + serv->sv_name = prog->pg_name; + + /* Remove any stale portmap registrations */ + svc_register(serv, 0, 0); + + return serv; +} + +/* + * Destroy an RPC service + */ +void +svc_destroy(struct svc_serv *serv) +{ + struct svc_sock *svsk; + + dprintk("RPC: svc_destroy(%s, %d)\n", + serv->sv_program->pg_name, + serv->sv_nrthreads); + + if (serv->sv_nrthreads) { + if (--(serv->sv_nrthreads) != 0) { + svc_sock_update_bufs(serv); + return; + } + } else + printk("svc_destroy: no threads for serv=%p!\n", serv); + + while (!list_empty(&serv->sv_tempsocks)) { + svsk = list_entry(serv->sv_tempsocks.next, + struct svc_sock, + sk_list); + svc_delete_socket(svsk); + } + while (!list_empty(&serv->sv_permsocks)) { + svsk = list_entry(serv->sv_permsocks.next, + struct svc_sock, + sk_list); + svc_delete_socket(svsk); + } + + cache_clean_deferred(serv); + + /* Unregister service with the portmapper */ + svc_register(serv, 0, 0); + kfree(serv); +} + +/* + * Allocate an RPC server's buffer space. + * We allocate pages and place them in rq_argpages. + */ +static int +svc_init_buffer(struct svc_rqst *rqstp, unsigned int size) +{ + int pages; + int arghi; + + if (size > RPCSVC_MAXPAYLOAD) + size = RPCSVC_MAXPAYLOAD; + pages = 2 + (size+ PAGE_SIZE -1) / PAGE_SIZE; + rqstp->rq_argused = 0; + rqstp->rq_resused = 0; + arghi = 0; + if (pages > RPCSVC_MAXPAGES) + BUG(); + while (pages) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) + break; + rqstp->rq_argpages[arghi++] = p; + pages--; + } + rqstp->rq_arghi = arghi; + return ! pages; +} + +/* + * Release an RPC server buffer + */ +static void +svc_release_buffer(struct svc_rqst *rqstp) +{ + while (rqstp->rq_arghi) + put_page(rqstp->rq_argpages[--rqstp->rq_arghi]); + while (rqstp->rq_resused) { + if (rqstp->rq_respages[--rqstp->rq_resused] == NULL) + continue; + put_page(rqstp->rq_respages[rqstp->rq_resused]); + } + rqstp->rq_argused = 0; +} + +/* + * Create a server thread + */ +int +svc_create_thread(svc_thread_fn func, struct svc_serv *serv) +{ + struct svc_rqst *rqstp; + int error = -ENOMEM; + + rqstp = kmalloc(sizeof(*rqstp), GFP_KERNEL); + if (!rqstp) + goto out; + + memset(rqstp, 0, sizeof(*rqstp)); + init_waitqueue_head(&rqstp->rq_wait); + + if (!(rqstp->rq_argp = (u32 *) kmalloc(serv->sv_xdrsize, GFP_KERNEL)) + || !(rqstp->rq_resp = (u32 *) kmalloc(serv->sv_xdrsize, GFP_KERNEL)) + || !svc_init_buffer(rqstp, serv->sv_bufsz)) + goto out_thread; + + serv->sv_nrthreads++; + rqstp->rq_server = serv; + error = kernel_thread((int (*)(void *)) func, rqstp, 0); + if (error < 0) + goto out_thread; + svc_sock_update_bufs(serv); + error = 0; +out: + return error; + +out_thread: + svc_exit_thread(rqstp); + goto out; +} + +/* + * Destroy an RPC server thread + */ +void +svc_exit_thread(struct svc_rqst *rqstp) +{ + struct svc_serv *serv = rqstp->rq_server; + + svc_release_buffer(rqstp); + if (rqstp->rq_resp) + kfree(rqstp->rq_resp); + if (rqstp->rq_argp) + kfree(rqstp->rq_argp); + if (rqstp->rq_auth_data) + kfree(rqstp->rq_auth_data); + kfree(rqstp); + + /* Release the server */ + if (serv) + svc_destroy(serv); +} + +/* + * Register an RPC service with the local portmapper. + * To unregister a service, call this routine with + * proto and port == 0. + */ +int +svc_register(struct svc_serv *serv, int proto, unsigned short port) +{ + struct svc_program *progp; + unsigned long flags; + int i, error = 0, dummy; + + progp = serv->sv_program; + + dprintk("RPC: svc_register(%s, %s, %d)\n", + progp->pg_name, proto == IPPROTO_UDP? "udp" : "tcp", port); + + if (!port) + clear_thread_flag(TIF_SIGPENDING); + + for (i = 0; i < progp->pg_nvers; i++) { + if (progp->pg_vers[i] == NULL) + continue; + error = rpc_register(progp->pg_prog, i, proto, port, &dummy); + if (error < 0) + break; + if (port && !dummy) { + error = -EACCES; + break; + } + } + + if (!port) { + spin_lock_irqsave(¤t->sighand->siglock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + } + + return error; +} + +/* + * Process the RPC request. + */ +int +svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) +{ + struct svc_program *progp; + struct svc_version *versp = NULL; /* compiler food */ + struct svc_procedure *procp = NULL; + struct kvec * argv = &rqstp->rq_arg.head[0]; + struct kvec * resv = &rqstp->rq_res.head[0]; + kxdrproc_t xdr; + u32 *statp; + u32 dir, prog, vers, proc, + auth_stat, rpc_stat; + int auth_res; + u32 *accept_statp; + + rpc_stat = rpc_success; + + if (argv->iov_len < 6*4) + goto err_short_len; + + /* setup response xdr_buf. + * Initially it has just one page + */ + svc_take_page(rqstp); /* must succeed */ + resv->iov_base = page_address(rqstp->rq_respages[0]); + resv->iov_len = 0; + rqstp->rq_res.pages = rqstp->rq_respages+1; + rqstp->rq_res.len = 0; + rqstp->rq_res.page_base = 0; + rqstp->rq_res.page_len = 0; + rqstp->rq_res.tail[0].iov_len = 0; + /* tcp needs a space for the record length... */ + if (rqstp->rq_prot == IPPROTO_TCP) + svc_putu32(resv, 0); + + rqstp->rq_xid = svc_getu32(argv); + svc_putu32(resv, rqstp->rq_xid); + + dir = ntohl(svc_getu32(argv)); + vers = ntohl(svc_getu32(argv)); + + /* First words of reply: */ + svc_putu32(resv, xdr_one); /* REPLY */ + + if (dir != 0) /* direction != CALL */ + goto err_bad_dir; + if (vers != 2) /* RPC version number */ + goto err_bad_rpc; + + /* Save position in case we later decide to reject: */ + accept_statp = resv->iov_base + resv->iov_len; + + svc_putu32(resv, xdr_zero); /* ACCEPT */ + + rqstp->rq_prog = prog = ntohl(svc_getu32(argv)); /* program number */ + rqstp->rq_vers = vers = ntohl(svc_getu32(argv)); /* version number */ + rqstp->rq_proc = proc = ntohl(svc_getu32(argv)); /* procedure number */ + + progp = serv->sv_program; + /* + * Decode auth data, and add verifier to reply buffer. + * We do this before anything else in order to get a decent + * auth verifier. + */ + auth_res = svc_authenticate(rqstp, &auth_stat); + /* Also give the program a chance to reject this call: */ + if (auth_res == SVC_OK) { + auth_stat = rpc_autherr_badcred; + auth_res = progp->pg_authenticate(rqstp); + } + switch (auth_res) { + case SVC_OK: + break; + case SVC_GARBAGE: + rpc_stat = rpc_garbage_args; + goto err_bad; + case SVC_SYSERR: + rpc_stat = rpc_system_err; + goto err_bad; + case SVC_DENIED: + goto err_bad_auth; + case SVC_DROP: + goto dropit; + case SVC_COMPLETE: + goto sendit; + } + + if (prog != progp->pg_prog) + goto err_bad_prog; + + if (vers >= progp->pg_nvers || + !(versp = progp->pg_vers[vers])) + goto err_bad_vers; + + procp = versp->vs_proc + proc; + if (proc >= versp->vs_nproc || !procp->pc_func) + goto err_bad_proc; + rqstp->rq_server = serv; + rqstp->rq_procinfo = procp; + + /* Syntactic check complete */ + serv->sv_stats->rpccnt++; + + /* Build the reply header. */ + statp = resv->iov_base +resv->iov_len; + svc_putu32(resv, rpc_success); /* RPC_SUCCESS */ + + /* Bump per-procedure stats counter */ + procp->pc_count++; + + /* Initialize storage for argp and resp */ + memset(rqstp->rq_argp, 0, procp->pc_argsize); + memset(rqstp->rq_resp, 0, procp->pc_ressize); + + /* un-reserve some of the out-queue now that we have a + * better idea of reply size + */ + if (procp->pc_xdrressize) + svc_reserve(rqstp, procp->pc_xdrressize<<2); + + /* Call the function that processes the request. */ + if (!versp->vs_dispatch) { + /* Decode arguments */ + xdr = procp->pc_decode; + if (xdr && !xdr(rqstp, argv->iov_base, rqstp->rq_argp)) + goto err_garbage; + + *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); + + /* Encode reply */ + if (*statp == rpc_success && (xdr = procp->pc_encode) + && !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) { + dprintk("svc: failed to encode reply\n"); + /* serv->sv_stats->rpcsystemerr++; */ + *statp = rpc_system_err; + } + } else { + dprintk("svc: calling dispatcher\n"); + if (!versp->vs_dispatch(rqstp, statp)) { + /* Release reply info */ + if (procp->pc_release) + procp->pc_release(rqstp, NULL, rqstp->rq_resp); + goto dropit; + } + } + + /* Check RPC status result */ + if (*statp != rpc_success) + resv->iov_len = ((void*)statp) - resv->iov_base + 4; + + /* Release reply info */ + if (procp->pc_release) + procp->pc_release(rqstp, NULL, rqstp->rq_resp); + + if (procp->pc_encode == NULL) + goto dropit; + + sendit: + if (svc_authorise(rqstp)) + goto dropit; + return svc_send(rqstp); + + dropit: + svc_authorise(rqstp); /* doesn't hurt to call this twice */ + dprintk("svc: svc_process dropit\n"); + svc_drop(rqstp); + return 0; + +err_short_len: +#ifdef RPC_PARANOIA + printk("svc: short len %Zd, dropping request\n", argv->iov_len); +#endif + goto dropit; /* drop request */ + +err_bad_dir: +#ifdef RPC_PARANOIA + printk("svc: bad direction %d, dropping request\n", dir); +#endif + serv->sv_stats->rpcbadfmt++; + goto dropit; /* drop request */ + +err_bad_rpc: + serv->sv_stats->rpcbadfmt++; + svc_putu32(resv, xdr_one); /* REJECT */ + svc_putu32(resv, xdr_zero); /* RPC_MISMATCH */ + svc_putu32(resv, xdr_two); /* Only RPCv2 supported */ + svc_putu32(resv, xdr_two); + goto sendit; + +err_bad_auth: + dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat)); + serv->sv_stats->rpcbadauth++; + /* Restore write pointer to location of accept status: */ + xdr_ressize_check(rqstp, accept_statp); + svc_putu32(resv, xdr_one); /* REJECT */ + svc_putu32(resv, xdr_one); /* AUTH_ERROR */ + svc_putu32(resv, auth_stat); /* status */ + goto sendit; + +err_bad_prog: +#ifdef RPC_PARANOIA + if (prog != 100227 || progp->pg_prog != 100003) + printk("svc: unknown program %d (me %d)\n", prog, progp->pg_prog); + /* else it is just a Solaris client seeing if ACLs are supported */ +#endif + serv->sv_stats->rpcbadfmt++; + svc_putu32(resv, rpc_prog_unavail); + goto sendit; + +err_bad_vers: +#ifdef RPC_PARANOIA + printk("svc: unknown version (%d)\n", vers); +#endif + serv->sv_stats->rpcbadfmt++; + svc_putu32(resv, rpc_prog_mismatch); + svc_putu32(resv, htonl(progp->pg_lovers)); + svc_putu32(resv, htonl(progp->pg_hivers)); + goto sendit; + +err_bad_proc: +#ifdef RPC_PARANOIA + printk("svc: unknown procedure (%d)\n", proc); +#endif + serv->sv_stats->rpcbadfmt++; + svc_putu32(resv, rpc_proc_unavail); + goto sendit; + +err_garbage: +#ifdef RPC_PARANOIA + printk("svc: failed to decode args\n"); +#endif + rpc_stat = rpc_garbage_args; +err_bad: + serv->sv_stats->rpcbadfmt++; + svc_putu32(resv, rpc_stat); + goto sendit; +} diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c new file mode 100644 index 000000000000..bde8147ef2db --- /dev/null +++ b/net/sunrpc/svcauth.c @@ -0,0 +1,216 @@ +/* + * linux/net/sunrpc/svcauth.c + * + * The generic interface for RPC authentication on the server side. + * + * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + * + * CHANGES + * 19-Apr-2000 Chris Evans - Security fix + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/sunrpc/types.h> +#include <linux/sunrpc/xdr.h> +#include <linux/sunrpc/svcsock.h> +#include <linux/sunrpc/svcauth.h> +#include <linux/err.h> +#include <linux/hash.h> + +#define RPCDBG_FACILITY RPCDBG_AUTH + + +/* + * Table of authenticators + */ +extern struct auth_ops svcauth_null; +extern struct auth_ops svcauth_unix; + +static DEFINE_SPINLOCK(authtab_lock); +static struct auth_ops *authtab[RPC_AUTH_MAXFLAVOR] = { + [0] = &svcauth_null, + [1] = &svcauth_unix, +}; + +int +svc_authenticate(struct svc_rqst *rqstp, u32 *authp) +{ + rpc_authflavor_t flavor; + struct auth_ops *aops; + + *authp = rpc_auth_ok; + + flavor = ntohl(svc_getu32(&rqstp->rq_arg.head[0])); + + dprintk("svc: svc_authenticate (%d)\n", flavor); + + spin_lock(&authtab_lock); + if (flavor >= RPC_AUTH_MAXFLAVOR || !(aops = authtab[flavor]) + || !try_module_get(aops->owner)) { + spin_unlock(&authtab_lock); + *authp = rpc_autherr_badcred; + return SVC_DENIED; + } + spin_unlock(&authtab_lock); + + rqstp->rq_authop = aops; + return aops->accept(rqstp, authp); +} + +int svc_set_client(struct svc_rqst *rqstp) +{ + return rqstp->rq_authop->set_client(rqstp); +} + +/* A request, which was authenticated, has now executed. + * Time to finalise the the credentials and verifier + * and release and resources + */ +int svc_authorise(struct svc_rqst *rqstp) +{ + struct auth_ops *aops = rqstp->rq_authop; + int rv = 0; + + rqstp->rq_authop = NULL; + + if (aops) { + rv = aops->release(rqstp); + module_put(aops->owner); + } + return rv; +} + +int +svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops) +{ + int rv = -EINVAL; + spin_lock(&authtab_lock); + if (flavor < RPC_AUTH_MAXFLAVOR && authtab[flavor] == NULL) { + authtab[flavor] = aops; + rv = 0; + } + spin_unlock(&authtab_lock); + return rv; +} + +void +svc_auth_unregister(rpc_authflavor_t flavor) +{ + spin_lock(&authtab_lock); + if (flavor < RPC_AUTH_MAXFLAVOR) + authtab[flavor] = NULL; + spin_unlock(&authtab_lock); +} +EXPORT_SYMBOL(svc_auth_unregister); + +/************************************************** + * cache for domain name to auth_domain + * Entries are only added by flavours which will normally + * have a structure that 'inherits' from auth_domain. + * e.g. when an IP -> domainname is given to auth_unix, + * and the domain name doesn't exist, it will create a + * auth_unix_domain and add it to this hash table. + * If it finds the name does exist, but isn't AUTH_UNIX, + * it will complain. + */ + +/* + * Auth auth_domain cache is somewhat different to other caches, + * largely because the entries are possibly of different types: + * each auth flavour has it's own type. + * One consequence of this that DefineCacheLookup cannot + * allocate a new structure as it cannot know the size. + * Notice that the "INIT" code fragment is quite different + * from other caches. When auth_domain_lookup might be + * creating a new domain, the new domain is passed in + * complete and it is used as-is rather than being copied into + * another structure. + */ +#define DN_HASHBITS 6 +#define DN_HASHMAX (1<<DN_HASHBITS) +#define DN_HASHMASK (DN_HASHMAX-1) + +static struct cache_head *auth_domain_table[DN_HASHMAX]; + +static void auth_domain_drop(struct cache_head *item, struct cache_detail *cd) +{ + struct auth_domain *dom = container_of(item, struct auth_domain, h); + if (cache_put(item,cd)) + authtab[dom->flavour]->domain_release(dom); +} + + +struct cache_detail auth_domain_cache = { + .hash_size = DN_HASHMAX, + .hash_table = auth_domain_table, + .name = "auth.domain", + .cache_put = auth_domain_drop, +}; + +void auth_domain_put(struct auth_domain *dom) +{ + auth_domain_drop(&dom->h, &auth_domain_cache); +} + +static inline int auth_domain_hash(struct auth_domain *item) +{ + return hash_str(item->name, DN_HASHBITS); +} +static inline int auth_domain_match(struct auth_domain *tmp, struct auth_domain *item) +{ + return strcmp(tmp->name, item->name) == 0; +} + +struct auth_domain * +auth_domain_lookup(struct auth_domain *item, int set) +{ + struct auth_domain *tmp = NULL; + struct cache_head **hp, **head; + head = &auth_domain_cache.hash_table[auth_domain_hash(item)]; + + if (set) + write_lock(&auth_domain_cache.hash_lock); + else + read_lock(&auth_domain_cache.hash_lock); + for (hp=head; *hp != NULL; hp = &tmp->h.next) { + tmp = container_of(*hp, struct auth_domain, h); + if (!auth_domain_match(tmp, item)) + continue; + if (!set) { + cache_get(&tmp->h); + goto out_noset; + } + *hp = tmp->h.next; + tmp->h.next = NULL; + auth_domain_drop(&tmp->h, &auth_domain_cache); + goto out_set; + } + /* Didn't find anything */ + if (!set) + goto out_nada; + auth_domain_cache.entries++; +out_set: + item->h.next = *head; + *head = &item->h; + cache_get(&item->h); + write_unlock(&auth_domain_cache.hash_lock); + cache_fresh(&auth_domain_cache, &item->h, item->h.expiry_time); + cache_get(&item->h); + return item; +out_nada: + tmp = NULL; +out_noset: + read_unlock(&auth_domain_cache.hash_lock); + return tmp; +} + +struct auth_domain *auth_domain_find(char *name) +{ + struct auth_domain *rv, ad; + + ad.name = name; + rv = auth_domain_lookup(&ad, 0); + return rv; +} diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c new file mode 100644 index 000000000000..2b99b4028d31 --- /dev/null +++ b/net/sunrpc/svcauth_unix.c @@ -0,0 +1,502 @@ +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/sunrpc/types.h> +#include <linux/sunrpc/xdr.h> +#include <linux/sunrpc/svcsock.h> +#include <linux/sunrpc/svcauth.h> +#include <linux/err.h> +#include <linux/seq_file.h> +#include <linux/hash.h> + +#define RPCDBG_FACILITY RPCDBG_AUTH + + +/* + * AUTHUNIX and AUTHNULL credentials are both handled here. + * AUTHNULL is treated just like AUTHUNIX except that the uid/gid + * are always nobody (-2). i.e. we do the same IP address checks for + * AUTHNULL as for AUTHUNIX, and that is done here. + */ + + +static char *strdup(char *s) +{ + char *rv = kmalloc(strlen(s)+1, GFP_KERNEL); + if (rv) + strcpy(rv, s); + return rv; +} + +struct unix_domain { + struct auth_domain h; + int addr_changes; + /* other stuff later */ +}; + +struct auth_domain *unix_domain_find(char *name) +{ + struct auth_domain *rv, ud; + struct unix_domain *new; + + ud.name = name; + + rv = auth_domain_lookup(&ud, 0); + + foundit: + if (rv && rv->flavour != RPC_AUTH_UNIX) { + auth_domain_put(rv); + return NULL; + } + if (rv) + return rv; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (new == NULL) + return NULL; + cache_init(&new->h.h); + new->h.name = strdup(name); + new->h.flavour = RPC_AUTH_UNIX; + new->addr_changes = 0; + new->h.h.expiry_time = NEVER; + + rv = auth_domain_lookup(&new->h, 2); + if (rv == &new->h) { + if (atomic_dec_and_test(&new->h.h.refcnt)) BUG(); + } else { + auth_domain_put(&new->h); + goto foundit; + } + + return rv; +} + +static void svcauth_unix_domain_release(struct auth_domain *dom) +{ + struct unix_domain *ud = container_of(dom, struct unix_domain, h); + + kfree(dom->name); + kfree(ud); +} + + +/************************************************** + * cache for IP address to unix_domain + * as needed by AUTH_UNIX + */ +#define IP_HASHBITS 8 +#define IP_HASHMAX (1<<IP_HASHBITS) +#define IP_HASHMASK (IP_HASHMAX-1) + +struct ip_map { + struct cache_head h; + char m_class[8]; /* e.g. "nfsd" */ + struct in_addr m_addr; + struct unix_domain *m_client; + int m_add_change; +}; +static struct cache_head *ip_table[IP_HASHMAX]; + +static void ip_map_put(struct cache_head *item, struct cache_detail *cd) +{ + struct ip_map *im = container_of(item, struct ip_map,h); + if (cache_put(item, cd)) { + if (test_bit(CACHE_VALID, &item->flags) && + !test_bit(CACHE_NEGATIVE, &item->flags)) + auth_domain_put(&im->m_client->h); + kfree(im); + } +} + +static inline int ip_map_hash(struct ip_map *item) +{ + return hash_str(item->m_class, IP_HASHBITS) ^ + hash_long((unsigned long)item->m_addr.s_addr, IP_HASHBITS); +} +static inline int ip_map_match(struct ip_map *item, struct ip_map *tmp) +{ + return strcmp(tmp->m_class, item->m_class) == 0 + && tmp->m_addr.s_addr == item->m_addr.s_addr; +} +static inline void ip_map_init(struct ip_map *new, struct ip_map *item) +{ + strcpy(new->m_class, item->m_class); + new->m_addr.s_addr = item->m_addr.s_addr; +} +static inline void ip_map_update(struct ip_map *new, struct ip_map *item) +{ + cache_get(&item->m_client->h.h); + new->m_client = item->m_client; + new->m_add_change = item->m_add_change; +} + +static void ip_map_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + char text_addr[20]; + struct ip_map *im = container_of(h, struct ip_map, h); + __u32 addr = im->m_addr.s_addr; + + snprintf(text_addr, 20, "%u.%u.%u.%u", + ntohl(addr) >> 24 & 0xff, + ntohl(addr) >> 16 & 0xff, + ntohl(addr) >> 8 & 0xff, + ntohl(addr) >> 0 & 0xff); + + qword_add(bpp, blen, im->m_class); + qword_add(bpp, blen, text_addr); + (*bpp)[-1] = '\n'; +} + +static struct ip_map *ip_map_lookup(struct ip_map *, int); + +static int ip_map_parse(struct cache_detail *cd, + char *mesg, int mlen) +{ + /* class ipaddress [domainname] */ + /* should be safe just to use the start of the input buffer + * for scratch: */ + char *buf = mesg; + int len; + int b1,b2,b3,b4; + char c; + struct ip_map ipm, *ipmp; + struct auth_domain *dom; + time_t expiry; + + if (mesg[mlen-1] != '\n') + return -EINVAL; + mesg[mlen-1] = 0; + + /* class */ + len = qword_get(&mesg, ipm.m_class, sizeof(ipm.m_class)); + if (len <= 0) return -EINVAL; + + /* ip address */ + len = qword_get(&mesg, buf, mlen); + if (len <= 0) return -EINVAL; + + if (sscanf(buf, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4) + return -EINVAL; + + expiry = get_expiry(&mesg); + if (expiry ==0) + return -EINVAL; + + /* domainname, or empty for NEGATIVE */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) return -EINVAL; + + if (len) { + dom = unix_domain_find(buf); + if (dom == NULL) + return -ENOENT; + } else + dom = NULL; + + ipm.m_addr.s_addr = + htonl((((((b1<<8)|b2)<<8)|b3)<<8)|b4); + ipm.h.flags = 0; + if (dom) { + ipm.m_client = container_of(dom, struct unix_domain, h); + ipm.m_add_change = ipm.m_client->addr_changes; + } else + set_bit(CACHE_NEGATIVE, &ipm.h.flags); + ipm.h.expiry_time = expiry; + + ipmp = ip_map_lookup(&ipm, 1); + if (ipmp) + ip_map_put(&ipmp->h, &ip_map_cache); + if (dom) + auth_domain_put(dom); + if (!ipmp) + return -ENOMEM; + cache_flush(); + return 0; +} + +static int ip_map_show(struct seq_file *m, + struct cache_detail *cd, + struct cache_head *h) +{ + struct ip_map *im; + struct in_addr addr; + char *dom = "-no-domain-"; + + if (h == NULL) { + seq_puts(m, "#class IP domain\n"); + return 0; + } + im = container_of(h, struct ip_map, h); + /* class addr domain */ + addr = im->m_addr; + + if (test_bit(CACHE_VALID, &h->flags) && + !test_bit(CACHE_NEGATIVE, &h->flags)) + dom = im->m_client->h.name; + + seq_printf(m, "%s %d.%d.%d.%d %s\n", + im->m_class, + htonl(addr.s_addr) >> 24 & 0xff, + htonl(addr.s_addr) >> 16 & 0xff, + htonl(addr.s_addr) >> 8 & 0xff, + htonl(addr.s_addr) >> 0 & 0xff, + dom + ); + return 0; +} + + +struct cache_detail ip_map_cache = { + .hash_size = IP_HASHMAX, + .hash_table = ip_table, + .name = "auth.unix.ip", + .cache_put = ip_map_put, + .cache_request = ip_map_request, + .cache_parse = ip_map_parse, + .cache_show = ip_map_show, +}; + +static DefineSimpleCacheLookup(ip_map, 0) + + +int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom) +{ + struct unix_domain *udom; + struct ip_map ip, *ipmp; + + if (dom->flavour != RPC_AUTH_UNIX) + return -EINVAL; + udom = container_of(dom, struct unix_domain, h); + strcpy(ip.m_class, "nfsd"); + ip.m_addr = addr; + ip.m_client = udom; + ip.m_add_change = udom->addr_changes+1; + ip.h.flags = 0; + ip.h.expiry_time = NEVER; + + ipmp = ip_map_lookup(&ip, 1); + + if (ipmp) { + ip_map_put(&ipmp->h, &ip_map_cache); + return 0; + } else + return -ENOMEM; +} + +int auth_unix_forget_old(struct auth_domain *dom) +{ + struct unix_domain *udom; + + if (dom->flavour != RPC_AUTH_UNIX) + return -EINVAL; + udom = container_of(dom, struct unix_domain, h); + udom->addr_changes++; + return 0; +} + +struct auth_domain *auth_unix_lookup(struct in_addr addr) +{ + struct ip_map key, *ipm; + struct auth_domain *rv; + + strcpy(key.m_class, "nfsd"); + key.m_addr = addr; + + ipm = ip_map_lookup(&key, 0); + + if (!ipm) + return NULL; + if (cache_check(&ip_map_cache, &ipm->h, NULL)) + return NULL; + + if ((ipm->m_client->addr_changes - ipm->m_add_change) >0) { + if (test_and_set_bit(CACHE_NEGATIVE, &ipm->h.flags) == 0) + auth_domain_put(&ipm->m_client->h); + rv = NULL; + } else { + rv = &ipm->m_client->h; + cache_get(&rv->h); + } + ip_map_put(&ipm->h, &ip_map_cache); + return rv; +} + +void svcauth_unix_purge(void) +{ + cache_purge(&ip_map_cache); + cache_purge(&auth_domain_cache); +} + +static int +svcauth_unix_set_client(struct svc_rqst *rqstp) +{ + struct ip_map key, *ipm; + + rqstp->rq_client = NULL; + if (rqstp->rq_proc == 0) + return SVC_OK; + + strcpy(key.m_class, rqstp->rq_server->sv_program->pg_class); + key.m_addr = rqstp->rq_addr.sin_addr; + + ipm = ip_map_lookup(&key, 0); + + if (ipm == NULL) + return SVC_DENIED; + + switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) { + default: + BUG(); + case -EAGAIN: + return SVC_DROP; + case -ENOENT: + return SVC_DENIED; + case 0: + rqstp->rq_client = &ipm->m_client->h; + cache_get(&rqstp->rq_client->h); + ip_map_put(&ipm->h, &ip_map_cache); + break; + } + return SVC_OK; +} + +static int +svcauth_null_accept(struct svc_rqst *rqstp, u32 *authp) +{ + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec *resv = &rqstp->rq_res.head[0]; + struct svc_cred *cred = &rqstp->rq_cred; + + cred->cr_group_info = NULL; + rqstp->rq_client = NULL; + + if (argv->iov_len < 3*4) + return SVC_GARBAGE; + + if (svc_getu32(argv) != 0) { + dprintk("svc: bad null cred\n"); + *authp = rpc_autherr_badcred; + return SVC_DENIED; + } + if (svc_getu32(argv) != RPC_AUTH_NULL || svc_getu32(argv) != 0) { + dprintk("svc: bad null verf\n"); + *authp = rpc_autherr_badverf; + return SVC_DENIED; + } + + /* Signal that mapping to nobody uid/gid is required */ + cred->cr_uid = (uid_t) -1; + cred->cr_gid = (gid_t) -1; + cred->cr_group_info = groups_alloc(0); + if (cred->cr_group_info == NULL) + return SVC_DROP; /* kmalloc failure - client must retry */ + + /* Put NULL verifier */ + svc_putu32(resv, RPC_AUTH_NULL); + svc_putu32(resv, 0); + + return SVC_OK; +} + +static int +svcauth_null_release(struct svc_rqst *rqstp) +{ + if (rqstp->rq_client) + auth_domain_put(rqstp->rq_client); + rqstp->rq_client = NULL; + if (rqstp->rq_cred.cr_group_info) + put_group_info(rqstp->rq_cred.cr_group_info); + rqstp->rq_cred.cr_group_info = NULL; + + return 0; /* don't drop */ +} + + +struct auth_ops svcauth_null = { + .name = "null", + .owner = THIS_MODULE, + .flavour = RPC_AUTH_NULL, + .accept = svcauth_null_accept, + .release = svcauth_null_release, + .set_client = svcauth_unix_set_client, +}; + + +static int +svcauth_unix_accept(struct svc_rqst *rqstp, u32 *authp) +{ + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec *resv = &rqstp->rq_res.head[0]; + struct svc_cred *cred = &rqstp->rq_cred; + u32 slen, i; + int len = argv->iov_len; + + cred->cr_group_info = NULL; + rqstp->rq_client = NULL; + + if ((len -= 3*4) < 0) + return SVC_GARBAGE; + + svc_getu32(argv); /* length */ + svc_getu32(argv); /* time stamp */ + slen = XDR_QUADLEN(ntohl(svc_getu32(argv))); /* machname length */ + if (slen > 64 || (len -= (slen + 3)*4) < 0) + goto badcred; + argv->iov_base = (void*)((u32*)argv->iov_base + slen); /* skip machname */ + argv->iov_len -= slen*4; + + cred->cr_uid = ntohl(svc_getu32(argv)); /* uid */ + cred->cr_gid = ntohl(svc_getu32(argv)); /* gid */ + slen = ntohl(svc_getu32(argv)); /* gids length */ + if (slen > 16 || (len -= (slen + 2)*4) < 0) + goto badcred; + cred->cr_group_info = groups_alloc(slen); + if (cred->cr_group_info == NULL) + return SVC_DROP; + for (i = 0; i < slen; i++) + GROUP_AT(cred->cr_group_info, i) = ntohl(svc_getu32(argv)); + + if (svc_getu32(argv) != RPC_AUTH_NULL || svc_getu32(argv) != 0) { + *authp = rpc_autherr_badverf; + return SVC_DENIED; + } + + /* Put NULL verifier */ + svc_putu32(resv, RPC_AUTH_NULL); + svc_putu32(resv, 0); + + return SVC_OK; + +badcred: + *authp = rpc_autherr_badcred; + return SVC_DENIED; +} + +static int +svcauth_unix_release(struct svc_rqst *rqstp) +{ + /* Verifier (such as it is) is already in place. + */ + if (rqstp->rq_client) + auth_domain_put(rqstp->rq_client); + rqstp->rq_client = NULL; + if (rqstp->rq_cred.cr_group_info) + put_group_info(rqstp->rq_cred.cr_group_info); + rqstp->rq_cred.cr_group_info = NULL; + + return 0; +} + + +struct auth_ops svcauth_unix = { + .name = "unix", + .owner = THIS_MODULE, + .flavour = RPC_AUTH_UNIX, + .accept = svcauth_unix_accept, + .release = svcauth_unix_release, + .domain_release = svcauth_unix_domain_release, + .set_client = svcauth_unix_set_client, +}; + diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c new file mode 100644 index 000000000000..05907035bc96 --- /dev/null +++ b/net/sunrpc/svcsock.c @@ -0,0 +1,1585 @@ +/* + * linux/net/sunrpc/svcsock.c + * + * These are the RPC server socket internals. + * + * The server scheduling algorithm does not always distribute the load + * evenly when servicing a single client. May need to modify the + * svc_sock_enqueue procedure... + * + * TCP support is largely untested and may be a little slow. The problem + * is that we currently do two separate recvfrom's, one for the 4-byte + * record length, and the second for the actual record. This could possibly + * be improved by always reading a minimum size of around 100 bytes and + * tucking any superfluous bytes away in a temporary store. Still, that + * leaves write requests out in the rain. An alternative may be to peek at + * the first skb in the queue, and if it matches the next TCP sequence + * number, to extract the record marker. Yuck. + * + * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/fcntl.h> +#include <linux/net.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/udp.h> +#include <linux/tcp.h> +#include <linux/unistd.h> +#include <linux/slab.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/checksum.h> +#include <net/ip.h> +#include <net/tcp.h> +#include <asm/uaccess.h> +#include <asm/ioctls.h> + +#include <linux/sunrpc/types.h> +#include <linux/sunrpc/xdr.h> +#include <linux/sunrpc/svcsock.h> +#include <linux/sunrpc/stats.h> + +/* SMP locking strategy: + * + * svc_serv->sv_lock protects most stuff for that service. + * + * Some flags can be set to certain values at any time + * providing that certain rules are followed: + * + * SK_BUSY can be set to 0 at any time. + * svc_sock_enqueue must be called afterwards + * SK_CONN, SK_DATA, can be set or cleared at any time. + * after a set, svc_sock_enqueue must be called. + * after a clear, the socket must be read/accepted + * if this succeeds, it must be set again. + * SK_CLOSE can set at any time. It is never cleared. + * + */ + +#define RPCDBG_FACILITY RPCDBG_SVCSOCK + + +static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, + int *errp, int pmap_reg); +static void svc_udp_data_ready(struct sock *, int); +static int svc_udp_recvfrom(struct svc_rqst *); +static int svc_udp_sendto(struct svc_rqst *); + +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); +static int svc_deferred_recv(struct svc_rqst *rqstp); +static struct cache_deferred_req *svc_defer(struct cache_req *req); + +/* + * Queue up an idle server thread. Must have serv->sv_lock held. + * Note: this is really a stack rather than a queue, so that we only + * use as many different threads as we need, and the rest don't polute + * the cache. + */ +static inline void +svc_serv_enqueue(struct svc_serv *serv, struct svc_rqst *rqstp) +{ + list_add(&rqstp->rq_list, &serv->sv_threads); +} + +/* + * Dequeue an nfsd thread. Must have serv->sv_lock held. + */ +static inline void +svc_serv_dequeue(struct svc_serv *serv, struct svc_rqst *rqstp) +{ + list_del(&rqstp->rq_list); +} + +/* + * Release an skbuff after use + */ +static inline void +svc_release_skb(struct svc_rqst *rqstp) +{ + struct sk_buff *skb = rqstp->rq_skbuff; + struct svc_deferred_req *dr = rqstp->rq_deferred; + + if (skb) { + rqstp->rq_skbuff = NULL; + + dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); + skb_free_datagram(rqstp->rq_sock->sk_sk, skb); + } + if (dr) { + rqstp->rq_deferred = NULL; + kfree(dr); + } +} + +/* + * Any space to write? + */ +static inline unsigned long +svc_sock_wspace(struct svc_sock *svsk) +{ + int wspace; + + if (svsk->sk_sock->type == SOCK_STREAM) + wspace = sk_stream_wspace(svsk->sk_sk); + else + wspace = sock_wspace(svsk->sk_sk); + + return wspace; +} + +/* + * Queue up a socket with data pending. If there are idle nfsd + * processes, wake 'em up. + * + */ +static void +svc_sock_enqueue(struct svc_sock *svsk) +{ + struct svc_serv *serv = svsk->sk_server; + struct svc_rqst *rqstp; + + if (!(svsk->sk_flags & + ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) )) + return; + if (test_bit(SK_DEAD, &svsk->sk_flags)) + return; + + spin_lock_bh(&serv->sv_lock); + + if (!list_empty(&serv->sv_threads) && + !list_empty(&serv->sv_sockets)) + printk(KERN_ERR + "svc_sock_enqueue: threads and sockets both waiting??\n"); + + if (test_bit(SK_DEAD, &svsk->sk_flags)) { + /* Don't enqueue dead sockets */ + dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk); + goto out_unlock; + } + + if (test_bit(SK_BUSY, &svsk->sk_flags)) { + /* Don't enqueue socket while daemon is receiving */ + dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); + goto out_unlock; + } + + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + if (((svsk->sk_reserved + serv->sv_bufsz)*2 + > svc_sock_wspace(svsk)) + && !test_bit(SK_CLOSE, &svsk->sk_flags) + && !test_bit(SK_CONN, &svsk->sk_flags)) { + /* Don't enqueue while not enough space for reply */ + dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n", + svsk->sk_sk, svsk->sk_reserved+serv->sv_bufsz, + svc_sock_wspace(svsk)); + goto out_unlock; + } + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + + /* Mark socket as busy. It will remain in this state until the + * server has processed all pending data and put the socket back + * on the idle list. + */ + set_bit(SK_BUSY, &svsk->sk_flags); + + if (!list_empty(&serv->sv_threads)) { + rqstp = list_entry(serv->sv_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: socket %p served by daemon %p\n", + svsk->sk_sk, rqstp); + svc_serv_dequeue(serv, rqstp); + if (rqstp->rq_sock) + printk(KERN_ERR + "svc_sock_enqueue: server %p, rq_sock=%p!\n", + rqstp, rqstp->rq_sock); + rqstp->rq_sock = svsk; + svsk->sk_inuse++; + rqstp->rq_reserved = serv->sv_bufsz; + svsk->sk_reserved += rqstp->rq_reserved; + wake_up(&rqstp->rq_wait); + } else { + dprintk("svc: socket %p put into queue\n", svsk->sk_sk); + list_add_tail(&svsk->sk_ready, &serv->sv_sockets); + } + +out_unlock: + spin_unlock_bh(&serv->sv_lock); +} + +/* + * Dequeue the first socket. Must be called with the serv->sv_lock held. + */ +static inline struct svc_sock * +svc_sock_dequeue(struct svc_serv *serv) +{ + struct svc_sock *svsk; + + if (list_empty(&serv->sv_sockets)) + return NULL; + + svsk = list_entry(serv->sv_sockets.next, + struct svc_sock, sk_ready); + list_del_init(&svsk->sk_ready); + + dprintk("svc: socket %p dequeued, inuse=%d\n", + svsk->sk_sk, svsk->sk_inuse); + + return svsk; +} + +/* + * Having read something from a socket, check whether it + * needs to be re-enqueued. + * Note: SK_DATA only gets cleared when a read-attempt finds + * no (or insufficient) data. + */ +static inline void +svc_sock_received(struct svc_sock *svsk) +{ + clear_bit(SK_BUSY, &svsk->sk_flags); + svc_sock_enqueue(svsk); +} + + +/** + * svc_reserve - change the space reserved for the reply to a request. + * @rqstp: The request in question + * @space: new max space to reserve + * + * Each request reserves some space on the output queue of the socket + * to make sure the reply fits. This function reduces that reserved + * space to be the amount of space used already, plus @space. + * + */ +void svc_reserve(struct svc_rqst *rqstp, int space) +{ + space += rqstp->rq_res.head[0].iov_len; + + if (space < rqstp->rq_reserved) { + struct svc_sock *svsk = rqstp->rq_sock; + spin_lock_bh(&svsk->sk_server->sv_lock); + svsk->sk_reserved -= (rqstp->rq_reserved - space); + rqstp->rq_reserved = space; + spin_unlock_bh(&svsk->sk_server->sv_lock); + + svc_sock_enqueue(svsk); + } +} + +/* + * Release a socket after use. + */ +static inline void +svc_sock_put(struct svc_sock *svsk) +{ + struct svc_serv *serv = svsk->sk_server; + + spin_lock_bh(&serv->sv_lock); + if (!--(svsk->sk_inuse) && test_bit(SK_DEAD, &svsk->sk_flags)) { + spin_unlock_bh(&serv->sv_lock); + dprintk("svc: releasing dead socket\n"); + sock_release(svsk->sk_sock); + kfree(svsk); + } + else + spin_unlock_bh(&serv->sv_lock); +} + +static void +svc_sock_release(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk = rqstp->rq_sock; + + svc_release_skb(rqstp); + + svc_free_allpages(rqstp); + rqstp->rq_res.page_len = 0; + rqstp->rq_res.page_base = 0; + + + /* Reset response buffer and release + * the reservation. + * But first, check that enough space was reserved + * for the reply, otherwise we have a bug! + */ + if ((rqstp->rq_res.len) > rqstp->rq_reserved) + printk(KERN_ERR "RPC request reserved %d but used %d\n", + rqstp->rq_reserved, + rqstp->rq_res.len); + + rqstp->rq_res.head[0].iov_len = 0; + svc_reserve(rqstp, 0); + rqstp->rq_sock = NULL; + + svc_sock_put(svsk); +} + +/* + * External function to wake up a server waiting for data + */ +void +svc_wake_up(struct svc_serv *serv) +{ + struct svc_rqst *rqstp; + + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_threads)) { + rqstp = list_entry(serv->sv_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: daemon %p woken up.\n", rqstp); + /* + svc_serv_dequeue(serv, rqstp); + rqstp->rq_sock = NULL; + */ + wake_up(&rqstp->rq_wait); + } + spin_unlock_bh(&serv->sv_lock); +} + +/* + * Generic sendto routine + */ +static int +svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) +{ + struct svc_sock *svsk = rqstp->rq_sock; + struct socket *sock = svsk->sk_sock; + int slen; + char buffer[CMSG_SPACE(sizeof(struct in_pktinfo))]; + struct cmsghdr *cmh = (struct cmsghdr *)buffer; + struct in_pktinfo *pki = (struct in_pktinfo *)CMSG_DATA(cmh); + int len = 0; + int result; + int size; + struct page **ppage = xdr->pages; + size_t base = xdr->page_base; + unsigned int pglen = xdr->page_len; + unsigned int flags = MSG_MORE; + + slen = xdr->len; + + if (rqstp->rq_prot == IPPROTO_UDP) { + /* set the source and destination */ + struct msghdr msg; + msg.msg_name = &rqstp->rq_addr; + msg.msg_namelen = sizeof(rqstp->rq_addr); + msg.msg_iov = NULL; + msg.msg_iovlen = 0; + msg.msg_flags = MSG_MORE; + + msg.msg_control = cmh; + msg.msg_controllen = sizeof(buffer); + cmh->cmsg_len = CMSG_LEN(sizeof(*pki)); + cmh->cmsg_level = SOL_IP; + cmh->cmsg_type = IP_PKTINFO; + pki->ipi_ifindex = 0; + pki->ipi_spec_dst.s_addr = rqstp->rq_daddr; + + if (sock_sendmsg(sock, &msg, 0) < 0) + goto out; + } + + /* send head */ + if (slen == xdr->head[0].iov_len) + flags = 0; + len = sock->ops->sendpage(sock, rqstp->rq_respages[0], 0, xdr->head[0].iov_len, flags); + if (len != xdr->head[0].iov_len) + goto out; + slen -= xdr->head[0].iov_len; + if (slen == 0) + goto out; + + /* send page data */ + size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen; + while (pglen > 0) { + if (slen == size) + flags = 0; + result = sock->ops->sendpage(sock, *ppage, base, size, flags); + if (result > 0) + len += result; + if (result != size) + goto out; + slen -= size; + pglen -= size; + size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen; + base = 0; + ppage++; + } + /* send tail */ + if (xdr->tail[0].iov_len) { + result = sock->ops->sendpage(sock, rqstp->rq_respages[rqstp->rq_restailpage], + ((unsigned long)xdr->tail[0].iov_base)& (PAGE_SIZE-1), + xdr->tail[0].iov_len, 0); + + if (result > 0) + len += result; + } +out: + dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %x)\n", + rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, + rqstp->rq_addr.sin_addr.s_addr); + + return len; +} + +/* + * Check input queue length + */ +static int +svc_recv_available(struct svc_sock *svsk) +{ + mm_segment_t oldfs; + struct socket *sock = svsk->sk_sock; + int avail, err; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock->ops->ioctl(sock, TIOCINQ, (unsigned long) &avail); + set_fs(oldfs); + + return (err >= 0)? avail : err; +} + +/* + * Generic recvfrom routine. + */ +static int +svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen) +{ + struct msghdr msg; + struct socket *sock; + int len, alen; + + rqstp->rq_addrlen = sizeof(rqstp->rq_addr); + sock = rqstp->rq_sock->sk_sock; + + msg.msg_name = &rqstp->rq_addr; + msg.msg_namelen = sizeof(rqstp->rq_addr); + msg.msg_control = NULL; + msg.msg_controllen = 0; + + msg.msg_flags = MSG_DONTWAIT; + + len = kernel_recvmsg(sock, &msg, iov, nr, buflen, MSG_DONTWAIT); + + /* sock_recvmsg doesn't fill in the name/namelen, so we must.. + * possibly we should cache this in the svc_sock structure + * at accept time. FIXME + */ + alen = sizeof(rqstp->rq_addr); + sock->ops->getname(sock, (struct sockaddr *)&rqstp->rq_addr, &alen, 1); + + dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", + rqstp->rq_sock, iov[0].iov_base, iov[0].iov_len, len); + + return len; +} + +/* + * Set socket snd and rcv buffer lengths + */ +static inline void +svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv) +{ +#if 0 + mm_segment_t oldfs; + oldfs = get_fs(); set_fs(KERNEL_DS); + sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, + (char*)&snd, sizeof(snd)); + sock_setsockopt(sock, SOL_SOCKET, SO_RCVBUF, + (char*)&rcv, sizeof(rcv)); +#else + /* sock_setsockopt limits use to sysctl_?mem_max, + * which isn't acceptable. Until that is made conditional + * on not having CAP_SYS_RESOURCE or similar, we go direct... + * DaveM said I could! + */ + lock_sock(sock->sk); + sock->sk->sk_sndbuf = snd * 2; + sock->sk->sk_rcvbuf = rcv * 2; + sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; + release_sock(sock->sk); +#endif +} +/* + * INET callback when data has been received on the socket. + */ +static void +svc_udp_data_ready(struct sock *sk, int count) +{ + struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); + + if (!svsk) + goto out; + dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", + svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags)); + set_bit(SK_DATA, &svsk->sk_flags); + svc_sock_enqueue(svsk); + out: + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); +} + +/* + * INET callback when space is newly available on the socket. + */ +static void +svc_write_space(struct sock *sk) +{ + struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); + + if (svsk) { + dprintk("svc: socket %p(inet %p), write_space busy=%d\n", + svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags)); + svc_sock_enqueue(svsk); + } + + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) { + printk(KERN_WARNING "RPC svc_write_space: some sleeping on %p\n", + svsk); + wake_up_interruptible(sk->sk_sleep); + } +} + +/* + * Receive a datagram from a UDP socket. + */ +extern int +csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb); + +static int +svc_udp_recvfrom(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk = rqstp->rq_sock; + struct svc_serv *serv = svsk->sk_server; + struct sk_buff *skb; + int err, len; + + if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) + /* udp sockets need large rcvbuf as all pending + * requests are still in that buffer. sndbuf must + * also be large enough that there is enough space + * for one reply per thread. + */ + svc_sock_setbufsize(svsk->sk_sock, + (serv->sv_nrthreads+3) * serv->sv_bufsz, + (serv->sv_nrthreads+3) * serv->sv_bufsz); + + if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { + svc_sock_received(svsk); + return svc_deferred_recv(rqstp); + } + + clear_bit(SK_DATA, &svsk->sk_flags); + while ((skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err)) == NULL) { + if (err == -EAGAIN) { + svc_sock_received(svsk); + return err; + } + /* possibly an icmp error */ + dprintk("svc: recvfrom returned error %d\n", -err); + } + if (skb->stamp.tv_sec == 0) { + skb->stamp.tv_sec = xtime.tv_sec; + skb->stamp.tv_usec = xtime.tv_nsec * 1000; + /* Don't enable netstamp, sunrpc doesn't + need that much accuracy */ + } + svsk->sk_sk->sk_stamp = skb->stamp; + set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ + + /* + * Maybe more packets - kick another thread ASAP. + */ + svc_sock_received(svsk); + + len = skb->len - sizeof(struct udphdr); + rqstp->rq_arg.len = len; + + rqstp->rq_prot = IPPROTO_UDP; + + /* Get sender address */ + rqstp->rq_addr.sin_family = AF_INET; + rqstp->rq_addr.sin_port = skb->h.uh->source; + rqstp->rq_addr.sin_addr.s_addr = skb->nh.iph->saddr; + rqstp->rq_daddr = skb->nh.iph->daddr; + + if (skb_is_nonlinear(skb)) { + /* we have to copy */ + local_bh_disable(); + if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) { + local_bh_enable(); + /* checksum error */ + skb_free_datagram(svsk->sk_sk, skb); + return 0; + } + local_bh_enable(); + skb_free_datagram(svsk->sk_sk, skb); + } else { + /* we can use it in-place */ + rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); + rqstp->rq_arg.head[0].iov_len = len; + if (skb->ip_summed != CHECKSUM_UNNECESSARY) { + if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { + skb_free_datagram(svsk->sk_sk, skb); + return 0; + } + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + rqstp->rq_skbuff = skb; + } + + rqstp->rq_arg.page_base = 0; + if (len <= rqstp->rq_arg.head[0].iov_len) { + rqstp->rq_arg.head[0].iov_len = len; + rqstp->rq_arg.page_len = 0; + } else { + rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; + rqstp->rq_argused += (rqstp->rq_arg.page_len + PAGE_SIZE - 1)/ PAGE_SIZE; + } + + if (serv->sv_stats) + serv->sv_stats->netudpcnt++; + + return len; +} + +static int +svc_udp_sendto(struct svc_rqst *rqstp) +{ + int error; + + error = svc_sendto(rqstp, &rqstp->rq_res); + if (error == -ECONNREFUSED) + /* ICMP error on earlier request. */ + error = svc_sendto(rqstp, &rqstp->rq_res); + + return error; +} + +static void +svc_udp_init(struct svc_sock *svsk) +{ + svsk->sk_sk->sk_data_ready = svc_udp_data_ready; + svsk->sk_sk->sk_write_space = svc_write_space; + svsk->sk_recvfrom = svc_udp_recvfrom; + svsk->sk_sendto = svc_udp_sendto; + + /* initialise setting must have enough space to + * receive and respond to one request. + * svc_udp_recvfrom will re-adjust if necessary + */ + svc_sock_setbufsize(svsk->sk_sock, + 3 * svsk->sk_server->sv_bufsz, + 3 * svsk->sk_server->sv_bufsz); + + set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */ + set_bit(SK_CHNGBUF, &svsk->sk_flags); +} + +/* + * A data_ready event on a listening socket means there's a connection + * pending. Do not use state_change as a substitute for it. + */ +static void +svc_tcp_listen_data_ready(struct sock *sk, int count_unused) +{ + struct svc_sock *svsk; + + dprintk("svc: socket %p TCP (listen) state change %d\n", + sk, sk->sk_state); + + if (sk->sk_state != TCP_LISTEN) { + /* + * This callback may called twice when a new connection + * is established as a child socket inherits everything + * from a parent LISTEN socket. + * 1) data_ready method of the parent socket will be called + * when one of child sockets become ESTABLISHED. + * 2) data_ready method of the child socket may be called + * when it receives data before the socket is accepted. + * In case of 2, we should ignore it silently. + */ + goto out; + } + if (!(svsk = (struct svc_sock *) sk->sk_user_data)) { + printk("svc: socket %p: no user data\n", sk); + goto out; + } + set_bit(SK_CONN, &svsk->sk_flags); + svc_sock_enqueue(svsk); + out: + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible_all(sk->sk_sleep); +} + +/* + * A state change on a connected socket means it's dying or dead. + */ +static void +svc_tcp_state_change(struct sock *sk) +{ + struct svc_sock *svsk; + + dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n", + sk, sk->sk_state, sk->sk_user_data); + + if (!(svsk = (struct svc_sock *) sk->sk_user_data)) { + printk("svc: socket %p: no user data\n", sk); + goto out; + } + set_bit(SK_CLOSE, &svsk->sk_flags); + svc_sock_enqueue(svsk); + out: + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible_all(sk->sk_sleep); +} + +static void +svc_tcp_data_ready(struct sock *sk, int count) +{ + struct svc_sock * svsk; + + dprintk("svc: socket %p TCP data ready (svsk %p)\n", + sk, sk->sk_user_data); + if (!(svsk = (struct svc_sock *)(sk->sk_user_data))) + goto out; + set_bit(SK_DATA, &svsk->sk_flags); + svc_sock_enqueue(svsk); + out: + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); +} + +/* + * Accept a TCP connection + */ +static void +svc_tcp_accept(struct svc_sock *svsk) +{ + struct sockaddr_in sin; + struct svc_serv *serv = svsk->sk_server; + struct socket *sock = svsk->sk_sock; + struct socket *newsock; + struct proto_ops *ops; + struct svc_sock *newsvsk; + int err, slen; + + dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); + if (!sock) + return; + + err = sock_create_lite(PF_INET, SOCK_STREAM, IPPROTO_TCP, &newsock); + if (err) { + if (err == -ENOMEM) + printk(KERN_WARNING "%s: no more sockets!\n", + serv->sv_name); + return; + } + + dprintk("svc: tcp_accept %p allocated\n", newsock); + newsock->ops = ops = sock->ops; + + clear_bit(SK_CONN, &svsk->sk_flags); + if ((err = ops->accept(sock, newsock, O_NONBLOCK)) < 0) { + if (err != -EAGAIN && net_ratelimit()) + printk(KERN_WARNING "%s: accept failed (err %d)!\n", + serv->sv_name, -err); + goto failed; /* aborted connection or whatever */ + } + set_bit(SK_CONN, &svsk->sk_flags); + svc_sock_enqueue(svsk); + + slen = sizeof(sin); + err = ops->getname(newsock, (struct sockaddr *) &sin, &slen, 1); + if (err < 0) { + if (net_ratelimit()) + printk(KERN_WARNING "%s: peername failed (err %d)!\n", + serv->sv_name, -err); + goto failed; /* aborted connection or whatever */ + } + + /* Ideally, we would want to reject connections from unauthorized + * hosts here, but when we get encription, the IP of the host won't + * tell us anything. For now just warn about unpriv connections. + */ + if (ntohs(sin.sin_port) >= 1024) { + dprintk(KERN_WARNING + "%s: connect from unprivileged port: %u.%u.%u.%u:%d\n", + serv->sv_name, + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + } + + dprintk("%s: connect from %u.%u.%u.%u:%04x\n", serv->sv_name, + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + + /* make sure that a write doesn't block forever when + * low on memory + */ + newsock->sk->sk_sndtimeo = HZ*30; + + if (!(newsvsk = svc_setup_socket(serv, newsock, &err, 0))) + goto failed; + + + /* make sure that we don't have too many active connections. + * If we have, something must be dropped. + * + * There's no point in trying to do random drop here for + * DoS prevention. The NFS clients does 1 reconnect in 15 + * seconds. An attacker can easily beat that. + * + * The only somewhat efficient mechanism would be if drop + * old connections from the same IP first. But right now + * we don't even record the client IP in svc_sock. + */ + if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { + struct svc_sock *svsk = NULL; + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_tempsocks)) { + if (net_ratelimit()) { + /* Try to help the admin */ + printk(KERN_NOTICE "%s: too many open TCP " + "sockets, consider increasing the " + "number of nfsd threads\n", + serv->sv_name); + printk(KERN_NOTICE "%s: last TCP connect from " + "%u.%u.%u.%u:%d\n", + serv->sv_name, + NIPQUAD(sin.sin_addr.s_addr), + ntohs(sin.sin_port)); + } + /* + * Always select the oldest socket. It's not fair, + * but so is life + */ + svsk = list_entry(serv->sv_tempsocks.prev, + struct svc_sock, + sk_list); + set_bit(SK_CLOSE, &svsk->sk_flags); + svsk->sk_inuse ++; + } + spin_unlock_bh(&serv->sv_lock); + + if (svsk) { + svc_sock_enqueue(svsk); + svc_sock_put(svsk); + } + + } + + if (serv->sv_stats) + serv->sv_stats->nettcpconn++; + + return; + +failed: + sock_release(newsock); + return; +} + +/* + * Receive data from a TCP socket. + */ +static int +svc_tcp_recvfrom(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk = rqstp->rq_sock; + struct svc_serv *serv = svsk->sk_server; + int len; + struct kvec vec[RPCSVC_MAXPAGES]; + int pnum, vlen; + + dprintk("svc: tcp_recv %p data %d conn %d close %d\n", + svsk, test_bit(SK_DATA, &svsk->sk_flags), + test_bit(SK_CONN, &svsk->sk_flags), + test_bit(SK_CLOSE, &svsk->sk_flags)); + + if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { + svc_sock_received(svsk); + return svc_deferred_recv(rqstp); + } + + if (test_bit(SK_CLOSE, &svsk->sk_flags)) { + svc_delete_socket(svsk); + return 0; + } + + if (test_bit(SK_CONN, &svsk->sk_flags)) { + svc_tcp_accept(svsk); + svc_sock_received(svsk); + return 0; + } + + if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) + /* sndbuf needs to have room for one request + * per thread, otherwise we can stall even when the + * network isn't a bottleneck. + * rcvbuf just needs to be able to hold a few requests. + * Normally they will be removed from the queue + * as soon a a complete request arrives. + */ + svc_sock_setbufsize(svsk->sk_sock, + (serv->sv_nrthreads+3) * serv->sv_bufsz, + 3 * serv->sv_bufsz); + + clear_bit(SK_DATA, &svsk->sk_flags); + + /* Receive data. If we haven't got the record length yet, get + * the next four bytes. Otherwise try to gobble up as much as + * possible up to the complete record length. + */ + if (svsk->sk_tcplen < 4) { + unsigned long want = 4 - svsk->sk_tcplen; + struct kvec iov; + + iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen; + iov.iov_len = want; + if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0) + goto error; + svsk->sk_tcplen += len; + + if (len < want) { + dprintk("svc: short recvfrom while reading record length (%d of %lu)\n", + len, want); + svc_sock_received(svsk); + return -EAGAIN; /* record header not complete */ + } + + svsk->sk_reclen = ntohl(svsk->sk_reclen); + if (!(svsk->sk_reclen & 0x80000000)) { + /* FIXME: technically, a record can be fragmented, + * and non-terminal fragments will not have the top + * bit set in the fragment length header. + * But apparently no known nfs clients send fragmented + * records. */ + printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (non-terminal)\n", + (unsigned long) svsk->sk_reclen); + goto err_delete; + } + svsk->sk_reclen &= 0x7fffffff; + dprintk("svc: TCP record, %d bytes\n", svsk->sk_reclen); + if (svsk->sk_reclen > serv->sv_bufsz) { + printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (large)\n", + (unsigned long) svsk->sk_reclen); + goto err_delete; + } + } + + /* Check whether enough data is available */ + len = svc_recv_available(svsk); + if (len < 0) + goto error; + + if (len < svsk->sk_reclen) { + dprintk("svc: incomplete TCP record (%d of %d)\n", + len, svsk->sk_reclen); + svc_sock_received(svsk); + return -EAGAIN; /* record not complete */ + } + len = svsk->sk_reclen; + set_bit(SK_DATA, &svsk->sk_flags); + + vec[0] = rqstp->rq_arg.head[0]; + vlen = PAGE_SIZE; + pnum = 1; + while (vlen < len) { + vec[pnum].iov_base = page_address(rqstp->rq_argpages[rqstp->rq_argused++]); + vec[pnum].iov_len = PAGE_SIZE; + pnum++; + vlen += PAGE_SIZE; + } + + /* Now receive data */ + len = svc_recvfrom(rqstp, vec, pnum, len); + if (len < 0) + goto error; + + dprintk("svc: TCP complete record (%d bytes)\n", len); + rqstp->rq_arg.len = len; + rqstp->rq_arg.page_base = 0; + if (len <= rqstp->rq_arg.head[0].iov_len) { + rqstp->rq_arg.head[0].iov_len = len; + rqstp->rq_arg.page_len = 0; + } else { + rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; + } + + rqstp->rq_skbuff = NULL; + rqstp->rq_prot = IPPROTO_TCP; + + /* Reset TCP read info */ + svsk->sk_reclen = 0; + svsk->sk_tcplen = 0; + + svc_sock_received(svsk); + if (serv->sv_stats) + serv->sv_stats->nettcpcnt++; + + return len; + + err_delete: + svc_delete_socket(svsk); + return -EAGAIN; + + error: + if (len == -EAGAIN) { + dprintk("RPC: TCP recvfrom got EAGAIN\n"); + svc_sock_received(svsk); + } else { + printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", + svsk->sk_server->sv_name, -len); + svc_sock_received(svsk); + } + + return len; +} + +/* + * Send out data on TCP socket. + */ +static int +svc_tcp_sendto(struct svc_rqst *rqstp) +{ + struct xdr_buf *xbufp = &rqstp->rq_res; + int sent; + u32 reclen; + + /* Set up the first element of the reply kvec. + * Any other kvecs that may be in use have been taken + * care of by the server implementation itself. + */ + reclen = htonl(0x80000000|((xbufp->len ) - 4)); + memcpy(xbufp->head[0].iov_base, &reclen, 4); + + if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags)) + return -ENOTCONN; + + sent = svc_sendto(rqstp, &rqstp->rq_res); + if (sent != xbufp->len) { + printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", + rqstp->rq_sock->sk_server->sv_name, + (sent<0)?"got error":"sent only", + sent, xbufp->len); + svc_delete_socket(rqstp->rq_sock); + sent = -EAGAIN; + } + return sent; +} + +static void +svc_tcp_init(struct svc_sock *svsk) +{ + struct sock *sk = svsk->sk_sk; + struct tcp_sock *tp = tcp_sk(sk); + + svsk->sk_recvfrom = svc_tcp_recvfrom; + svsk->sk_sendto = svc_tcp_sendto; + + if (sk->sk_state == TCP_LISTEN) { + dprintk("setting up TCP socket for listening\n"); + sk->sk_data_ready = svc_tcp_listen_data_ready; + set_bit(SK_CONN, &svsk->sk_flags); + } else { + dprintk("setting up TCP socket for reading\n"); + sk->sk_state_change = svc_tcp_state_change; + sk->sk_data_ready = svc_tcp_data_ready; + sk->sk_write_space = svc_write_space; + + svsk->sk_reclen = 0; + svsk->sk_tcplen = 0; + + tp->nonagle = 1; /* disable Nagle's algorithm */ + + /* initialise setting must have enough space to + * receive and respond to one request. + * svc_tcp_recvfrom will re-adjust if necessary + */ + svc_sock_setbufsize(svsk->sk_sock, + 3 * svsk->sk_server->sv_bufsz, + 3 * svsk->sk_server->sv_bufsz); + + set_bit(SK_CHNGBUF, &svsk->sk_flags); + set_bit(SK_DATA, &svsk->sk_flags); + if (sk->sk_state != TCP_ESTABLISHED) + set_bit(SK_CLOSE, &svsk->sk_flags); + } +} + +void +svc_sock_update_bufs(struct svc_serv *serv) +{ + /* + * The number of server threads has changed. Update + * rcvbuf and sndbuf accordingly on all sockets + */ + struct list_head *le; + + spin_lock_bh(&serv->sv_lock); + list_for_each(le, &serv->sv_permsocks) { + struct svc_sock *svsk = + list_entry(le, struct svc_sock, sk_list); + set_bit(SK_CHNGBUF, &svsk->sk_flags); + } + list_for_each(le, &serv->sv_tempsocks) { + struct svc_sock *svsk = + list_entry(le, struct svc_sock, sk_list); + set_bit(SK_CHNGBUF, &svsk->sk_flags); + } + spin_unlock_bh(&serv->sv_lock); +} + +/* + * Receive the next request on any socket. + */ +int +svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) +{ + struct svc_sock *svsk =NULL; + int len; + int pages; + struct xdr_buf *arg; + DECLARE_WAITQUEUE(wait, current); + + dprintk("svc: server %p waiting for data (to = %ld)\n", + rqstp, timeout); + + if (rqstp->rq_sock) + printk(KERN_ERR + "svc_recv: service %p, socket not NULL!\n", + rqstp); + if (waitqueue_active(&rqstp->rq_wait)) + printk(KERN_ERR + "svc_recv: service %p, wait queue active!\n", + rqstp); + + /* Initialize the buffers */ + /* first reclaim pages that were moved to response list */ + svc_pushback_allpages(rqstp); + + /* now allocate needed pages. If we get a failure, sleep briefly */ + pages = 2 + (serv->sv_bufsz + PAGE_SIZE -1) / PAGE_SIZE; + while (rqstp->rq_arghi < pages) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ/2); + continue; + } + rqstp->rq_argpages[rqstp->rq_arghi++] = p; + } + + /* Make arg->head point to first page and arg->pages point to rest */ + arg = &rqstp->rq_arg; + arg->head[0].iov_base = page_address(rqstp->rq_argpages[0]); + arg->head[0].iov_len = PAGE_SIZE; + rqstp->rq_argused = 1; + arg->pages = rqstp->rq_argpages + 1; + arg->page_base = 0; + /* save at least one page for response */ + arg->page_len = (pages-2)*PAGE_SIZE; + arg->len = (pages-1)*PAGE_SIZE; + arg->tail[0].iov_len = 0; + + try_to_freeze(PF_FREEZE); + if (signalled()) + return -EINTR; + + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_tempsocks)) { + svsk = list_entry(serv->sv_tempsocks.next, + struct svc_sock, sk_list); + /* apparently the "standard" is that clients close + * idle connections after 5 minutes, servers after + * 6 minutes + * http://www.connectathon.org/talks96/nfstcp.pdf + */ + if (get_seconds() - svsk->sk_lastrecv < 6*60 + || test_bit(SK_BUSY, &svsk->sk_flags)) + svsk = NULL; + } + if (svsk) { + set_bit(SK_BUSY, &svsk->sk_flags); + set_bit(SK_CLOSE, &svsk->sk_flags); + rqstp->rq_sock = svsk; + svsk->sk_inuse++; + } else if ((svsk = svc_sock_dequeue(serv)) != NULL) { + rqstp->rq_sock = svsk; + svsk->sk_inuse++; + rqstp->rq_reserved = serv->sv_bufsz; + svsk->sk_reserved += rqstp->rq_reserved; + } else { + /* No data pending. Go to sleep */ + svc_serv_enqueue(serv, rqstp); + + /* + * We have to be able to interrupt this wait + * to bring down the daemons ... + */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&rqstp->rq_wait, &wait); + spin_unlock_bh(&serv->sv_lock); + + schedule_timeout(timeout); + + try_to_freeze(PF_FREEZE); + + spin_lock_bh(&serv->sv_lock); + remove_wait_queue(&rqstp->rq_wait, &wait); + + if (!(svsk = rqstp->rq_sock)) { + svc_serv_dequeue(serv, rqstp); + spin_unlock_bh(&serv->sv_lock); + dprintk("svc: server %p, no data yet\n", rqstp); + return signalled()? -EINTR : -EAGAIN; + } + } + spin_unlock_bh(&serv->sv_lock); + + dprintk("svc: server %p, socket %p, inuse=%d\n", + rqstp, svsk, svsk->sk_inuse); + len = svsk->sk_recvfrom(rqstp); + dprintk("svc: got len=%d\n", len); + + /* No data, incomplete (TCP) read, or accept() */ + if (len == 0 || len == -EAGAIN) { + rqstp->rq_res.len = 0; + svc_sock_release(rqstp); + return -EAGAIN; + } + svsk->sk_lastrecv = get_seconds(); + if (test_bit(SK_TEMP, &svsk->sk_flags)) { + /* push active sockets to end of list */ + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&svsk->sk_list)) + list_move_tail(&svsk->sk_list, &serv->sv_tempsocks); + spin_unlock_bh(&serv->sv_lock); + } + + rqstp->rq_secure = ntohs(rqstp->rq_addr.sin_port) < 1024; + rqstp->rq_chandle.defer = svc_defer; + + if (serv->sv_stats) + serv->sv_stats->netcnt++; + return len; +} + +/* + * Drop request + */ +void +svc_drop(struct svc_rqst *rqstp) +{ + dprintk("svc: socket %p dropped request\n", rqstp->rq_sock); + svc_sock_release(rqstp); +} + +/* + * Return reply to client. + */ +int +svc_send(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk; + int len; + struct xdr_buf *xb; + + if ((svsk = rqstp->rq_sock) == NULL) { + printk(KERN_WARNING "NULL socket pointer in %s:%d\n", + __FILE__, __LINE__); + return -EFAULT; + } + + /* release the receive skb before sending the reply */ + svc_release_skb(rqstp); + + /* calculate over-all length */ + xb = & rqstp->rq_res; + xb->len = xb->head[0].iov_len + + xb->page_len + + xb->tail[0].iov_len; + + /* Grab svsk->sk_sem to serialize outgoing data. */ + down(&svsk->sk_sem); + if (test_bit(SK_DEAD, &svsk->sk_flags)) + len = -ENOTCONN; + else + len = svsk->sk_sendto(rqstp); + up(&svsk->sk_sem); + svc_sock_release(rqstp); + + if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) + return 0; + return len; +} + +/* + * Initialize socket for RPC use and create svc_sock struct + * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. + */ +static struct svc_sock * +svc_setup_socket(struct svc_serv *serv, struct socket *sock, + int *errp, int pmap_register) +{ + struct svc_sock *svsk; + struct sock *inet; + + dprintk("svc: svc_setup_socket %p\n", sock); + if (!(svsk = kmalloc(sizeof(*svsk), GFP_KERNEL))) { + *errp = -ENOMEM; + return NULL; + } + memset(svsk, 0, sizeof(*svsk)); + + inet = sock->sk; + + /* Register socket with portmapper */ + if (*errp >= 0 && pmap_register) + *errp = svc_register(serv, inet->sk_protocol, + ntohs(inet_sk(inet)->sport)); + + if (*errp < 0) { + kfree(svsk); + return NULL; + } + + set_bit(SK_BUSY, &svsk->sk_flags); + inet->sk_user_data = svsk; + svsk->sk_sock = sock; + svsk->sk_sk = inet; + svsk->sk_ostate = inet->sk_state_change; + svsk->sk_odata = inet->sk_data_ready; + svsk->sk_owspace = inet->sk_write_space; + svsk->sk_server = serv; + svsk->sk_lastrecv = get_seconds(); + INIT_LIST_HEAD(&svsk->sk_deferred); + INIT_LIST_HEAD(&svsk->sk_ready); + sema_init(&svsk->sk_sem, 1); + + /* Initialize the socket */ + if (sock->type == SOCK_DGRAM) + svc_udp_init(svsk); + else + svc_tcp_init(svsk); + + spin_lock_bh(&serv->sv_lock); + if (!pmap_register) { + set_bit(SK_TEMP, &svsk->sk_flags); + list_add(&svsk->sk_list, &serv->sv_tempsocks); + serv->sv_tmpcnt++; + } else { + clear_bit(SK_TEMP, &svsk->sk_flags); + list_add(&svsk->sk_list, &serv->sv_permsocks); + } + spin_unlock_bh(&serv->sv_lock); + + dprintk("svc: svc_setup_socket created %p (inet %p)\n", + svsk, svsk->sk_sk); + + clear_bit(SK_BUSY, &svsk->sk_flags); + svc_sock_enqueue(svsk); + return svsk; +} + +/* + * Create socket for RPC service. + */ +static int +svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin) +{ + struct svc_sock *svsk; + struct socket *sock; + int error; + int type; + + dprintk("svc: svc_create_socket(%s, %d, %u.%u.%u.%u:%d)\n", + serv->sv_program->pg_name, protocol, + NIPQUAD(sin->sin_addr.s_addr), + ntohs(sin->sin_port)); + + if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { + printk(KERN_WARNING "svc: only UDP and TCP " + "sockets supported\n"); + return -EINVAL; + } + type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; + + if ((error = sock_create_kern(PF_INET, type, protocol, &sock)) < 0) + return error; + + if (sin != NULL) { + if (type == SOCK_STREAM) + sock->sk->sk_reuse = 1; /* allow address reuse */ + error = sock->ops->bind(sock, (struct sockaddr *) sin, + sizeof(*sin)); + if (error < 0) + goto bummer; + } + + if (protocol == IPPROTO_TCP) { + if ((error = sock->ops->listen(sock, 64)) < 0) + goto bummer; + } + + if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL) + return 0; + +bummer: + dprintk("svc: svc_create_socket error = %d\n", -error); + sock_release(sock); + return error; +} + +/* + * Remove a dead socket + */ +void +svc_delete_socket(struct svc_sock *svsk) +{ + struct svc_serv *serv; + struct sock *sk; + + dprintk("svc: svc_delete_socket(%p)\n", svsk); + + serv = svsk->sk_server; + sk = svsk->sk_sk; + + sk->sk_state_change = svsk->sk_ostate; + sk->sk_data_ready = svsk->sk_odata; + sk->sk_write_space = svsk->sk_owspace; + + spin_lock_bh(&serv->sv_lock); + + list_del_init(&svsk->sk_list); + list_del_init(&svsk->sk_ready); + if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) + if (test_bit(SK_TEMP, &svsk->sk_flags)) + serv->sv_tmpcnt--; + + if (!svsk->sk_inuse) { + spin_unlock_bh(&serv->sv_lock); + sock_release(svsk->sk_sock); + kfree(svsk); + } else { + spin_unlock_bh(&serv->sv_lock); + dprintk(KERN_NOTICE "svc: server socket destroy delayed\n"); + /* svsk->sk_server = NULL; */ + } +} + +/* + * Make a socket for nfsd and lockd + */ +int +svc_makesock(struct svc_serv *serv, int protocol, unsigned short port) +{ + struct sockaddr_in sin; + + dprintk("svc: creating socket proto = %d\n", protocol); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = INADDR_ANY; + sin.sin_port = htons(port); + return svc_create_socket(serv, protocol, &sin); +} + +/* + * Handle defer and revisit of requests + */ + +static void svc_revisit(struct cache_deferred_req *dreq, int too_many) +{ + struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); + struct svc_serv *serv = dreq->owner; + struct svc_sock *svsk; + + if (too_many) { + svc_sock_put(dr->svsk); + kfree(dr); + return; + } + dprintk("revisit queued\n"); + svsk = dr->svsk; + dr->svsk = NULL; + spin_lock_bh(&serv->sv_lock); + list_add(&dr->handle.recent, &svsk->sk_deferred); + spin_unlock_bh(&serv->sv_lock); + set_bit(SK_DEFERRED, &svsk->sk_flags); + svc_sock_enqueue(svsk); + svc_sock_put(svsk); +} + +static struct cache_deferred_req * +svc_defer(struct cache_req *req) +{ + struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); + int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len); + struct svc_deferred_req *dr; + + if (rqstp->rq_arg.page_len) + return NULL; /* if more than a page, give up FIXME */ + if (rqstp->rq_deferred) { + dr = rqstp->rq_deferred; + rqstp->rq_deferred = NULL; + } else { + int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; + /* FIXME maybe discard if size too large */ + dr = kmalloc(size, GFP_KERNEL); + if (dr == NULL) + return NULL; + + dr->handle.owner = rqstp->rq_server; + dr->prot = rqstp->rq_prot; + dr->addr = rqstp->rq_addr; + dr->argslen = rqstp->rq_arg.len >> 2; + memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2); + } + spin_lock_bh(&rqstp->rq_server->sv_lock); + rqstp->rq_sock->sk_inuse++; + dr->svsk = rqstp->rq_sock; + spin_unlock_bh(&rqstp->rq_server->sv_lock); + + dr->handle.revisit = svc_revisit; + return &dr->handle; +} + +/* + * recv data from a deferred request into an active one + */ +static int svc_deferred_recv(struct svc_rqst *rqstp) +{ + struct svc_deferred_req *dr = rqstp->rq_deferred; + + rqstp->rq_arg.head[0].iov_base = dr->args; + rqstp->rq_arg.head[0].iov_len = dr->argslen<<2; + rqstp->rq_arg.page_len = 0; + rqstp->rq_arg.len = dr->argslen<<2; + rqstp->rq_prot = dr->prot; + rqstp->rq_addr = dr->addr; + return dr->argslen<<2; +} + + +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) +{ + struct svc_deferred_req *dr = NULL; + struct svc_serv *serv = svsk->sk_server; + + if (!test_bit(SK_DEFERRED, &svsk->sk_flags)) + return NULL; + spin_lock_bh(&serv->sv_lock); + clear_bit(SK_DEFERRED, &svsk->sk_flags); + if (!list_empty(&svsk->sk_deferred)) { + dr = list_entry(svsk->sk_deferred.next, + struct svc_deferred_req, + handle.recent); + list_del_init(&dr->handle.recent); + set_bit(SK_DEFERRED, &svsk->sk_flags); + } + spin_unlock_bh(&serv->sv_lock); + return dr; +} diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c new file mode 100644 index 000000000000..1b9616a12e24 --- /dev/null +++ b/net/sunrpc/sysctl.c @@ -0,0 +1,193 @@ +/* + * linux/net/sunrpc/sysctl.c + * + * Sysctl interface to sunrpc module. + * + * I would prefer to register the sunrpc table below sys/net, but that's + * impossible at the moment. + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/linkage.h> +#include <linux/ctype.h> +#include <linux/fs.h> +#include <linux/sysctl.h> +#include <linux/module.h> + +#include <asm/uaccess.h> +#include <linux/sunrpc/types.h> +#include <linux/sunrpc/sched.h> +#include <linux/sunrpc/stats.h> +#include <linux/sunrpc/xprt.h> + +/* + * Declare the debug flags here + */ +unsigned int rpc_debug; +unsigned int nfs_debug; +unsigned int nfsd_debug; +unsigned int nlm_debug; + +#ifdef RPC_DEBUG + +static struct ctl_table_header *sunrpc_table_header; +static ctl_table sunrpc_table[]; + +void +rpc_register_sysctl(void) +{ + if (!sunrpc_table_header) { + sunrpc_table_header = register_sysctl_table(sunrpc_table, 1); +#ifdef CONFIG_PROC_FS + if (sunrpc_table[0].de) + sunrpc_table[0].de->owner = THIS_MODULE; +#endif + } + +} + +void +rpc_unregister_sysctl(void) +{ + if (sunrpc_table_header) { + unregister_sysctl_table(sunrpc_table_header); + sunrpc_table_header = NULL; + } +} + +static int +proc_dodebug(ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char tmpbuf[20], c, *s; + char __user *p; + unsigned int value; + size_t left, len; + + if ((*ppos && !write) || !*lenp) { + *lenp = 0; + return 0; + } + + left = *lenp; + + if (write) { + if (!access_ok(VERIFY_READ, buffer, left)) + return -EFAULT; + p = buffer; + while (left && __get_user(c, p) >= 0 && isspace(c)) + left--, p++; + if (!left) + goto done; + + if (left > sizeof(tmpbuf) - 1) + return -EINVAL; + if (copy_from_user(tmpbuf, p, left)) + return -EFAULT; + tmpbuf[left] = '\0'; + + for (s = tmpbuf, value = 0; '0' <= *s && *s <= '9'; s++, left--) + value = 10 * value + (*s - '0'); + if (*s && !isspace(*s)) + return -EINVAL; + while (left && isspace(*s)) + left--, s++; + *(unsigned int *) table->data = value; + /* Display the RPC tasks on writing to rpc_debug */ + if (table->ctl_name == CTL_RPCDEBUG) { + rpc_show_tasks(); + } + } else { + if (!access_ok(VERIFY_WRITE, buffer, left)) + return -EFAULT; + len = sprintf(tmpbuf, "%d", *(unsigned int *) table->data); + if (len > left) + len = left; + if (__copy_to_user(buffer, tmpbuf, len)) + return -EFAULT; + if ((left -= len) > 0) { + if (put_user('\n', (char __user *)buffer + len)) + return -EFAULT; + left--; + } + } + +done: + *lenp -= left; + *ppos += *lenp; + return 0; +} + +static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE; +static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; + +static ctl_table debug_table[] = { + { + .ctl_name = CTL_RPCDEBUG, + .procname = "rpc_debug", + .data = &rpc_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dodebug + }, + { + .ctl_name = CTL_NFSDEBUG, + .procname = "nfs_debug", + .data = &nfs_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dodebug + }, + { + .ctl_name = CTL_NFSDDEBUG, + .procname = "nfsd_debug", + .data = &nfsd_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dodebug + }, + { + .ctl_name = CTL_NLMDEBUG, + .procname = "nlm_debug", + .data = &nlm_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dodebug + }, + { + .ctl_name = CTL_SLOTTABLE_UDP, + .procname = "udp_slot_table_entries", + .data = &xprt_udp_slot_table_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_slot_table_size, + .extra2 = &max_slot_table_size + }, + { + .ctl_name = CTL_SLOTTABLE_TCP, + .procname = "tcp_slot_table_entries", + .data = &xprt_tcp_slot_table_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_slot_table_size, + .extra2 = &max_slot_table_size + }, + { .ctl_name = 0 } +}; + +static ctl_table sunrpc_table[] = { + { + .ctl_name = CTL_SUNRPC, + .procname = "sunrpc", + .mode = 0555, + .child = debug_table + }, + { .ctl_name = 0 } +}; + +#endif diff --git a/net/sunrpc/timer.c b/net/sunrpc/timer.c new file mode 100644 index 000000000000..bcbdf6430d5c --- /dev/null +++ b/net/sunrpc/timer.c @@ -0,0 +1,107 @@ +/* + * linux/net/sunrpc/timer.c + * + * Estimate RPC request round trip time. + * + * Based on packet round-trip and variance estimator algorithms described + * in appendix A of "Congestion Avoidance and Control" by Van Jacobson + * and Michael J. Karels (ACM Computer Communication Review; Proceedings + * of the Sigcomm '88 Symposium in Stanford, CA, August, 1988). + * + * This RTT estimator is used only for RPC over datagram protocols. + * + * Copyright (C) 2002 Trond Myklebust <trond.myklebust@fys.uio.no> + */ + +#include <asm/param.h> + +#include <linux/types.h> +#include <linux/unistd.h> + +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/xprt.h> +#include <linux/sunrpc/timer.h> + +#define RPC_RTO_MAX (60*HZ) +#define RPC_RTO_INIT (HZ/5) +#define RPC_RTO_MIN (HZ/10) + +void +rpc_init_rtt(struct rpc_rtt *rt, unsigned long timeo) +{ + unsigned long init = 0; + unsigned i; + + rt->timeo = timeo; + + if (timeo > RPC_RTO_INIT) + init = (timeo - RPC_RTO_INIT) << 3; + for (i = 0; i < 5; i++) { + rt->srtt[i] = init; + rt->sdrtt[i] = RPC_RTO_INIT; + rt->ntimeouts[i] = 0; + } +} + +/* + * NB: When computing the smoothed RTT and standard deviation, + * be careful not to produce negative intermediate results. + */ +void +rpc_update_rtt(struct rpc_rtt *rt, unsigned timer, long m) +{ + long *srtt, *sdrtt; + + if (timer-- == 0) + return; + + /* jiffies wrapped; ignore this one */ + if (m < 0) + return; + + if (m == 0) + m = 1L; + + srtt = (long *)&rt->srtt[timer]; + m -= *srtt >> 3; + *srtt += m; + + if (m < 0) + m = -m; + + sdrtt = (long *)&rt->sdrtt[timer]; + m -= *sdrtt >> 2; + *sdrtt += m; + + /* Set lower bound on the variance */ + if (*sdrtt < RPC_RTO_MIN) + *sdrtt = RPC_RTO_MIN; +} + +/* + * Estimate rto for an nfs rpc sent via. an unreliable datagram. + * Use the mean and mean deviation of rtt for the appropriate type of rpc + * for the frequent rpcs and a default for the others. + * The justification for doing "other" this way is that these rpcs + * happen so infrequently that timer est. would probably be stale. + * Also, since many of these rpcs are + * non-idempotent, a conservative timeout is desired. + * getattr, lookup, + * read, write, commit - A+4D + * other - timeo + */ + +unsigned long +rpc_calc_rto(struct rpc_rtt *rt, unsigned timer) +{ + unsigned long res; + + if (timer-- == 0) + return rt->timeo; + + res = ((rt->srtt[timer] + 7) >> 3) + rt->sdrtt[timer]; + if (res > RPC_RTO_MAX) + res = RPC_RTO_MAX; + + return res; +} diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c new file mode 100644 index 000000000000..4484931018eb --- /dev/null +++ b/net/sunrpc/xdr.c @@ -0,0 +1,917 @@ +/* + * linux/net/sunrpc/xdr.c + * + * Generic XDR support. + * + * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/pagemap.h> +#include <linux/errno.h> +#include <linux/in.h> +#include <linux/net.h> +#include <net/sock.h> +#include <linux/sunrpc/xdr.h> +#include <linux/sunrpc/msg_prot.h> + +/* + * XDR functions for basic NFS types + */ +u32 * +xdr_encode_netobj(u32 *p, const struct xdr_netobj *obj) +{ + unsigned int quadlen = XDR_QUADLEN(obj->len); + + p[quadlen] = 0; /* zero trailing bytes */ + *p++ = htonl(obj->len); + memcpy(p, obj->data, obj->len); + return p + XDR_QUADLEN(obj->len); +} + +u32 * +xdr_decode_netobj(u32 *p, struct xdr_netobj *obj) +{ + unsigned int len; + + if ((len = ntohl(*p++)) > XDR_MAX_NETOBJ) + return NULL; + obj->len = len; + obj->data = (u8 *) p; + return p + XDR_QUADLEN(len); +} + +/** + * xdr_encode_opaque_fixed - Encode fixed length opaque data + * @p - pointer to current position in XDR buffer. + * @ptr - pointer to data to encode (or NULL) + * @nbytes - size of data. + * + * Copy the array of data of length nbytes at ptr to the XDR buffer + * at position p, then align to the next 32-bit boundary by padding + * with zero bytes (see RFC1832). + * Note: if ptr is NULL, only the padding is performed. + * + * Returns the updated current XDR buffer position + * + */ +u32 *xdr_encode_opaque_fixed(u32 *p, const void *ptr, unsigned int nbytes) +{ + if (likely(nbytes != 0)) { + unsigned int quadlen = XDR_QUADLEN(nbytes); + unsigned int padding = (quadlen << 2) - nbytes; + + if (ptr != NULL) + memcpy(p, ptr, nbytes); + if (padding != 0) + memset((char *)p + nbytes, 0, padding); + p += quadlen; + } + return p; +} +EXPORT_SYMBOL(xdr_encode_opaque_fixed); + +/** + * xdr_encode_opaque - Encode variable length opaque data + * @p - pointer to current position in XDR buffer. + * @ptr - pointer to data to encode (or NULL) + * @nbytes - size of data. + * + * Returns the updated current XDR buffer position + */ +u32 *xdr_encode_opaque(u32 *p, const void *ptr, unsigned int nbytes) +{ + *p++ = htonl(nbytes); + return xdr_encode_opaque_fixed(p, ptr, nbytes); +} +EXPORT_SYMBOL(xdr_encode_opaque); + +u32 * +xdr_encode_string(u32 *p, const char *string) +{ + return xdr_encode_array(p, string, strlen(string)); +} + +u32 * +xdr_decode_string(u32 *p, char **sp, int *lenp, int maxlen) +{ + unsigned int len; + char *string; + + if ((len = ntohl(*p++)) > maxlen) + return NULL; + if (lenp) + *lenp = len; + if ((len % 4) != 0) { + string = (char *) p; + } else { + string = (char *) (p - 1); + memmove(string, p, len); + } + string[len] = '\0'; + *sp = string; + return p + XDR_QUADLEN(len); +} + +u32 * +xdr_decode_string_inplace(u32 *p, char **sp, int *lenp, int maxlen) +{ + unsigned int len; + + if ((len = ntohl(*p++)) > maxlen) + return NULL; + *lenp = len; + *sp = (char *) p; + return p + XDR_QUADLEN(len); +} + +void +xdr_encode_pages(struct xdr_buf *xdr, struct page **pages, unsigned int base, + unsigned int len) +{ + struct kvec *tail = xdr->tail; + u32 *p; + + xdr->pages = pages; + xdr->page_base = base; + xdr->page_len = len; + + p = (u32 *)xdr->head[0].iov_base + XDR_QUADLEN(xdr->head[0].iov_len); + tail->iov_base = p; + tail->iov_len = 0; + + if (len & 3) { + unsigned int pad = 4 - (len & 3); + + *p = 0; + tail->iov_base = (char *)p + (len & 3); + tail->iov_len = pad; + len += pad; + } + xdr->buflen += len; + xdr->len += len; +} + +void +xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, + struct page **pages, unsigned int base, unsigned int len) +{ + struct kvec *head = xdr->head; + struct kvec *tail = xdr->tail; + char *buf = (char *)head->iov_base; + unsigned int buflen = head->iov_len; + + head->iov_len = offset; + + xdr->pages = pages; + xdr->page_base = base; + xdr->page_len = len; + + tail->iov_base = buf + offset; + tail->iov_len = buflen - offset; + + xdr->buflen += len; +} + +void +xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, + skb_reader_t *desc, + skb_read_actor_t copy_actor) +{ + struct page **ppage = xdr->pages; + unsigned int len, pglen = xdr->page_len; + int ret; + + len = xdr->head[0].iov_len; + if (base < len) { + len -= base; + ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len); + if (ret != len || !desc->count) + return; + base = 0; + } else + base -= len; + + if (pglen == 0) + goto copy_tail; + if (base >= pglen) { + base -= pglen; + goto copy_tail; + } + if (base || xdr->page_base) { + pglen -= base; + base += xdr->page_base; + ppage += base >> PAGE_CACHE_SHIFT; + base &= ~PAGE_CACHE_MASK; + } + do { + char *kaddr; + + len = PAGE_CACHE_SIZE; + kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA); + if (base) { + len -= base; + if (pglen < len) + len = pglen; + ret = copy_actor(desc, kaddr + base, len); + base = 0; + } else { + if (pglen < len) + len = pglen; + ret = copy_actor(desc, kaddr, len); + } + flush_dcache_page(*ppage); + kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA); + if (ret != len || !desc->count) + return; + ppage++; + } while ((pglen -= len) != 0); +copy_tail: + len = xdr->tail[0].iov_len; + if (base < len) + copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base); +} + + +int +xdr_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, + struct xdr_buf *xdr, unsigned int base, int msgflags) +{ + struct page **ppage = xdr->pages; + unsigned int len, pglen = xdr->page_len; + int err, ret = 0; + ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int); + + len = xdr->head[0].iov_len; + if (base < len || (addr != NULL && base == 0)) { + struct kvec iov = { + .iov_base = xdr->head[0].iov_base + base, + .iov_len = len - base, + }; + struct msghdr msg = { + .msg_name = addr, + .msg_namelen = addrlen, + .msg_flags = msgflags, + }; + if (xdr->len > len) + msg.msg_flags |= MSG_MORE; + + if (iov.iov_len != 0) + err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); + else + err = kernel_sendmsg(sock, &msg, NULL, 0, 0); + if (ret == 0) + ret = err; + else if (err > 0) + ret += err; + if (err != iov.iov_len) + goto out; + base = 0; + } else + base -= len; + + if (pglen == 0) + goto copy_tail; + if (base >= pglen) { + base -= pglen; + goto copy_tail; + } + if (base || xdr->page_base) { + pglen -= base; + base += xdr->page_base; + ppage += base >> PAGE_CACHE_SHIFT; + base &= ~PAGE_CACHE_MASK; + } + + sendpage = sock->ops->sendpage ? : sock_no_sendpage; + do { + int flags = msgflags; + + len = PAGE_CACHE_SIZE; + if (base) + len -= base; + if (pglen < len) + len = pglen; + + if (pglen != len || xdr->tail[0].iov_len != 0) + flags |= MSG_MORE; + + /* Hmm... We might be dealing with highmem pages */ + if (PageHighMem(*ppage)) + sendpage = sock_no_sendpage; + err = sendpage(sock, *ppage, base, len, flags); + if (ret == 0) + ret = err; + else if (err > 0) + ret += err; + if (err != len) + goto out; + base = 0; + ppage++; + } while ((pglen -= len) != 0); +copy_tail: + len = xdr->tail[0].iov_len; + if (base < len) { + struct kvec iov = { + .iov_base = xdr->tail[0].iov_base + base, + .iov_len = len - base, + }; + struct msghdr msg = { + .msg_flags = msgflags, + }; + err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); + if (ret == 0) + ret = err; + else if (err > 0) + ret += err; + } +out: + return ret; +} + + +/* + * Helper routines for doing 'memmove' like operations on a struct xdr_buf + * + * _shift_data_right_pages + * @pages: vector of pages containing both the source and dest memory area. + * @pgto_base: page vector address of destination + * @pgfrom_base: page vector address of source + * @len: number of bytes to copy + * + * Note: the addresses pgto_base and pgfrom_base are both calculated in + * the same way: + * if a memory area starts at byte 'base' in page 'pages[i]', + * then its address is given as (i << PAGE_CACHE_SHIFT) + base + * Also note: pgfrom_base must be < pgto_base, but the memory areas + * they point to may overlap. + */ +static void +_shift_data_right_pages(struct page **pages, size_t pgto_base, + size_t pgfrom_base, size_t len) +{ + struct page **pgfrom, **pgto; + char *vfrom, *vto; + size_t copy; + + BUG_ON(pgto_base <= pgfrom_base); + + pgto_base += len; + pgfrom_base += len; + + pgto = pages + (pgto_base >> PAGE_CACHE_SHIFT); + pgfrom = pages + (pgfrom_base >> PAGE_CACHE_SHIFT); + + pgto_base &= ~PAGE_CACHE_MASK; + pgfrom_base &= ~PAGE_CACHE_MASK; + + do { + /* Are any pointers crossing a page boundary? */ + if (pgto_base == 0) { + flush_dcache_page(*pgto); + pgto_base = PAGE_CACHE_SIZE; + pgto--; + } + if (pgfrom_base == 0) { + pgfrom_base = PAGE_CACHE_SIZE; + pgfrom--; + } + + copy = len; + if (copy > pgto_base) + copy = pgto_base; + if (copy > pgfrom_base) + copy = pgfrom_base; + pgto_base -= copy; + pgfrom_base -= copy; + + vto = kmap_atomic(*pgto, KM_USER0); + vfrom = kmap_atomic(*pgfrom, KM_USER1); + memmove(vto + pgto_base, vfrom + pgfrom_base, copy); + kunmap_atomic(vfrom, KM_USER1); + kunmap_atomic(vto, KM_USER0); + + } while ((len -= copy) != 0); + flush_dcache_page(*pgto); +} + +/* + * _copy_to_pages + * @pages: array of pages + * @pgbase: page vector address of destination + * @p: pointer to source data + * @len: length + * + * Copies data from an arbitrary memory location into an array of pages + * The copy is assumed to be non-overlapping. + */ +static void +_copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len) +{ + struct page **pgto; + char *vto; + size_t copy; + + pgto = pages + (pgbase >> PAGE_CACHE_SHIFT); + pgbase &= ~PAGE_CACHE_MASK; + + do { + copy = PAGE_CACHE_SIZE - pgbase; + if (copy > len) + copy = len; + + vto = kmap_atomic(*pgto, KM_USER0); + memcpy(vto + pgbase, p, copy); + kunmap_atomic(vto, KM_USER0); + + pgbase += copy; + if (pgbase == PAGE_CACHE_SIZE) { + flush_dcache_page(*pgto); + pgbase = 0; + pgto++; + } + p += copy; + + } while ((len -= copy) != 0); + flush_dcache_page(*pgto); +} + +/* + * _copy_from_pages + * @p: pointer to destination + * @pages: array of pages + * @pgbase: offset of source data + * @len: length + * + * Copies data into an arbitrary memory location from an array of pages + * The copy is assumed to be non-overlapping. + */ +static void +_copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) +{ + struct page **pgfrom; + char *vfrom; + size_t copy; + + pgfrom = pages + (pgbase >> PAGE_CACHE_SHIFT); + pgbase &= ~PAGE_CACHE_MASK; + + do { + copy = PAGE_CACHE_SIZE - pgbase; + if (copy > len) + copy = len; + + vfrom = kmap_atomic(*pgfrom, KM_USER0); + memcpy(p, vfrom + pgbase, copy); + kunmap_atomic(vfrom, KM_USER0); + + pgbase += copy; + if (pgbase == PAGE_CACHE_SIZE) { + pgbase = 0; + pgfrom++; + } + p += copy; + + } while ((len -= copy) != 0); +} + +/* + * xdr_shrink_bufhead + * @buf: xdr_buf + * @len: bytes to remove from buf->head[0] + * + * Shrinks XDR buffer's header kvec buf->head[0] by + * 'len' bytes. The extra data is not lost, but is instead + * moved into the inlined pages and/or the tail. + */ +static void +xdr_shrink_bufhead(struct xdr_buf *buf, size_t len) +{ + struct kvec *head, *tail; + size_t copy, offs; + unsigned int pglen = buf->page_len; + + tail = buf->tail; + head = buf->head; + BUG_ON (len > head->iov_len); + + /* Shift the tail first */ + if (tail->iov_len != 0) { + if (tail->iov_len > len) { + copy = tail->iov_len - len; + memmove((char *)tail->iov_base + len, + tail->iov_base, copy); + } + /* Copy from the inlined pages into the tail */ + copy = len; + if (copy > pglen) + copy = pglen; + offs = len - copy; + if (offs >= tail->iov_len) + copy = 0; + else if (copy > tail->iov_len - offs) + copy = tail->iov_len - offs; + if (copy != 0) + _copy_from_pages((char *)tail->iov_base + offs, + buf->pages, + buf->page_base + pglen + offs - len, + copy); + /* Do we also need to copy data from the head into the tail ? */ + if (len > pglen) { + offs = copy = len - pglen; + if (copy > tail->iov_len) + copy = tail->iov_len; + memcpy(tail->iov_base, + (char *)head->iov_base + + head->iov_len - offs, + copy); + } + } + /* Now handle pages */ + if (pglen != 0) { + if (pglen > len) + _shift_data_right_pages(buf->pages, + buf->page_base + len, + buf->page_base, + pglen - len); + copy = len; + if (len > pglen) + copy = pglen; + _copy_to_pages(buf->pages, buf->page_base, + (char *)head->iov_base + head->iov_len - len, + copy); + } + head->iov_len -= len; + buf->buflen -= len; + /* Have we truncated the message? */ + if (buf->len > buf->buflen) + buf->len = buf->buflen; +} + +/* + * xdr_shrink_pagelen + * @buf: xdr_buf + * @len: bytes to remove from buf->pages + * + * Shrinks XDR buffer's page array buf->pages by + * 'len' bytes. The extra data is not lost, but is instead + * moved into the tail. + */ +static void +xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) +{ + struct kvec *tail; + size_t copy; + char *p; + unsigned int pglen = buf->page_len; + + tail = buf->tail; + BUG_ON (len > pglen); + + /* Shift the tail first */ + if (tail->iov_len != 0) { + p = (char *)tail->iov_base + len; + if (tail->iov_len > len) { + copy = tail->iov_len - len; + memmove(p, tail->iov_base, copy); + } else + buf->buflen -= len; + /* Copy from the inlined pages into the tail */ + copy = len; + if (copy > tail->iov_len) + copy = tail->iov_len; + _copy_from_pages((char *)tail->iov_base, + buf->pages, buf->page_base + pglen - len, + copy); + } + buf->page_len -= len; + buf->buflen -= len; + /* Have we truncated the message? */ + if (buf->len > buf->buflen) + buf->len = buf->buflen; +} + +void +xdr_shift_buf(struct xdr_buf *buf, size_t len) +{ + xdr_shrink_bufhead(buf, len); +} + +/** + * xdr_init_encode - Initialize a struct xdr_stream for sending data. + * @xdr: pointer to xdr_stream struct + * @buf: pointer to XDR buffer in which to encode data + * @p: current pointer inside XDR buffer + * + * Note: at the moment the RPC client only passes the length of our + * scratch buffer in the xdr_buf's header kvec. Previously this + * meant we needed to call xdr_adjust_iovec() after encoding the + * data. With the new scheme, the xdr_stream manages the details + * of the buffer length, and takes care of adjusting the kvec + * length for us. + */ +void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p) +{ + struct kvec *iov = buf->head; + + xdr->buf = buf; + xdr->iov = iov; + xdr->end = (uint32_t *)((char *)iov->iov_base + iov->iov_len); + buf->len = iov->iov_len = (char *)p - (char *)iov->iov_base; + xdr->p = p; +} +EXPORT_SYMBOL(xdr_init_encode); + +/** + * xdr_reserve_space - Reserve buffer space for sending + * @xdr: pointer to xdr_stream + * @nbytes: number of bytes to reserve + * + * Checks that we have enough buffer space to encode 'nbytes' more + * bytes of data. If so, update the total xdr_buf length, and + * adjust the length of the current kvec. + */ +uint32_t * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) +{ + uint32_t *p = xdr->p; + uint32_t *q; + + /* align nbytes on the next 32-bit boundary */ + nbytes += 3; + nbytes &= ~3; + q = p + (nbytes >> 2); + if (unlikely(q > xdr->end || q < p)) + return NULL; + xdr->p = q; + xdr->iov->iov_len += nbytes; + xdr->buf->len += nbytes; + return p; +} +EXPORT_SYMBOL(xdr_reserve_space); + +/** + * xdr_write_pages - Insert a list of pages into an XDR buffer for sending + * @xdr: pointer to xdr_stream + * @pages: list of pages + * @base: offset of first byte + * @len: length of data in bytes + * + */ +void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, + unsigned int len) +{ + struct xdr_buf *buf = xdr->buf; + struct kvec *iov = buf->tail; + buf->pages = pages; + buf->page_base = base; + buf->page_len = len; + + iov->iov_base = (char *)xdr->p; + iov->iov_len = 0; + xdr->iov = iov; + + if (len & 3) { + unsigned int pad = 4 - (len & 3); + + BUG_ON(xdr->p >= xdr->end); + iov->iov_base = (char *)xdr->p + (len & 3); + iov->iov_len += pad; + len += pad; + *xdr->p++ = 0; + } + buf->buflen += len; + buf->len += len; +} +EXPORT_SYMBOL(xdr_write_pages); + +/** + * xdr_init_decode - Initialize an xdr_stream for decoding data. + * @xdr: pointer to xdr_stream struct + * @buf: pointer to XDR buffer from which to decode data + * @p: current pointer inside XDR buffer + */ +void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p) +{ + struct kvec *iov = buf->head; + unsigned int len = iov->iov_len; + + if (len > buf->len) + len = buf->len; + xdr->buf = buf; + xdr->iov = iov; + xdr->p = p; + xdr->end = (uint32_t *)((char *)iov->iov_base + len); +} +EXPORT_SYMBOL(xdr_init_decode); + +/** + * xdr_inline_decode - Retrieve non-page XDR data to decode + * @xdr: pointer to xdr_stream struct + * @nbytes: number of bytes of data to decode + * + * Check if the input buffer is long enough to enable us to decode + * 'nbytes' more bytes of data starting at the current position. + * If so return the current pointer, then update the current + * pointer position. + */ +uint32_t * xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) +{ + uint32_t *p = xdr->p; + uint32_t *q = p + XDR_QUADLEN(nbytes); + + if (unlikely(q > xdr->end || q < p)) + return NULL; + xdr->p = q; + return p; +} +EXPORT_SYMBOL(xdr_inline_decode); + +/** + * xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position + * @xdr: pointer to xdr_stream struct + * @len: number of bytes of page data + * + * Moves data beyond the current pointer position from the XDR head[] buffer + * into the page list. Any data that lies beyond current position + "len" + * bytes is moved into the XDR tail[]. The current pointer is then + * repositioned at the beginning of the XDR tail. + */ +void xdr_read_pages(struct xdr_stream *xdr, unsigned int len) +{ + struct xdr_buf *buf = xdr->buf; + struct kvec *iov; + ssize_t shift; + unsigned int end; + int padding; + + /* Realign pages to current pointer position */ + iov = buf->head; + shift = iov->iov_len + (char *)iov->iov_base - (char *)xdr->p; + if (shift > 0) + xdr_shrink_bufhead(buf, shift); + + /* Truncate page data and move it into the tail */ + if (buf->page_len > len) + xdr_shrink_pagelen(buf, buf->page_len - len); + padding = (XDR_QUADLEN(len) << 2) - len; + xdr->iov = iov = buf->tail; + /* Compute remaining message length. */ + end = iov->iov_len; + shift = buf->buflen - buf->len; + if (shift < end) + end -= shift; + else if (shift > 0) + end = 0; + /* + * Position current pointer at beginning of tail, and + * set remaining message length. + */ + xdr->p = (uint32_t *)((char *)iov->iov_base + padding); + xdr->end = (uint32_t *)((char *)iov->iov_base + end); +} +EXPORT_SYMBOL(xdr_read_pages); + +static struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0}; + +void +xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf) +{ + buf->head[0] = *iov; + buf->tail[0] = empty_iov; + buf->page_len = 0; + buf->buflen = buf->len = iov->iov_len; +} + +/* Sets subiov to the intersection of iov with the buffer of length len + * starting base bytes after iov. Indicates empty intersection by setting + * length of subiov to zero. Decrements len by length of subiov, sets base + * to zero (or decrements it by length of iov if subiov is empty). */ +static void +iov_subsegment(struct kvec *iov, struct kvec *subiov, int *base, int *len) +{ + if (*base > iov->iov_len) { + subiov->iov_base = NULL; + subiov->iov_len = 0; + *base -= iov->iov_len; + } else { + subiov->iov_base = iov->iov_base + *base; + subiov->iov_len = min(*len, (int)iov->iov_len - *base); + *base = 0; + } + *len -= subiov->iov_len; +} + +/* Sets subbuf to the portion of buf of length len beginning base bytes + * from the start of buf. Returns -1 if base of length are out of bounds. */ +int +xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, + int base, int len) +{ + int i; + + subbuf->buflen = subbuf->len = len; + iov_subsegment(buf->head, subbuf->head, &base, &len); + + if (base < buf->page_len) { + i = (base + buf->page_base) >> PAGE_CACHE_SHIFT; + subbuf->pages = &buf->pages[i]; + subbuf->page_base = (base + buf->page_base) & ~PAGE_CACHE_MASK; + subbuf->page_len = min((int)buf->page_len - base, len); + len -= subbuf->page_len; + base = 0; + } else { + base -= buf->page_len; + subbuf->page_len = 0; + } + + iov_subsegment(buf->tail, subbuf->tail, &base, &len); + if (base || len) + return -1; + return 0; +} + +/* obj is assumed to point to allocated memory of size at least len: */ +int +read_bytes_from_xdr_buf(struct xdr_buf *buf, int base, void *obj, int len) +{ + struct xdr_buf subbuf; + int this_len; + int status; + + status = xdr_buf_subsegment(buf, &subbuf, base, len); + if (status) + goto out; + this_len = min(len, (int)subbuf.head[0].iov_len); + memcpy(obj, subbuf.head[0].iov_base, this_len); + len -= this_len; + obj += this_len; + this_len = min(len, (int)subbuf.page_len); + if (this_len) + _copy_from_pages(obj, subbuf.pages, subbuf.page_base, this_len); + len -= this_len; + obj += this_len; + this_len = min(len, (int)subbuf.tail[0].iov_len); + memcpy(obj, subbuf.tail[0].iov_base, this_len); +out: + return status; +} + +static int +read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj) +{ + u32 raw; + int status; + + status = read_bytes_from_xdr_buf(buf, base, &raw, sizeof(*obj)); + if (status) + return status; + *obj = ntohl(raw); + return 0; +} + +/* If the netobj starting offset bytes from the start of xdr_buf is contained + * entirely in the head or the tail, set object to point to it; otherwise + * try to find space for it at the end of the tail, copy it there, and + * set obj to point to it. */ +int +xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, int offset) +{ + u32 tail_offset = buf->head[0].iov_len + buf->page_len; + u32 obj_end_offset; + + if (read_u32_from_xdr_buf(buf, offset, &obj->len)) + goto out; + obj_end_offset = offset + 4 + obj->len; + + if (obj_end_offset <= buf->head[0].iov_len) { + /* The obj is contained entirely in the head: */ + obj->data = buf->head[0].iov_base + offset + 4; + } else if (offset + 4 >= tail_offset) { + if (obj_end_offset - tail_offset + > buf->tail[0].iov_len) + goto out; + /* The obj is contained entirely in the tail: */ + obj->data = buf->tail[0].iov_base + + offset - tail_offset + 4; + } else { + /* use end of tail as storage for obj: + * (We don't copy to the beginning because then we'd have + * to worry about doing a potentially overlapping copy. + * This assumes the object is at most half the length of the + * tail.) */ + if (obj->len > buf->tail[0].iov_len) + goto out; + obj->data = buf->tail[0].iov_base + buf->tail[0].iov_len - + obj->len; + if (read_bytes_from_xdr_buf(buf, offset + 4, + obj->data, obj->len)) + goto out; + + } + return 0; +out: + return -1; +} diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c new file mode 100644 index 000000000000..c74a6bb94074 --- /dev/null +++ b/net/sunrpc/xprt.c @@ -0,0 +1,1678 @@ +/* + * linux/net/sunrpc/xprt.c + * + * This is a generic RPC call interface supporting congestion avoidance, + * and asynchronous calls. + * + * The interface works like this: + * + * - When a process places a call, it allocates a request slot if + * one is available. Otherwise, it sleeps on the backlog queue + * (xprt_reserve). + * - Next, the caller puts together the RPC message, stuffs it into + * the request struct, and calls xprt_call(). + * - xprt_call transmits the message and installs the caller on the + * socket's wait list. At the same time, it installs a timer that + * is run after the packet's timeout has expired. + * - When a packet arrives, the data_ready handler walks the list of + * pending requests for that socket. If a matching XID is found, the + * caller is woken up, and the timer removed. + * - When no reply arrives within the timeout interval, the timer is + * fired by the kernel and runs xprt_timer(). It either adjusts the + * timeout values (minor timeout) or wakes up the caller with a status + * of -ETIMEDOUT. + * - When the caller receives a notification from RPC that a reply arrived, + * it should release the RPC slot, and process the reply. + * If the call timed out, it may choose to retry the operation by + * adjusting the initial timeout value, and simply calling rpc_call + * again. + * + * Support for async RPC is done through a set of RPC-specific scheduling + * primitives that `transparently' work for processes as well as async + * tasks that rely on callbacks. + * + * Copyright (C) 1995-1997, Olaf Kirch <okir@monad.swb.de> + * + * TCP callback races fixes (C) 1998 Red Hat Software <alan@redhat.com> + * TCP send fixes (C) 1998 Red Hat Software <alan@redhat.com> + * TCP NFS related read + write fixes + * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie> + * + * Rewrite of larges part of the code in order to stabilize TCP stuff. + * Fix behaviour when socket buffer is full. + * (C) 1999 Trond Myklebust <trond.myklebust@fys.uio.no> + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/capability.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/net.h> +#include <linux/mm.h> +#include <linux/udp.h> +#include <linux/tcp.h> +#include <linux/sunrpc/clnt.h> +#include <linux/file.h> +#include <linux/workqueue.h> +#include <linux/random.h> + +#include <net/sock.h> +#include <net/checksum.h> +#include <net/udp.h> +#include <net/tcp.h> + +/* + * Local variables + */ + +#ifdef RPC_DEBUG +# undef RPC_DEBUG_DATA +# define RPCDBG_FACILITY RPCDBG_XPRT +#endif + +#define XPRT_MAX_BACKOFF (8) +#define XPRT_IDLE_TIMEOUT (5*60*HZ) +#define XPRT_MAX_RESVPORT (800) + +/* + * Local functions + */ +static void xprt_request_init(struct rpc_task *, struct rpc_xprt *); +static inline void do_xprt_reserve(struct rpc_task *); +static void xprt_disconnect(struct rpc_xprt *); +static void xprt_connect_status(struct rpc_task *task); +static struct rpc_xprt * xprt_setup(int proto, struct sockaddr_in *ap, + struct rpc_timeout *to); +static struct socket *xprt_create_socket(struct rpc_xprt *, int, int); +static void xprt_bind_socket(struct rpc_xprt *, struct socket *); +static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); + +static int xprt_clear_backlog(struct rpc_xprt *xprt); + +#ifdef RPC_DEBUG_DATA +/* + * Print the buffer contents (first 128 bytes only--just enough for + * diropres return). + */ +static void +xprt_pktdump(char *msg, u32 *packet, unsigned int count) +{ + u8 *buf = (u8 *) packet; + int j; + + dprintk("RPC: %s\n", msg); + for (j = 0; j < count && j < 128; j += 4) { + if (!(j & 31)) { + if (j) + dprintk("\n"); + dprintk("0x%04x ", j); + } + dprintk("%02x%02x%02x%02x ", + buf[j], buf[j+1], buf[j+2], buf[j+3]); + } + dprintk("\n"); +} +#else +static inline void +xprt_pktdump(char *msg, u32 *packet, unsigned int count) +{ + /* NOP */ +} +#endif + +/* + * Look up RPC transport given an INET socket + */ +static inline struct rpc_xprt * +xprt_from_sock(struct sock *sk) +{ + return (struct rpc_xprt *) sk->sk_user_data; +} + +/* + * Serialize write access to sockets, in order to prevent different + * requests from interfering with each other. + * Also prevents TCP socket connects from colliding with writes. + */ +static int +__xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + + if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) { + if (task == xprt->snd_task) + return 1; + if (task == NULL) + return 0; + goto out_sleep; + } + if (xprt->nocong || __xprt_get_cong(xprt, task)) { + xprt->snd_task = task; + if (req) { + req->rq_bytes_sent = 0; + req->rq_ntrans++; + } + return 1; + } + smp_mb__before_clear_bit(); + clear_bit(XPRT_LOCKED, &xprt->sockstate); + smp_mb__after_clear_bit(); +out_sleep: + dprintk("RPC: %4d failed to lock socket %p\n", task->tk_pid, xprt); + task->tk_timeout = 0; + task->tk_status = -EAGAIN; + if (req && req->rq_ntrans) + rpc_sleep_on(&xprt->resend, task, NULL, NULL); + else + rpc_sleep_on(&xprt->sending, task, NULL, NULL); + return 0; +} + +static inline int +xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) +{ + int retval; + + spin_lock_bh(&xprt->sock_lock); + retval = __xprt_lock_write(xprt, task); + spin_unlock_bh(&xprt->sock_lock); + return retval; +} + + +static void +__xprt_lock_write_next(struct rpc_xprt *xprt) +{ + struct rpc_task *task; + + if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) + return; + if (!xprt->nocong && RPCXPRT_CONGESTED(xprt)) + goto out_unlock; + task = rpc_wake_up_next(&xprt->resend); + if (!task) { + task = rpc_wake_up_next(&xprt->sending); + if (!task) + goto out_unlock; + } + if (xprt->nocong || __xprt_get_cong(xprt, task)) { + struct rpc_rqst *req = task->tk_rqstp; + xprt->snd_task = task; + if (req) { + req->rq_bytes_sent = 0; + req->rq_ntrans++; + } + return; + } +out_unlock: + smp_mb__before_clear_bit(); + clear_bit(XPRT_LOCKED, &xprt->sockstate); + smp_mb__after_clear_bit(); +} + +/* + * Releases the socket for use by other requests. + */ +static void +__xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) +{ + if (xprt->snd_task == task) { + xprt->snd_task = NULL; + smp_mb__before_clear_bit(); + clear_bit(XPRT_LOCKED, &xprt->sockstate); + smp_mb__after_clear_bit(); + __xprt_lock_write_next(xprt); + } +} + +static inline void +xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) +{ + spin_lock_bh(&xprt->sock_lock); + __xprt_release_write(xprt, task); + spin_unlock_bh(&xprt->sock_lock); +} + +/* + * Write data to socket. + */ +static inline int +xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) +{ + struct socket *sock = xprt->sock; + struct xdr_buf *xdr = &req->rq_snd_buf; + struct sockaddr *addr = NULL; + int addrlen = 0; + unsigned int skip; + int result; + + if (!sock) + return -ENOTCONN; + + xprt_pktdump("packet data:", + req->rq_svec->iov_base, + req->rq_svec->iov_len); + + /* For UDP, we need to provide an address */ + if (!xprt->stream) { + addr = (struct sockaddr *) &xprt->addr; + addrlen = sizeof(xprt->addr); + } + /* Dont repeat bytes */ + skip = req->rq_bytes_sent; + + clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); + result = xdr_sendpages(sock, addr, addrlen, xdr, skip, MSG_DONTWAIT); + + dprintk("RPC: xprt_sendmsg(%d) = %d\n", xdr->len - skip, result); + + if (result >= 0) + return result; + + switch (result) { + case -ECONNREFUSED: + /* When the server has died, an ICMP port unreachable message + * prompts ECONNREFUSED. + */ + case -EAGAIN: + break; + case -ECONNRESET: + case -ENOTCONN: + case -EPIPE: + /* connection broken */ + if (xprt->stream) + result = -ENOTCONN; + break; + default: + printk(KERN_NOTICE "RPC: sendmsg returned error %d\n", -result); + } + return result; +} + +/* + * Van Jacobson congestion avoidance. Check if the congestion window + * overflowed. Put the task to sleep if this is the case. + */ +static int +__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + + if (req->rq_cong) + return 1; + dprintk("RPC: %4d xprt_cwnd_limited cong = %ld cwnd = %ld\n", + task->tk_pid, xprt->cong, xprt->cwnd); + if (RPCXPRT_CONGESTED(xprt)) + return 0; + req->rq_cong = 1; + xprt->cong += RPC_CWNDSCALE; + return 1; +} + +/* + * Adjust the congestion window, and wake up the next task + * that has been sleeping due to congestion + */ +static void +__xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) +{ + if (!req->rq_cong) + return; + req->rq_cong = 0; + xprt->cong -= RPC_CWNDSCALE; + __xprt_lock_write_next(xprt); +} + +/* + * Adjust RPC congestion window + * We use a time-smoothed congestion estimator to avoid heavy oscillation. + */ +static void +xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) +{ + unsigned long cwnd; + + cwnd = xprt->cwnd; + if (result >= 0 && cwnd <= xprt->cong) { + /* The (cwnd >> 1) term makes sure + * the result gets rounded properly. */ + cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE + (cwnd >> 1)) / cwnd; + if (cwnd > RPC_MAXCWND(xprt)) + cwnd = RPC_MAXCWND(xprt); + __xprt_lock_write_next(xprt); + } else if (result == -ETIMEDOUT) { + cwnd >>= 1; + if (cwnd < RPC_CWNDSCALE) + cwnd = RPC_CWNDSCALE; + } + dprintk("RPC: cong %ld, cwnd was %ld, now %ld\n", + xprt->cong, xprt->cwnd, cwnd); + xprt->cwnd = cwnd; +} + +/* + * Reset the major timeout value + */ +static void xprt_reset_majortimeo(struct rpc_rqst *req) +{ + struct rpc_timeout *to = &req->rq_xprt->timeout; + + req->rq_majortimeo = req->rq_timeout; + if (to->to_exponential) + req->rq_majortimeo <<= to->to_retries; + else + req->rq_majortimeo += to->to_increment * to->to_retries; + if (req->rq_majortimeo > to->to_maxval || req->rq_majortimeo == 0) + req->rq_majortimeo = to->to_maxval; + req->rq_majortimeo += jiffies; +} + +/* + * Adjust timeout values etc for next retransmit + */ +int xprt_adjust_timeout(struct rpc_rqst *req) +{ + struct rpc_xprt *xprt = req->rq_xprt; + struct rpc_timeout *to = &xprt->timeout; + int status = 0; + + if (time_before(jiffies, req->rq_majortimeo)) { + if (to->to_exponential) + req->rq_timeout <<= 1; + else + req->rq_timeout += to->to_increment; + if (to->to_maxval && req->rq_timeout >= to->to_maxval) + req->rq_timeout = to->to_maxval; + req->rq_retries++; + pprintk("RPC: %lu retrans\n", jiffies); + } else { + req->rq_timeout = to->to_initval; + req->rq_retries = 0; + xprt_reset_majortimeo(req); + /* Reset the RTT counters == "slow start" */ + spin_lock_bh(&xprt->sock_lock); + rpc_init_rtt(req->rq_task->tk_client->cl_rtt, to->to_initval); + spin_unlock_bh(&xprt->sock_lock); + pprintk("RPC: %lu timeout\n", jiffies); + status = -ETIMEDOUT; + } + + if (req->rq_timeout == 0) { + printk(KERN_WARNING "xprt_adjust_timeout: rq_timeout = 0!\n"); + req->rq_timeout = 5 * HZ; + } + return status; +} + +/* + * Close down a transport socket + */ +static void +xprt_close(struct rpc_xprt *xprt) +{ + struct socket *sock = xprt->sock; + struct sock *sk = xprt->inet; + + if (!sk) + return; + + write_lock_bh(&sk->sk_callback_lock); + xprt->inet = NULL; + xprt->sock = NULL; + + sk->sk_user_data = NULL; + sk->sk_data_ready = xprt->old_data_ready; + sk->sk_state_change = xprt->old_state_change; + sk->sk_write_space = xprt->old_write_space; + write_unlock_bh(&sk->sk_callback_lock); + + sk->sk_no_check = 0; + + sock_release(sock); +} + +static void +xprt_socket_autoclose(void *args) +{ + struct rpc_xprt *xprt = (struct rpc_xprt *)args; + + xprt_disconnect(xprt); + xprt_close(xprt); + xprt_release_write(xprt, NULL); +} + +/* + * Mark a transport as disconnected + */ +static void +xprt_disconnect(struct rpc_xprt *xprt) +{ + dprintk("RPC: disconnected transport %p\n", xprt); + spin_lock_bh(&xprt->sock_lock); + xprt_clear_connected(xprt); + rpc_wake_up_status(&xprt->pending, -ENOTCONN); + spin_unlock_bh(&xprt->sock_lock); +} + +/* + * Used to allow disconnection when we've been idle + */ +static void +xprt_init_autodisconnect(unsigned long data) +{ + struct rpc_xprt *xprt = (struct rpc_xprt *)data; + + spin_lock(&xprt->sock_lock); + if (!list_empty(&xprt->recv) || xprt->shutdown) + goto out_abort; + if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) + goto out_abort; + spin_unlock(&xprt->sock_lock); + /* Let keventd close the socket */ + if (test_bit(XPRT_CONNECTING, &xprt->sockstate) != 0) + xprt_release_write(xprt, NULL); + else + schedule_work(&xprt->task_cleanup); + return; +out_abort: + spin_unlock(&xprt->sock_lock); +} + +static void xprt_socket_connect(void *args) +{ + struct rpc_xprt *xprt = (struct rpc_xprt *)args; + struct socket *sock = xprt->sock; + int status = -EIO; + + if (xprt->shutdown || xprt->addr.sin_port == 0) + goto out; + + /* + * Start by resetting any existing state + */ + xprt_close(xprt); + sock = xprt_create_socket(xprt, xprt->prot, xprt->resvport); + if (sock == NULL) { + /* couldn't create socket or bind to reserved port; + * this is likely a permanent error, so cause an abort */ + goto out; + } + xprt_bind_socket(xprt, sock); + xprt_sock_setbufsize(xprt); + + status = 0; + if (!xprt->stream) + goto out; + + /* + * Tell the socket layer to start connecting... + */ + status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr, + sizeof(xprt->addr), O_NONBLOCK); + dprintk("RPC: %p connect status %d connected %d sock state %d\n", + xprt, -status, xprt_connected(xprt), sock->sk->sk_state); + if (status < 0) { + switch (status) { + case -EINPROGRESS: + case -EALREADY: + goto out_clear; + } + } +out: + if (status < 0) + rpc_wake_up_status(&xprt->pending, status); + else + rpc_wake_up(&xprt->pending); +out_clear: + smp_mb__before_clear_bit(); + clear_bit(XPRT_CONNECTING, &xprt->sockstate); + smp_mb__after_clear_bit(); +} + +/* + * Attempt to connect a TCP socket. + * + */ +void xprt_connect(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + dprintk("RPC: %4d xprt_connect xprt %p %s connected\n", task->tk_pid, + xprt, (xprt_connected(xprt) ? "is" : "is not")); + + if (xprt->shutdown) { + task->tk_status = -EIO; + return; + } + if (!xprt->addr.sin_port) { + task->tk_status = -EIO; + return; + } + if (!xprt_lock_write(xprt, task)) + return; + if (xprt_connected(xprt)) + goto out_write; + + if (task->tk_rqstp) + task->tk_rqstp->rq_bytes_sent = 0; + + task->tk_timeout = RPC_CONNECT_TIMEOUT; + rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL); + if (!test_and_set_bit(XPRT_CONNECTING, &xprt->sockstate)) { + /* Note: if we are here due to a dropped connection + * we delay reconnecting by RPC_REESTABLISH_TIMEOUT/HZ + * seconds + */ + if (xprt->sock != NULL) + schedule_delayed_work(&xprt->sock_connect, + RPC_REESTABLISH_TIMEOUT); + else + schedule_work(&xprt->sock_connect); + } + return; + out_write: + xprt_release_write(xprt, task); +} + +/* + * We arrive here when awoken from waiting on connection establishment. + */ +static void +xprt_connect_status(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + if (task->tk_status >= 0) { + dprintk("RPC: %4d xprt_connect_status: connection established\n", + task->tk_pid); + return; + } + + /* if soft mounted, just cause this RPC to fail */ + if (RPC_IS_SOFT(task)) + task->tk_status = -EIO; + + switch (task->tk_status) { + case -ECONNREFUSED: + case -ECONNRESET: + case -ENOTCONN: + return; + case -ETIMEDOUT: + dprintk("RPC: %4d xprt_connect_status: timed out\n", + task->tk_pid); + break; + default: + printk(KERN_ERR "RPC: error %d connecting to server %s\n", + -task->tk_status, task->tk_client->cl_server); + } + xprt_release_write(xprt, task); +} + +/* + * Look up the RPC request corresponding to a reply, and then lock it. + */ +static inline struct rpc_rqst * +xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) +{ + struct list_head *pos; + struct rpc_rqst *req = NULL; + + list_for_each(pos, &xprt->recv) { + struct rpc_rqst *entry = list_entry(pos, struct rpc_rqst, rq_list); + if (entry->rq_xid == xid) { + req = entry; + break; + } + } + return req; +} + +/* + * Complete reply received. + * The TCP code relies on us to remove the request from xprt->pending. + */ +static void +xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) +{ + struct rpc_task *task = req->rq_task; + struct rpc_clnt *clnt = task->tk_client; + + /* Adjust congestion window */ + if (!xprt->nocong) { + unsigned timer = task->tk_msg.rpc_proc->p_timer; + xprt_adjust_cwnd(xprt, copied); + __xprt_put_cong(xprt, req); + if (timer) { + if (req->rq_ntrans == 1) + rpc_update_rtt(clnt->cl_rtt, timer, + (long)jiffies - req->rq_xtime); + rpc_set_timeo(clnt->cl_rtt, timer, req->rq_ntrans - 1); + } + } + +#ifdef RPC_PROFILE + /* Profile only reads for now */ + if (copied > 1024) { + static unsigned long nextstat; + static unsigned long pkt_rtt, pkt_len, pkt_cnt; + + pkt_cnt++; + pkt_len += req->rq_slen + copied; + pkt_rtt += jiffies - req->rq_xtime; + if (time_before(nextstat, jiffies)) { + printk("RPC: %lu %ld cwnd\n", jiffies, xprt->cwnd); + printk("RPC: %ld %ld %ld %ld stat\n", + jiffies, pkt_cnt, pkt_len, pkt_rtt); + pkt_rtt = pkt_len = pkt_cnt = 0; + nextstat = jiffies + 5 * HZ; + } + } +#endif + + dprintk("RPC: %4d has input (%d bytes)\n", task->tk_pid, copied); + list_del_init(&req->rq_list); + req->rq_received = req->rq_private_buf.len = copied; + + /* ... and wake up the process. */ + rpc_wake_up_task(task); + return; +} + +static size_t +skb_read_bits(skb_reader_t *desc, void *to, size_t len) +{ + if (len > desc->count) + len = desc->count; + if (skb_copy_bits(desc->skb, desc->offset, to, len)) + return 0; + desc->count -= len; + desc->offset += len; + return len; +} + +static size_t +skb_read_and_csum_bits(skb_reader_t *desc, void *to, size_t len) +{ + unsigned int csum2, pos; + + if (len > desc->count) + len = desc->count; + pos = desc->offset; + csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len, 0); + desc->csum = csum_block_add(desc->csum, csum2, pos); + desc->count -= len; + desc->offset += len; + return len; +} + +/* + * We have set things up such that we perform the checksum of the UDP + * packet in parallel with the copies into the RPC client iovec. -DaveM + */ +int +csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) +{ + skb_reader_t desc; + + desc.skb = skb; + desc.offset = sizeof(struct udphdr); + desc.count = skb->len - desc.offset; + + if (skb->ip_summed == CHECKSUM_UNNECESSARY) + goto no_checksum; + + desc.csum = csum_partial(skb->data, desc.offset, skb->csum); + xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits); + if (desc.offset != skb->len) { + unsigned int csum2; + csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0); + desc.csum = csum_block_add(desc.csum, csum2, desc.offset); + } + if (desc.count) + return -1; + if ((unsigned short)csum_fold(desc.csum)) + return -1; + return 0; +no_checksum: + xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits); + if (desc.count) + return -1; + return 0; +} + +/* + * Input handler for RPC replies. Called from a bottom half and hence + * atomic. + */ +static void +udp_data_ready(struct sock *sk, int len) +{ + struct rpc_task *task; + struct rpc_xprt *xprt; + struct rpc_rqst *rovr; + struct sk_buff *skb; + int err, repsize, copied; + u32 _xid, *xp; + + read_lock(&sk->sk_callback_lock); + dprintk("RPC: udp_data_ready...\n"); + if (!(xprt = xprt_from_sock(sk))) { + printk("RPC: udp_data_ready request not found!\n"); + goto out; + } + + dprintk("RPC: udp_data_ready client %p\n", xprt); + + if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) + goto out; + + if (xprt->shutdown) + goto dropit; + + repsize = skb->len - sizeof(struct udphdr); + if (repsize < 4) { + printk("RPC: impossible RPC reply size %d!\n", repsize); + goto dropit; + } + + /* Copy the XID from the skb... */ + xp = skb_header_pointer(skb, sizeof(struct udphdr), + sizeof(_xid), &_xid); + if (xp == NULL) + goto dropit; + + /* Look up and lock the request corresponding to the given XID */ + spin_lock(&xprt->sock_lock); + rovr = xprt_lookup_rqst(xprt, *xp); + if (!rovr) + goto out_unlock; + task = rovr->rq_task; + + dprintk("RPC: %4d received reply\n", task->tk_pid); + + if ((copied = rovr->rq_private_buf.buflen) > repsize) + copied = repsize; + + /* Suck it into the iovec, verify checksum if not done by hw. */ + if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) + goto out_unlock; + + /* Something worked... */ + dst_confirm(skb->dst); + + xprt_complete_rqst(xprt, rovr, copied); + + out_unlock: + spin_unlock(&xprt->sock_lock); + dropit: + skb_free_datagram(sk, skb); + out: + read_unlock(&sk->sk_callback_lock); +} + +/* + * Copy from an skb into memory and shrink the skb. + */ +static inline size_t +tcp_copy_data(skb_reader_t *desc, void *p, size_t len) +{ + if (len > desc->count) + len = desc->count; + if (skb_copy_bits(desc->skb, desc->offset, p, len)) + return 0; + desc->offset += len; + desc->count -= len; + return len; +} + +/* + * TCP read fragment marker + */ +static inline void +tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + size_t len, used; + char *p; + + p = ((char *) &xprt->tcp_recm) + xprt->tcp_offset; + len = sizeof(xprt->tcp_recm) - xprt->tcp_offset; + used = tcp_copy_data(desc, p, len); + xprt->tcp_offset += used; + if (used != len) + return; + xprt->tcp_reclen = ntohl(xprt->tcp_recm); + if (xprt->tcp_reclen & 0x80000000) + xprt->tcp_flags |= XPRT_LAST_FRAG; + else + xprt->tcp_flags &= ~XPRT_LAST_FRAG; + xprt->tcp_reclen &= 0x7fffffff; + xprt->tcp_flags &= ~XPRT_COPY_RECM; + xprt->tcp_offset = 0; + /* Sanity check of the record length */ + if (xprt->tcp_reclen < 4) { + printk(KERN_ERR "RPC: Invalid TCP record fragment length\n"); + xprt_disconnect(xprt); + } + dprintk("RPC: reading TCP record fragment of length %d\n", + xprt->tcp_reclen); +} + +static void +tcp_check_recm(struct rpc_xprt *xprt) +{ + if (xprt->tcp_offset == xprt->tcp_reclen) { + xprt->tcp_flags |= XPRT_COPY_RECM; + xprt->tcp_offset = 0; + if (xprt->tcp_flags & XPRT_LAST_FRAG) { + xprt->tcp_flags &= ~XPRT_COPY_DATA; + xprt->tcp_flags |= XPRT_COPY_XID; + xprt->tcp_copied = 0; + } + } +} + +/* + * TCP read xid + */ +static inline void +tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + size_t len, used; + char *p; + + len = sizeof(xprt->tcp_xid) - xprt->tcp_offset; + dprintk("RPC: reading XID (%Zu bytes)\n", len); + p = ((char *) &xprt->tcp_xid) + xprt->tcp_offset; + used = tcp_copy_data(desc, p, len); + xprt->tcp_offset += used; + if (used != len) + return; + xprt->tcp_flags &= ~XPRT_COPY_XID; + xprt->tcp_flags |= XPRT_COPY_DATA; + xprt->tcp_copied = 4; + dprintk("RPC: reading reply for XID %08x\n", + ntohl(xprt->tcp_xid)); + tcp_check_recm(xprt); +} + +/* + * TCP read and complete request + */ +static inline void +tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + struct rpc_rqst *req; + struct xdr_buf *rcvbuf; + size_t len; + + /* Find and lock the request corresponding to this xid */ + spin_lock(&xprt->sock_lock); + req = xprt_lookup_rqst(xprt, xprt->tcp_xid); + if (!req) { + xprt->tcp_flags &= ~XPRT_COPY_DATA; + dprintk("RPC: XID %08x request not found!\n", + ntohl(xprt->tcp_xid)); + spin_unlock(&xprt->sock_lock); + return; + } + + rcvbuf = &req->rq_private_buf; + len = desc->count; + if (len > xprt->tcp_reclen - xprt->tcp_offset) { + skb_reader_t my_desc; + + len = xprt->tcp_reclen - xprt->tcp_offset; + memcpy(&my_desc, desc, sizeof(my_desc)); + my_desc.count = len; + xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, + &my_desc, tcp_copy_data); + desc->count -= len; + desc->offset += len; + } else + xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, + desc, tcp_copy_data); + xprt->tcp_copied += len; + xprt->tcp_offset += len; + + if (xprt->tcp_copied == req->rq_private_buf.buflen) + xprt->tcp_flags &= ~XPRT_COPY_DATA; + else if (xprt->tcp_offset == xprt->tcp_reclen) { + if (xprt->tcp_flags & XPRT_LAST_FRAG) + xprt->tcp_flags &= ~XPRT_COPY_DATA; + } + + if (!(xprt->tcp_flags & XPRT_COPY_DATA)) { + dprintk("RPC: %4d received reply complete\n", + req->rq_task->tk_pid); + xprt_complete_rqst(xprt, req, xprt->tcp_copied); + } + spin_unlock(&xprt->sock_lock); + tcp_check_recm(xprt); +} + +/* + * TCP discard extra bytes from a short read + */ +static inline void +tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + size_t len; + + len = xprt->tcp_reclen - xprt->tcp_offset; + if (len > desc->count) + len = desc->count; + desc->count -= len; + desc->offset += len; + xprt->tcp_offset += len; + tcp_check_recm(xprt); +} + +/* + * TCP record receive routine + * We first have to grab the record marker, then the XID, then the data. + */ +static int +tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct rpc_xprt *xprt = rd_desc->arg.data; + skb_reader_t desc = { + .skb = skb, + .offset = offset, + .count = len, + .csum = 0 + }; + + dprintk("RPC: tcp_data_recv\n"); + do { + /* Read in a new fragment marker if necessary */ + /* Can we ever really expect to get completely empty fragments? */ + if (xprt->tcp_flags & XPRT_COPY_RECM) { + tcp_read_fraghdr(xprt, &desc); + continue; + } + /* Read in the xid if necessary */ + if (xprt->tcp_flags & XPRT_COPY_XID) { + tcp_read_xid(xprt, &desc); + continue; + } + /* Read in the request data */ + if (xprt->tcp_flags & XPRT_COPY_DATA) { + tcp_read_request(xprt, &desc); + continue; + } + /* Skip over any trailing bytes on short reads */ + tcp_read_discard(xprt, &desc); + } while (desc.count); + dprintk("RPC: tcp_data_recv done\n"); + return len - desc.count; +} + +static void tcp_data_ready(struct sock *sk, int bytes) +{ + struct rpc_xprt *xprt; + read_descriptor_t rd_desc; + + read_lock(&sk->sk_callback_lock); + dprintk("RPC: tcp_data_ready...\n"); + if (!(xprt = xprt_from_sock(sk))) { + printk("RPC: tcp_data_ready socket info not found!\n"); + goto out; + } + if (xprt->shutdown) + goto out; + + /* We use rd_desc to pass struct xprt to tcp_data_recv */ + rd_desc.arg.data = xprt; + rd_desc.count = 65536; + tcp_read_sock(sk, &rd_desc, tcp_data_recv); +out: + read_unlock(&sk->sk_callback_lock); +} + +static void +tcp_state_change(struct sock *sk) +{ + struct rpc_xprt *xprt; + + read_lock(&sk->sk_callback_lock); + if (!(xprt = xprt_from_sock(sk))) + goto out; + dprintk("RPC: tcp_state_change client %p...\n", xprt); + dprintk("RPC: state %x conn %d dead %d zapped %d\n", + sk->sk_state, xprt_connected(xprt), + sock_flag(sk, SOCK_DEAD), + sock_flag(sk, SOCK_ZAPPED)); + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + spin_lock_bh(&xprt->sock_lock); + if (!xprt_test_and_set_connected(xprt)) { + /* Reset TCP record info */ + xprt->tcp_offset = 0; + xprt->tcp_reclen = 0; + xprt->tcp_copied = 0; + xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; + rpc_wake_up(&xprt->pending); + } + spin_unlock_bh(&xprt->sock_lock); + break; + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + default: + if (xprt_test_and_clear_connected(xprt)) + rpc_wake_up_status(&xprt->pending, -ENOTCONN); + break; + } + out: + read_unlock(&sk->sk_callback_lock); +} + +/* + * Called when more output buffer space is available for this socket. + * We try not to wake our writers until they can make "significant" + * progress, otherwise we'll waste resources thrashing sock_sendmsg + * with a bunch of small requests. + */ +static void +xprt_write_space(struct sock *sk) +{ + struct rpc_xprt *xprt; + struct socket *sock; + + read_lock(&sk->sk_callback_lock); + if (!(xprt = xprt_from_sock(sk)) || !(sock = sk->sk_socket)) + goto out; + if (xprt->shutdown) + goto out; + + /* Wait until we have enough socket memory */ + if (xprt->stream) { + /* from net/core/stream.c:sk_stream_write_space */ + if (sk_stream_wspace(sk) < sk_stream_min_wspace(sk)) + goto out; + } else { + /* from net/core/sock.c:sock_def_write_space */ + if (!sock_writeable(sk)) + goto out; + } + + if (!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)) + goto out; + + spin_lock_bh(&xprt->sock_lock); + if (xprt->snd_task) + rpc_wake_up_task(xprt->snd_task); + spin_unlock_bh(&xprt->sock_lock); +out: + read_unlock(&sk->sk_callback_lock); +} + +/* + * RPC receive timeout handler. + */ +static void +xprt_timer(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + spin_lock(&xprt->sock_lock); + if (req->rq_received) + goto out; + + xprt_adjust_cwnd(req->rq_xprt, -ETIMEDOUT); + __xprt_put_cong(xprt, req); + + dprintk("RPC: %4d xprt_timer (%s request)\n", + task->tk_pid, req ? "pending" : "backlogged"); + + task->tk_status = -ETIMEDOUT; +out: + task->tk_timeout = 0; + rpc_wake_up_task(task); + spin_unlock(&xprt->sock_lock); +} + +/* + * Place the actual RPC call. + * We have to copy the iovec because sendmsg fiddles with its contents. + */ +int +xprt_prepare_transmit(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + int err = 0; + + dprintk("RPC: %4d xprt_prepare_transmit\n", task->tk_pid); + + if (xprt->shutdown) + return -EIO; + + spin_lock_bh(&xprt->sock_lock); + if (req->rq_received && !req->rq_bytes_sent) { + err = req->rq_received; + goto out_unlock; + } + if (!__xprt_lock_write(xprt, task)) { + err = -EAGAIN; + goto out_unlock; + } + + if (!xprt_connected(xprt)) { + err = -ENOTCONN; + goto out_unlock; + } +out_unlock: + spin_unlock_bh(&xprt->sock_lock); + return err; +} + +void +xprt_transmit(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + int status, retry = 0; + + + dprintk("RPC: %4d xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); + + /* set up everything as needed. */ + /* Write the record marker */ + if (xprt->stream) { + u32 *marker = req->rq_svec[0].iov_base; + + *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); + } + + smp_rmb(); + if (!req->rq_received) { + if (list_empty(&req->rq_list)) { + spin_lock_bh(&xprt->sock_lock); + /* Update the softirq receive buffer */ + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, + sizeof(req->rq_private_buf)); + /* Add request to the receive list */ + list_add_tail(&req->rq_list, &xprt->recv); + spin_unlock_bh(&xprt->sock_lock); + xprt_reset_majortimeo(req); + } + } else if (!req->rq_bytes_sent) + return; + + /* Continue transmitting the packet/record. We must be careful + * to cope with writespace callbacks arriving _after_ we have + * called xprt_sendmsg(). + */ + while (1) { + req->rq_xtime = jiffies; + status = xprt_sendmsg(xprt, req); + + if (status < 0) + break; + + if (xprt->stream) { + req->rq_bytes_sent += status; + + /* If we've sent the entire packet, immediately + * reset the count of bytes sent. */ + if (req->rq_bytes_sent >= req->rq_slen) { + req->rq_bytes_sent = 0; + goto out_receive; + } + } else { + if (status >= req->rq_slen) + goto out_receive; + status = -EAGAIN; + break; + } + + dprintk("RPC: %4d xmit incomplete (%d left of %d)\n", + task->tk_pid, req->rq_slen - req->rq_bytes_sent, + req->rq_slen); + + status = -EAGAIN; + if (retry++ > 50) + break; + } + + /* Note: at this point, task->tk_sleeping has not yet been set, + * hence there is no danger of the waking up task being put on + * schedq, and being picked up by a parallel run of rpciod(). + */ + task->tk_status = status; + + switch (status) { + case -EAGAIN: + if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { + /* Protect against races with xprt_write_space */ + spin_lock_bh(&xprt->sock_lock); + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) { + task->tk_timeout = req->rq_timeout; + rpc_sleep_on(&xprt->pending, task, NULL, NULL); + } + spin_unlock_bh(&xprt->sock_lock); + return; + } + /* Keep holding the socket if it is blocked */ + rpc_delay(task, HZ>>4); + return; + case -ECONNREFUSED: + task->tk_timeout = RPC_REESTABLISH_TIMEOUT; + rpc_sleep_on(&xprt->sending, task, NULL, NULL); + case -ENOTCONN: + return; + default: + if (xprt->stream) + xprt_disconnect(xprt); + } + xprt_release_write(xprt, task); + return; + out_receive: + dprintk("RPC: %4d xmit complete\n", task->tk_pid); + /* Set the task's receive timeout value */ + spin_lock_bh(&xprt->sock_lock); + if (!xprt->nocong) { + int timer = task->tk_msg.rpc_proc->p_timer; + task->tk_timeout = rpc_calc_rto(clnt->cl_rtt, timer); + task->tk_timeout <<= rpc_ntimeo(clnt->cl_rtt, timer) + req->rq_retries; + if (task->tk_timeout > xprt->timeout.to_maxval || task->tk_timeout == 0) + task->tk_timeout = xprt->timeout.to_maxval; + } else + task->tk_timeout = req->rq_timeout; + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (!req->rq_received) + rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); + __xprt_release_write(xprt, task); + spin_unlock_bh(&xprt->sock_lock); +} + +/* + * Reserve an RPC call slot. + */ +static inline void +do_xprt_reserve(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + task->tk_status = 0; + if (task->tk_rqstp) + return; + if (!list_empty(&xprt->free)) { + struct rpc_rqst *req = list_entry(xprt->free.next, struct rpc_rqst, rq_list); + list_del_init(&req->rq_list); + task->tk_rqstp = req; + xprt_request_init(task, xprt); + return; + } + dprintk("RPC: waiting for request slot\n"); + task->tk_status = -EAGAIN; + task->tk_timeout = 0; + rpc_sleep_on(&xprt->backlog, task, NULL, NULL); +} + +void +xprt_reserve(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + task->tk_status = -EIO; + if (!xprt->shutdown) { + spin_lock(&xprt->xprt_lock); + do_xprt_reserve(task); + spin_unlock(&xprt->xprt_lock); + if (task->tk_rqstp) + del_timer_sync(&xprt->timer); + } +} + +/* + * Allocate a 'unique' XID + */ +static inline u32 xprt_alloc_xid(struct rpc_xprt *xprt) +{ + return xprt->xid++; +} + +static inline void xprt_init_xid(struct rpc_xprt *xprt) +{ + get_random_bytes(&xprt->xid, sizeof(xprt->xid)); +} + +/* + * Initialize RPC request + */ +static void +xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) +{ + struct rpc_rqst *req = task->tk_rqstp; + + req->rq_timeout = xprt->timeout.to_initval; + req->rq_task = task; + req->rq_xprt = xprt; + req->rq_xid = xprt_alloc_xid(xprt); + dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid, + req, ntohl(req->rq_xid)); +} + +/* + * Release an RPC call slot + */ +void +xprt_release(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_rqst *req; + + if (!(req = task->tk_rqstp)) + return; + spin_lock_bh(&xprt->sock_lock); + __xprt_release_write(xprt, task); + __xprt_put_cong(xprt, req); + if (!list_empty(&req->rq_list)) + list_del(&req->rq_list); + xprt->last_used = jiffies; + if (list_empty(&xprt->recv) && !xprt->shutdown) + mod_timer(&xprt->timer, xprt->last_used + XPRT_IDLE_TIMEOUT); + spin_unlock_bh(&xprt->sock_lock); + task->tk_rqstp = NULL; + memset(req, 0, sizeof(*req)); /* mark unused */ + + dprintk("RPC: %4d release request %p\n", task->tk_pid, req); + + spin_lock(&xprt->xprt_lock); + list_add(&req->rq_list, &xprt->free); + xprt_clear_backlog(xprt); + spin_unlock(&xprt->xprt_lock); +} + +/* + * Set default timeout parameters + */ +static void +xprt_default_timeout(struct rpc_timeout *to, int proto) +{ + if (proto == IPPROTO_UDP) + xprt_set_timeout(to, 5, 5 * HZ); + else + xprt_set_timeout(to, 5, 60 * HZ); +} + +/* + * Set constant timeout + */ +void +xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr) +{ + to->to_initval = + to->to_increment = incr; + to->to_maxval = incr * retr; + to->to_retries = retr; + to->to_exponential = 0; +} + +unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE; +unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE; + +/* + * Initialize an RPC client + */ +static struct rpc_xprt * +xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to) +{ + struct rpc_xprt *xprt; + unsigned int entries; + size_t slot_table_size; + struct rpc_rqst *req; + + dprintk("RPC: setting up %s transport...\n", + proto == IPPROTO_UDP? "UDP" : "TCP"); + + entries = (proto == IPPROTO_TCP)? + xprt_tcp_slot_table_entries : xprt_udp_slot_table_entries; + + if ((xprt = kmalloc(sizeof(struct rpc_xprt), GFP_KERNEL)) == NULL) + return ERR_PTR(-ENOMEM); + memset(xprt, 0, sizeof(*xprt)); /* Nnnngh! */ + xprt->max_reqs = entries; + slot_table_size = entries * sizeof(xprt->slot[0]); + xprt->slot = kmalloc(slot_table_size, GFP_KERNEL); + if (xprt->slot == NULL) { + kfree(xprt); + return ERR_PTR(-ENOMEM); + } + memset(xprt->slot, 0, slot_table_size); + + xprt->addr = *ap; + xprt->prot = proto; + xprt->stream = (proto == IPPROTO_TCP)? 1 : 0; + if (xprt->stream) { + xprt->cwnd = RPC_MAXCWND(xprt); + xprt->nocong = 1; + xprt->max_payload = (1U << 31) - 1; + } else { + xprt->cwnd = RPC_INITCWND; + xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); + } + spin_lock_init(&xprt->sock_lock); + spin_lock_init(&xprt->xprt_lock); + init_waitqueue_head(&xprt->cong_wait); + + INIT_LIST_HEAD(&xprt->free); + INIT_LIST_HEAD(&xprt->recv); + INIT_WORK(&xprt->sock_connect, xprt_socket_connect, xprt); + INIT_WORK(&xprt->task_cleanup, xprt_socket_autoclose, xprt); + init_timer(&xprt->timer); + xprt->timer.function = xprt_init_autodisconnect; + xprt->timer.data = (unsigned long) xprt; + xprt->last_used = jiffies; + xprt->port = XPRT_MAX_RESVPORT; + + /* Set timeout parameters */ + if (to) { + xprt->timeout = *to; + } else + xprt_default_timeout(&xprt->timeout, xprt->prot); + + rpc_init_wait_queue(&xprt->pending, "xprt_pending"); + rpc_init_wait_queue(&xprt->sending, "xprt_sending"); + rpc_init_wait_queue(&xprt->resend, "xprt_resend"); + rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog"); + + /* initialize free list */ + for (req = &xprt->slot[entries-1]; req >= &xprt->slot[0]; req--) + list_add(&req->rq_list, &xprt->free); + + xprt_init_xid(xprt); + + /* Check whether we want to use a reserved port */ + xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; + + dprintk("RPC: created transport %p with %u slots\n", xprt, + xprt->max_reqs); + + return xprt; +} + +/* + * Bind to a reserved port + */ +static inline int xprt_bindresvport(struct rpc_xprt *xprt, struct socket *sock) +{ + struct sockaddr_in myaddr = { + .sin_family = AF_INET, + }; + int err, port; + + /* Were we already bound to a given port? Try to reuse it */ + port = xprt->port; + do { + myaddr.sin_port = htons(port); + err = sock->ops->bind(sock, (struct sockaddr *) &myaddr, + sizeof(myaddr)); + if (err == 0) { + xprt->port = port; + return 0; + } + if (--port == 0) + port = XPRT_MAX_RESVPORT; + } while (err == -EADDRINUSE && port != xprt->port); + + printk("RPC: Can't bind to reserved port (%d).\n", -err); + return err; +} + +static void +xprt_bind_socket(struct rpc_xprt *xprt, struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (xprt->inet) + return; + + write_lock_bh(&sk->sk_callback_lock); + sk->sk_user_data = xprt; + xprt->old_data_ready = sk->sk_data_ready; + xprt->old_state_change = sk->sk_state_change; + xprt->old_write_space = sk->sk_write_space; + if (xprt->prot == IPPROTO_UDP) { + sk->sk_data_ready = udp_data_ready; + sk->sk_no_check = UDP_CSUM_NORCV; + xprt_set_connected(xprt); + } else { + tcp_sk(sk)->nonagle = 1; /* disable Nagle's algorithm */ + sk->sk_data_ready = tcp_data_ready; + sk->sk_state_change = tcp_state_change; + xprt_clear_connected(xprt); + } + sk->sk_write_space = xprt_write_space; + + /* Reset to new socket */ + xprt->sock = sock; + xprt->inet = sk; + write_unlock_bh(&sk->sk_callback_lock); + + return; +} + +/* + * Set socket buffer length + */ +void +xprt_sock_setbufsize(struct rpc_xprt *xprt) +{ + struct sock *sk = xprt->inet; + + if (xprt->stream) + return; + if (xprt->rcvsize) { + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + sk->sk_rcvbuf = xprt->rcvsize * xprt->max_reqs * 2; + } + if (xprt->sndsize) { + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + sk->sk_sndbuf = xprt->sndsize * xprt->max_reqs * 2; + sk->sk_write_space(sk); + } +} + +/* + * Datastream sockets are created here, but xprt_connect will create + * and connect stream sockets. + */ +static struct socket * xprt_create_socket(struct rpc_xprt *xprt, int proto, int resvport) +{ + struct socket *sock; + int type, err; + + dprintk("RPC: xprt_create_socket(%s %d)\n", + (proto == IPPROTO_UDP)? "udp" : "tcp", proto); + + type = (proto == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; + + if ((err = sock_create_kern(PF_INET, type, proto, &sock)) < 0) { + printk("RPC: can't create socket (%d).\n", -err); + return NULL; + } + + /* If the caller has the capability, bind to a reserved port */ + if (resvport && xprt_bindresvport(xprt, sock) < 0) { + printk("RPC: can't bind to reserved port.\n"); + goto failed; + } + + return sock; + +failed: + sock_release(sock); + return NULL; +} + +/* + * Create an RPC client transport given the protocol and peer address. + */ +struct rpc_xprt * +xprt_create_proto(int proto, struct sockaddr_in *sap, struct rpc_timeout *to) +{ + struct rpc_xprt *xprt; + + xprt = xprt_setup(proto, sap, to); + if (IS_ERR(xprt)) + dprintk("RPC: xprt_create_proto failed\n"); + else + dprintk("RPC: xprt_create_proto created xprt %p\n", xprt); + return xprt; +} + +/* + * Prepare for transport shutdown. + */ +static void +xprt_shutdown(struct rpc_xprt *xprt) +{ + xprt->shutdown = 1; + rpc_wake_up(&xprt->sending); + rpc_wake_up(&xprt->resend); + rpc_wake_up(&xprt->pending); + rpc_wake_up(&xprt->backlog); + wake_up(&xprt->cong_wait); + del_timer_sync(&xprt->timer); +} + +/* + * Clear the xprt backlog queue + */ +static int +xprt_clear_backlog(struct rpc_xprt *xprt) { + rpc_wake_up_next(&xprt->backlog); + wake_up(&xprt->cong_wait); + return 1; +} + +/* + * Destroy an RPC transport, killing off all requests. + */ +int +xprt_destroy(struct rpc_xprt *xprt) +{ + dprintk("RPC: destroying transport %p\n", xprt); + xprt_shutdown(xprt); + xprt_disconnect(xprt); + xprt_close(xprt); + kfree(xprt->slot); + kfree(xprt); + + return 0; +} |