From 72f465033702ebfe20db8f50edaad59f0f38b0f5 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 23 Aug 2010 13:35:09 +0200 Subject: bio-integrity.c: remove dependency on __GFP_NOFAIL The kmalloc() in bio_integrity_prep() is failable, so remove __GFP_NOFAIL from its mask. Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 612a5c38d3c..a8f4cc67998 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -413,7 +413,7 @@ int bio_integrity_prep(struct bio *bio) /* Allocate kernel buffer for protection data */ len = sectors * blk_integrity_tuple_size(bi); - buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp); + buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); return -EIO; -- cgit v1.2.3 From 220eb7fd984bfc7e6b4005fdf32efe9cd8af7cf2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 23 Aug 2010 13:36:20 +0200 Subject: fs/bio-integrity.c: return -ENOMEM on kmalloc failure Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index a8f4cc67998..4d0ff5ee27b 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -416,7 +416,7 @@ int bio_integrity_prep(struct bio *bio) buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); - return -EIO; + return -ENOMEM; } end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; -- cgit v1.2.3 From b76b4014f9d988d2412b873e4d4c13c7f9afc4e4 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 28 Aug 2010 08:52:10 +0200 Subject: writeback: Fix lost wake-up shutting down writeback thread Setting the task state here may cause us to miss the wake up from kthread_stop(), so we need to recheck kthread_should_stop() or risk sleeping forever in the following schedule(). Symptom was an indefinite hang on an NFSv4 mount. (NFSv4 may create multiple mounts in a temporary namespace while traversing the mount path, and since the temporary namespace is immediately destroyed, it may end up destroying a mount very soon after it was created, possibly making this race more likely.) INFO: task mount.nfs4:4314 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. mount.nfs4 D 0000000000000000 2880 4314 4313 0x00000000 ffff88001ed6da28 0000000000000046 ffff88001ed6dfd8 ffff88001ed6dfd8 ffff88001ed6c000 ffff88001ed6c000 ffff88001ed6c000 ffff88001e5003a0 ffff88001ed6dfd8 ffff88001e5003a8 ffff88001ed6c000 ffff88001ed6dfd8 Call Trace: [] schedule_timeout+0x1cd/0x2e0 [] ? mark_held_locks+0x6c/0xa0 [] ? _raw_spin_unlock_irq+0x30/0x60 [] ? trace_hardirqs_on_caller+0x14d/0x190 [] ? sub_preempt_count+0xe/0xd0 [] wait_for_common+0x120/0x190 [] ? default_wake_function+0x0/0x20 [] wait_for_completion+0x1d/0x20 [] kthread_stop+0x4a/0x150 [] ? thaw_process+0x70/0x80 [] bdi_unregister+0x10a/0x1a0 [] nfs_put_super+0x19/0x20 [] generic_shutdown_super+0x54/0xe0 [] kill_anon_super+0x16/0x60 [] nfs4_kill_super+0x39/0x90 [] deactivate_locked_super+0x45/0x60 [] deactivate_super+0x49/0x70 [] mntput_no_expire+0x84/0xe0 [] release_mounts+0x9f/0xc0 [] put_mnt_ns+0x65/0x80 [] nfs_follow_remote_path+0x1e6/0x420 [] nfs4_try_mount+0x6f/0xd0 [] nfs4_get_sb+0xa2/0x360 [] vfs_kern_mount+0x88/0x1f0 [] do_kern_mount+0x52/0x130 [] ? _lock_kernel+0x6a/0x170 [] do_mount+0x26e/0x7f0 [] ? copy_mount_options+0xea/0x190 [] sys_mount+0x98/0xf0 [] system_call_fastpath+0x16/0x1b 1 lock held by mount.nfs4/4314: #0: (&type->s_umount_key#24){+.+...}, at: [] deactivate_super+0x41/0x70 Signed-off-by: J. Bruce Fields Signed-off-by: Jens Axboe Acked-by: Artem Bityutskiy --- fs/fs-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7d9d06ba184..81e086d8aa5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -808,7 +808,7 @@ int bdi_writeback_thread(void *data) wb->last_active = jiffies; set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&bdi->work_list)) { + if (!list_empty(&bdi->work_list) || kthread_should_stop()) { __set_current_state(TASK_RUNNING); continue; } -- cgit v1.2.3 From 7100ae97266e387d25d0c8a5d9934931f0b07dbc Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 8 Sep 2010 20:54:49 +0000 Subject: Revert "[CIFS] Eliminate unused variable warning" The change to kernel crypto and fixes to ntlvm2 and ntlmssp series, introduced a regression. Deferring this patch series to 2.6.37 after Shirish fixes it. This reverts commit c89e5198b26a869ce2842bad8519264f3394dee9. Signed-off-by: Steve French Acked-by: Jeff Layton CC: Shirish Pargaonkar --- fs/cifs/sess.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 795095f4eac..4788e16a02c 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -620,6 +620,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, struct key *spnego_key = NULL; __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ bool first_time; + char *ntlmsspblob; if (ses == NULL) return -EINVAL; @@ -867,8 +868,6 @@ ssetup_ntlmssp_authenticate: iov[1].iov_base = &pSMB->req.SecurityBlob[0]; } else if (phase == NtLmAuthenticate) { int blob_len; - char *ntlmsspblob; - ntlmsspblob = kmalloc(5 * sizeof(struct _AUTHENTICATE_MESSAGE), GFP_KERNEL); -- cgit v1.2.3 From 56234e2767496c125a858f880f1b3a62e04a3406 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 8 Sep 2010 20:57:05 +0000 Subject: Revert "Eliminate sparse warning - bad constant expression" This reverts commit 2d20ca835867d93ead6ce61780d883a4b128106d. The change to kernel crypto and fixes to ntlvm2 and ntlmssp series, introduced a regression. Deferring this patch series to 2.6.37 after Shirish fixes it. Signed-off-by: Steve French Acked-by: Jeff Layton CC: Shirish Pargaonkar --- fs/cifs/cifsencrypt.c | 193 +++++++++++++++++++------------------------------- fs/cifs/cifsglob.h | 7 -- 2 files changed, 72 insertions(+), 128 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 709f2296bdb..eef78c24e0c 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -45,38 +45,39 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8, static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, char *signature) { - int rc; + int rc = 0; + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(server->ntlmssp.md5)]; + } sdesc; if (cifs_pdu == NULL || server == NULL || signature == NULL) return -EINVAL; - if (!server->ntlmssp.sdescmd5) { - cERROR(1, - "cifs_calculate_signature: can't generate signature\n"); - return -1; - } + sdesc.shash.tfm = server->ntlmssp.md5; + sdesc.shash.flags = 0x0; - rc = crypto_shash_init(&server->ntlmssp.sdescmd5->shash); + rc = crypto_shash_init(&sdesc.shash); if (rc) { - cERROR(1, "cifs_calculate_signature: oould not init md5\n"); + cERROR(1, "could not initialize master crypto API hmacmd5\n"); return rc; } if (server->secType == RawNTLMSSP) - crypto_shash_update(&server->ntlmssp.sdescmd5->shash, + crypto_shash_update(&sdesc.shash, server->session_key.data.ntlmv2.key, CIFS_NTLMV2_SESSKEY_SIZE); else - crypto_shash_update(&server->ntlmssp.sdescmd5->shash, + crypto_shash_update(&sdesc.shash, (char *)&server->session_key.data, server->session_key.len); - crypto_shash_update(&server->ntlmssp.sdescmd5->shash, + crypto_shash_update(&sdesc.shash, cifs_pdu->Protocol, cifs_pdu->smb_buf_length); - rc = crypto_shash_final(&server->ntlmssp.sdescmd5->shash, signature); + rc = crypto_shash_final(&sdesc.shash, signature); - return rc; + return 0; } @@ -114,28 +115,30 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, struct TCP_Server_Info *server, char *signature) { int i; - int rc; + int rc = 0; + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(server->ntlmssp.md5)]; + } sdesc; if (iov == NULL || server == NULL || signature == NULL) return -EINVAL; - if (!server->ntlmssp.sdescmd5) { - cERROR(1, "cifs_calc_signature2: can't generate signature\n"); - return -1; - } + sdesc.shash.tfm = server->ntlmssp.md5; + sdesc.shash.flags = 0x0; - rc = crypto_shash_init(&server->ntlmssp.sdescmd5->shash); + rc = crypto_shash_init(&sdesc.shash); if (rc) { - cERROR(1, "cifs_calc_signature2: oould not init md5\n"); + cERROR(1, "could not initialize master crypto API hmacmd5\n"); return rc; } if (server->secType == RawNTLMSSP) - crypto_shash_update(&server->ntlmssp.sdescmd5->shash, + crypto_shash_update(&sdesc.shash, server->session_key.data.ntlmv2.key, CIFS_NTLMV2_SESSKEY_SIZE); else - crypto_shash_update(&server->ntlmssp.sdescmd5->shash, + crypto_shash_update(&sdesc.shash, (char *)&server->session_key.data, server->session_key.len); @@ -143,7 +146,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, if (iov[i].iov_len == 0) continue; if (iov[i].iov_base == NULL) { - cERROR(1, "cifs_calc_signature2: null iovec entry"); + cERROR(1, "null iovec entry"); return -EIO; } /* The first entry includes a length field (which does not get @@ -151,16 +154,16 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, if (i == 0) { if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ break; /* nothing to sign or corrupt header */ - crypto_shash_update(&server->ntlmssp.sdescmd5->shash, + crypto_shash_update(&sdesc.shash, iov[i].iov_base + 4, iov[i].iov_len - 4); } else - crypto_shash_update(&server->ntlmssp.sdescmd5->shash, + crypto_shash_update(&sdesc.shash, iov[i].iov_base, iov[i].iov_len); } - rc = crypto_shash_final(&server->ntlmssp.sdescmd5->shash, signature); + rc = crypto_shash_final(&sdesc.shash, signature); - return rc; + return 0; } int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, @@ -310,48 +313,43 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses, wchar_t *user; wchar_t *domain; wchar_t *server; - - if (!ses->server->ntlmssp.sdeschmacmd5) { - cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n"); - return -1; - } + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(ses->server->ntlmssp.hmacmd5)]; + } sdesc; /* calculate md4 hash of password */ E_md4hash(ses->password, nt_hash); + sdesc.shash.tfm = ses->server->ntlmssp.hmacmd5; + sdesc.shash.flags = 0x0; + crypto_shash_setkey(ses->server->ntlmssp.hmacmd5, nt_hash, CIFS_NTHASH_SIZE); - rc = crypto_shash_init(&ses->server->ntlmssp.sdeschmacmd5->shash); + rc = crypto_shash_init(&sdesc.shash); if (rc) { - cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n"); + cERROR(1, "could not initialize master crypto API hmacmd5\n"); return rc; } /* convert ses->userName to unicode and uppercase */ len = strlen(ses->userName); user = kmalloc(2 + (len * 2), GFP_KERNEL); - if (user == NULL) { - cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n"); - rc = -ENOMEM; + if (user == NULL) goto calc_exit_2; - } len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp); UniStrupr(user); - crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash, - (char *)user, 2 * len); + crypto_shash_update(&sdesc.shash, (char *)user, 2 * len); /* convert ses->domainName to unicode and uppercase */ if (ses->domainName) { len = strlen(ses->domainName); domain = kmalloc(2 + (len * 2), GFP_KERNEL); - if (domain == NULL) { - cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure"); - rc = -ENOMEM; + if (domain == NULL) goto calc_exit_1; - } len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len, nls_cp); /* the following line was removed since it didn't work well @@ -359,19 +357,15 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses, Maybe converting the domain name earlier makes sense */ /* UniStrupr(domain); */ - crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash, - (char *)domain, 2 * len); + crypto_shash_update(&sdesc.shash, (char *)domain, 2 * len); kfree(domain); } else if (ses->serverName) { len = strlen(ses->serverName); server = kmalloc(2 + (len * 2), GFP_KERNEL); - if (server == NULL) { - cERROR(1, "calc_ntlmv2_hash: server mem alloc failure"); - rc = -ENOMEM; + if (server == NULL) goto calc_exit_1; - } len = cifs_strtoUCS((__le16 *)server, ses->serverName, len, nls_cp); /* the following line was removed since it didn't work well @@ -379,20 +373,16 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses, Maybe converting the domain name earlier makes sense */ /* UniStrupr(domain); */ - crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash, - (char *)server, 2 * len); + crypto_shash_update(&sdesc.shash, (char *)server, 2 * len); kfree(server); } - - rc = crypto_shash_final(&ses->server->ntlmssp.sdeschmacmd5->shash, - ses->server->ntlmv2_hash); - calc_exit_1: kfree(user); calc_exit_2: /* BB FIXME what about bytes 24 through 40 of the signing key? compare with the NTLM example */ + rc = crypto_shash_final(&sdesc.shash, ses->server->ntlmv2_hash); return rc; } @@ -452,33 +442,34 @@ CalcNTLMv2_response(const struct TCP_Server_Info *server, char *v2_session_response) { int rc; + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(server->ntlmssp.hmacmd5)]; + } sdesc; - if (!server->ntlmssp.sdeschmacmd5) { - cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n"); - return -1; - } + sdesc.shash.tfm = server->ntlmssp.hmacmd5; + sdesc.shash.flags = 0x0; crypto_shash_setkey(server->ntlmssp.hmacmd5, server->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); - rc = crypto_shash_init(&server->ntlmssp.sdeschmacmd5->shash); + rc = crypto_shash_init(&sdesc.shash); if (rc) { - cERROR(1, "CalcNTLMv2_response: could not init hmacmd5"); + cERROR(1, "could not initialize master crypto API hmacmd5\n"); return rc; } memcpy(v2_session_response + CIFS_SERVER_CHALLENGE_SIZE, server->cryptKey, CIFS_SERVER_CHALLENGE_SIZE); - crypto_shash_update(&server->ntlmssp.sdeschmacmd5->shash, + crypto_shash_update(&sdesc.shash, v2_session_response + CIFS_SERVER_CHALLENGE_SIZE, sizeof(struct ntlmv2_resp) - CIFS_SERVER_CHALLENGE_SIZE); if (server->tilen) - crypto_shash_update(&server->ntlmssp.sdeschmacmd5->shash, + crypto_shash_update(&sdesc.shash, server->tiblob, server->tilen); - rc = crypto_shash_final(&server->ntlmssp.sdeschmacmd5->shash, - v2_session_response); + rc = crypto_shash_final(&sdesc.shash, v2_session_response); return rc; } @@ -489,6 +480,10 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf, { int rc = 0; struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf; + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(ses->server->ntlmssp.hmacmd5)]; + } sdesc; buf->blob_signature = cpu_to_le32(0x00000101); buf->reserved = 0; @@ -516,24 +511,21 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf, return rc; } - if (!ses->server->ntlmssp.sdeschmacmd5) { - cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n"); - return -1; - } - crypto_shash_setkey(ses->server->ntlmssp.hmacmd5, ses->server->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); - rc = crypto_shash_init(&ses->server->ntlmssp.sdeschmacmd5->shash); + sdesc.shash.tfm = ses->server->ntlmssp.hmacmd5; + sdesc.shash.flags = 0x0; + + rc = crypto_shash_init(&sdesc.shash); if (rc) { - cERROR(1, "setup_ntlmv2_rsp: could not init hmacmd5\n"); + cERROR(1, "could not initialize master crypto API hmacmd5\n"); return rc; } - crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash, - resp_buf, CIFS_HMAC_MD5_HASH_SIZE); + crypto_shash_update(&sdesc.shash, resp_buf, CIFS_HMAC_MD5_HASH_SIZE); - rc = crypto_shash_final(&ses->server->ntlmssp.sdeschmacmd5->shash, + rc = crypto_shash_final(&sdesc.shash, ses->server->session_key.data.ntlmv2.key); memcpy(&ses->server->session_key.data.ntlmv2.resp, resp_buf, @@ -586,65 +578,24 @@ cifs_crypto_shash_release(struct TCP_Server_Info *server) if (server->ntlmssp.hmacmd5) crypto_free_shash(server->ntlmssp.hmacmd5); - - kfree(server->ntlmssp.sdeschmacmd5); - - kfree(server->ntlmssp.sdescmd5); } int cifs_crypto_shash_allocate(struct TCP_Server_Info *server) { - int rc; - unsigned int size; - server->ntlmssp.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); if (!server->ntlmssp.hmacmd5 || IS_ERR(server->ntlmssp.hmacmd5)) { - cERROR(1, "could not allocate crypto hmacmd5\n"); + cERROR(1, "could not allocate master crypto API hmacmd5\n"); return 1; } server->ntlmssp.md5 = crypto_alloc_shash("md5", 0, 0); if (!server->ntlmssp.md5 || IS_ERR(server->ntlmssp.md5)) { - cERROR(1, "could not allocate crypto md5\n"); - rc = 1; - goto cifs_crypto_shash_allocate_ret1; - } - - size = sizeof(struct shash_desc) + - crypto_shash_descsize(server->ntlmssp.hmacmd5); - server->ntlmssp.sdeschmacmd5 = kmalloc(size, GFP_KERNEL); - if (!server->ntlmssp.sdeschmacmd5) { - cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n"); - rc = -ENOMEM; - goto cifs_crypto_shash_allocate_ret2; - } - server->ntlmssp.sdeschmacmd5->shash.tfm = server->ntlmssp.hmacmd5; - server->ntlmssp.sdeschmacmd5->shash.flags = 0x0; - - - size = sizeof(struct shash_desc) + - crypto_shash_descsize(server->ntlmssp.md5); - server->ntlmssp.sdescmd5 = kmalloc(size, GFP_KERNEL); - if (!server->ntlmssp.sdescmd5) { - cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n"); - rc = -ENOMEM; - goto cifs_crypto_shash_allocate_ret3; + crypto_free_shash(server->ntlmssp.hmacmd5); + cERROR(1, "could not allocate master crypto API md5\n"); + return 1; } - server->ntlmssp.sdescmd5->shash.tfm = server->ntlmssp.md5; - server->ntlmssp.sdescmd5->shash.flags = 0x0; return 0; - -cifs_crypto_shash_allocate_ret3: - kfree(server->ntlmssp.sdeschmacmd5); - -cifs_crypto_shash_allocate_ret2: - crypto_free_shash(server->ntlmssp.md5); - -cifs_crypto_shash_allocate_ret1: - crypto_free_shash(server->ntlmssp.hmacmd5); - - return rc; } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index c9d0cfc086e..49563e0c172 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -123,19 +123,12 @@ struct cifs_cred { struct cifs_ace *aces; }; -struct sdesc { - struct shash_desc shash; - char ctx[]; -}; - struct ntlmssp_auth { __u32 client_flags; __u32 server_flags; unsigned char ciphertext[CIFS_CPHTXT_SIZE]; struct crypto_shash *hmacmd5; struct crypto_shash *md5; - struct sdesc *sdeschmacmd5; - struct sdesc *sdescmd5; }; /* -- cgit v1.2.3 From 745e507a9c79c6e1385d3414d5e56f3d4621a375 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 8 Sep 2010 21:09:27 +0000 Subject: Revert "missing changes during ntlmv2/ntlmssp auth and sign" This reverts commit 3ec6bbcdb4e85403f2c5958876ca9492afdf4031. The change to kernel crypto and fixes to ntlvm2 and ntlmssp series, introduced a regression. Deferring this patch series to 2.6.37 after Shirish fixes it. Signed-off-by: Steve French Acked-by: Jeff Layton CC: Shirish Pargaonkar --- fs/cifs/cifsencrypt.c | 2 -- fs/cifs/sess.c | 13 +++++-------- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index eef78c24e0c..051d00011ca 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -553,8 +553,6 @@ calc_seckey(struct TCP_Server_Info *server) return 1; } - desc.tfm = tfm_arc4; - crypto_blkcipher_setkey(tfm_arc4, server->session_key.data.ntlmv2.key, CIFS_CPHTXT_SIZE); sg_init_one(&sgin, sec_key, CIFS_CPHTXT_SIZE); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 4788e16a02c..41fc5328120 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -408,8 +408,6 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, /* BB spec says that if AvId field of MsvAvTimestamp is populated then we must set the MIC field of the AUTHENTICATE_MESSAGE */ - ses->server->ntlmssp.server_flags = le32_to_cpu(pblob->NegotiateFlags); - tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset); tilen = cpu_to_le16(pblob->TargetInfoArray.Length); ses->server->tilen = tilen; @@ -442,13 +440,12 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, /* BB is NTLMV2 session security format easier to use here? */ flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | - NTLMSSP_NEGOTIATE_NTLM; + NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM; if (ses->server->secMode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { - flags |= NTLMSSP_NEGOTIATE_SIGN | - NTLMSSP_NEGOTIATE_KEY_XCH | - NTLMSSP_NEGOTIATE_EXTENDED_SEC; - } + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + flags |= NTLMSSP_NEGOTIATE_SIGN; + if (ses->server->secMode & SECMODE_SIGN_REQUIRED) + flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; sec_blob->NegotiateFlags |= cpu_to_le32(flags); -- cgit v1.2.3 From c8e56f1f4fb9f82f63e4ce6d73a14501d0432c76 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 8 Sep 2010 21:10:58 +0000 Subject: Revert "[CIFS] Fix ntlmv2 auth with ntlmssp" This reverts commit 9fbc590860e75785bdaf8b83e48fabfe4d4f7d58. The change to kernel crypto and fixes to ntlvm2 and ntlmssp series, introduced a regression. Deferring this patch series to 2.6.37 after Shirish fixes it. Signed-off-by: Steve French Acked-by: Jeff Layton CC: Shirish Pargaonkar --- fs/cifs/Kconfig | 2 - fs/cifs/asn1.c | 6 +- fs/cifs/cifsencrypt.c | 416 +++++++++++++++----------------------------------- fs/cifs/cifsglob.h | 18 +-- fs/cifs/cifspdu.h | 7 +- fs/cifs/cifsproto.h | 12 +- fs/cifs/cifssmb.c | 13 +- fs/cifs/connect.c | 13 +- fs/cifs/ntlmssp.h | 13 -- fs/cifs/sess.c | 118 ++++---------- fs/cifs/transport.c | 6 +- 11 files changed, 172 insertions(+), 452 deletions(-) (limited to 'fs') diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 0da1debd499..917b7d449bb 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -2,8 +2,6 @@ config CIFS tristate "CIFS support (advanced network filesystem, SMBFS successor)" depends on INET select NLS - select CRYPTO_MD5 - select CRYPTO_ARC4 help This is the client VFS module for the Common Internet File System (CIFS) protocol which is the successor to the Server Message Block diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index 21f0fbd8698..cfd1ce34e0b 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -597,13 +597,13 @@ decode_negTokenInit(unsigned char *security_blob, int length, if (compare_oid(oid, oidlen, MSKRB5_OID, MSKRB5_OID_LEN)) server->sec_mskerberos = true; - if (compare_oid(oid, oidlen, KRB5U2U_OID, + else if (compare_oid(oid, oidlen, KRB5U2U_OID, KRB5U2U_OID_LEN)) server->sec_kerberosu2u = true; - if (compare_oid(oid, oidlen, KRB5_OID, + else if (compare_oid(oid, oidlen, KRB5_OID, KRB5_OID_LEN)) server->sec_kerberos = true; - if (compare_oid(oid, oidlen, NTLMSSP_OID, + else if (compare_oid(oid, oidlen, NTLMSSP_OID, NTLMSSP_OID_LEN)) server->sec_ntlmssp = true; diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 051d00011ca..847628dfdc4 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -27,7 +27,6 @@ #include "md5.h" #include "cifs_unicode.h" #include "cifsproto.h" -#include "ntlmssp.h" #include #include @@ -43,44 +42,21 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24); static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, - struct TCP_Server_Info *server, char *signature) + const struct mac_key *key, char *signature) { - int rc = 0; - struct { - struct shash_desc shash; - char ctx[crypto_shash_descsize(server->ntlmssp.md5)]; - } sdesc; + struct MD5Context context; - if (cifs_pdu == NULL || server == NULL || signature == NULL) + if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL)) return -EINVAL; - sdesc.shash.tfm = server->ntlmssp.md5; - sdesc.shash.flags = 0x0; - - rc = crypto_shash_init(&sdesc.shash); - if (rc) { - cERROR(1, "could not initialize master crypto API hmacmd5\n"); - return rc; - } - - if (server->secType == RawNTLMSSP) - crypto_shash_update(&sdesc.shash, - server->session_key.data.ntlmv2.key, - CIFS_NTLMV2_SESSKEY_SIZE); - else - crypto_shash_update(&sdesc.shash, - (char *)&server->session_key.data, - server->session_key.len); - - crypto_shash_update(&sdesc.shash, - cifs_pdu->Protocol, cifs_pdu->smb_buf_length); - - rc = crypto_shash_final(&sdesc.shash, signature); + cifs_MD5_init(&context); + cifs_MD5_update(&context, (char *)&key->data, key->len); + cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length); + cifs_MD5_final(signature, &context); return 0; } - int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, __u32 *pexpected_response_sequence_number) { @@ -102,7 +78,8 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, server->sequence_number++; spin_unlock(&GlobalMid_Lock); - rc = cifs_calculate_signature(cifs_pdu, server, smb_signature); + rc = cifs_calculate_signature(cifs_pdu, &server->mac_signing_key, + smb_signature); if (rc) memset(cifs_pdu->Signature.SecuritySignature, 0, 8); else @@ -112,36 +89,16 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, } static int cifs_calc_signature2(const struct kvec *iov, int n_vec, - struct TCP_Server_Info *server, char *signature) + const struct mac_key *key, char *signature) { + struct MD5Context context; int i; - int rc = 0; - struct { - struct shash_desc shash; - char ctx[crypto_shash_descsize(server->ntlmssp.md5)]; - } sdesc; - if (iov == NULL || server == NULL || signature == NULL) + if ((iov == NULL) || (signature == NULL) || (key == NULL)) return -EINVAL; - sdesc.shash.tfm = server->ntlmssp.md5; - sdesc.shash.flags = 0x0; - - rc = crypto_shash_init(&sdesc.shash); - if (rc) { - cERROR(1, "could not initialize master crypto API hmacmd5\n"); - return rc; - } - - if (server->secType == RawNTLMSSP) - crypto_shash_update(&sdesc.shash, - server->session_key.data.ntlmv2.key, - CIFS_NTLMV2_SESSKEY_SIZE); - else - crypto_shash_update(&sdesc.shash, - (char *)&server->session_key.data, - server->session_key.len); - + cifs_MD5_init(&context); + cifs_MD5_update(&context, (char *)&key->data, key->len); for (i = 0; i < n_vec; i++) { if (iov[i].iov_len == 0) continue; @@ -154,18 +111,18 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, if (i == 0) { if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ break; /* nothing to sign or corrupt header */ - crypto_shash_update(&sdesc.shash, - iov[i].iov_base + 4, iov[i].iov_len - 4); + cifs_MD5_update(&context, iov[0].iov_base+4, + iov[0].iov_len-4); } else - crypto_shash_update(&sdesc.shash, - iov[i].iov_base, iov[i].iov_len); + cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len); } - rc = crypto_shash_final(&sdesc.shash, signature); + cifs_MD5_final(signature, &context); return 0; } + int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, __u32 *pexpected_response_sequence_number) { @@ -188,7 +145,8 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, server->sequence_number++; spin_unlock(&GlobalMid_Lock); - rc = cifs_calc_signature2(iov, n_vec, server, smb_signature); + rc = cifs_calc_signature2(iov, n_vec, &server->mac_signing_key, + smb_signature); if (rc) memset(cifs_pdu->Signature.SecuritySignature, 0, 8); else @@ -198,14 +156,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, } int cifs_verify_signature(struct smb_hdr *cifs_pdu, - struct TCP_Server_Info *server, + const struct mac_key *mac_key, __u32 expected_sequence_number) { - int rc; + unsigned int rc; char server_response_sig[8]; char what_we_think_sig_should_be[20]; - if (cifs_pdu == NULL || server == NULL) + if ((cifs_pdu == NULL) || (mac_key == NULL)) return -EINVAL; if (cifs_pdu->Command == SMB_COM_NEGOTIATE) @@ -234,7 +192,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, cpu_to_le32(expected_sequence_number); cifs_pdu->Signature.Sequence.Reserved = 0; - rc = cifs_calculate_signature(cifs_pdu, server, + rc = cifs_calculate_signature(cifs_pdu, mac_key, what_we_think_sig_should_be); if (rc) @@ -251,7 +209,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, } /* We fill in key by putting in 40 byte array which was allocated by caller */ -int cifs_calculate_session_key(struct session_key *key, const char *rn, +int cifs_calculate_mac_key(struct mac_key *key, const char *rn, const char *password) { char temp_key[16]; @@ -265,6 +223,63 @@ int cifs_calculate_session_key(struct session_key *key, const char *rn, return 0; } +int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *ses, + const struct nls_table *nls_info) +{ + char temp_hash[16]; + struct HMACMD5Context ctx; + char *ucase_buf; + __le16 *unicode_buf; + unsigned int i, user_name_len, dom_name_len; + + if (ses == NULL) + return -EINVAL; + + E_md4hash(ses->password, temp_hash); + + hmac_md5_init_limK_to_64(temp_hash, 16, &ctx); + user_name_len = strlen(ses->userName); + if (user_name_len > MAX_USERNAME_SIZE) + return -EINVAL; + if (ses->domainName == NULL) + return -EINVAL; /* BB should we use CIFS_LINUX_DOM */ + dom_name_len = strlen(ses->domainName); + if (dom_name_len > MAX_USERNAME_SIZE) + return -EINVAL; + + ucase_buf = kmalloc((MAX_USERNAME_SIZE+1), GFP_KERNEL); + if (ucase_buf == NULL) + return -ENOMEM; + unicode_buf = kmalloc((MAX_USERNAME_SIZE+1)*4, GFP_KERNEL); + if (unicode_buf == NULL) { + kfree(ucase_buf); + return -ENOMEM; + } + + for (i = 0; i < user_name_len; i++) + ucase_buf[i] = nls_info->charset2upper[(int)ses->userName[i]]; + ucase_buf[i] = 0; + user_name_len = cifs_strtoUCS(unicode_buf, ucase_buf, + MAX_USERNAME_SIZE*2, nls_info); + unicode_buf[user_name_len] = 0; + user_name_len++; + + for (i = 0; i < dom_name_len; i++) + ucase_buf[i] = nls_info->charset2upper[(int)ses->domainName[i]]; + ucase_buf[i] = 0; + dom_name_len = cifs_strtoUCS(unicode_buf+user_name_len, ucase_buf, + MAX_USERNAME_SIZE*2, nls_info); + + unicode_buf[user_name_len + dom_name_len] = 0; + hmac_md5_update((const unsigned char *) unicode_buf, + (user_name_len+dom_name_len)*2, &ctx); + + hmac_md5_final(ses->server->ntlmv2_hash, &ctx); + kfree(ucase_buf); + kfree(unicode_buf); + return 0; +} + #ifdef CONFIG_CIFS_WEAK_PW_HASH void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, char *lnm_session_key) @@ -309,29 +324,21 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses, { int rc = 0; int len; - char nt_hash[CIFS_NTHASH_SIZE]; + char nt_hash[16]; + struct HMACMD5Context *pctxt; wchar_t *user; wchar_t *domain; - wchar_t *server; - struct { - struct shash_desc shash; - char ctx[crypto_shash_descsize(ses->server->ntlmssp.hmacmd5)]; - } sdesc; - /* calculate md4 hash of password */ - E_md4hash(ses->password, nt_hash); + pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL); - sdesc.shash.tfm = ses->server->ntlmssp.hmacmd5; - sdesc.shash.flags = 0x0; + if (pctxt == NULL) + return -ENOMEM; - crypto_shash_setkey(ses->server->ntlmssp.hmacmd5, nt_hash, - CIFS_NTHASH_SIZE); + /* calculate md4 hash of password */ + E_md4hash(ses->password, nt_hash); - rc = crypto_shash_init(&sdesc.shash); - if (rc) { - cERROR(1, "could not initialize master crypto API hmacmd5\n"); - return rc; - } + /* convert Domainname to unicode and uppercase */ + hmac_md5_init_limK_to_64(nt_hash, 16, pctxt); /* convert ses->userName to unicode and uppercase */ len = strlen(ses->userName); @@ -340,8 +347,7 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses, goto calc_exit_2; len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp); UniStrupr(user); - - crypto_shash_update(&sdesc.shash, (char *)user, 2 * len); + hmac_md5_update((char *)user, 2*len, pctxt); /* convert ses->domainName to unicode and uppercase */ if (ses->domainName) { @@ -357,243 +363,65 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses, Maybe converting the domain name earlier makes sense */ /* UniStrupr(domain); */ - crypto_shash_update(&sdesc.shash, (char *)domain, 2 * len); + hmac_md5_update((char *)domain, 2*len, pctxt); kfree(domain); - } else if (ses->serverName) { - len = strlen(ses->serverName); - - server = kmalloc(2 + (len * 2), GFP_KERNEL); - if (server == NULL) - goto calc_exit_1; - len = cifs_strtoUCS((__le16 *)server, ses->serverName, len, - nls_cp); - /* the following line was removed since it didn't work well - with lower cased domain name that passed as an option. - Maybe converting the domain name earlier makes sense */ - /* UniStrupr(domain); */ - - crypto_shash_update(&sdesc.shash, (char *)server, 2 * len); - - kfree(server); } calc_exit_1: kfree(user); calc_exit_2: /* BB FIXME what about bytes 24 through 40 of the signing key? compare with the NTLM example */ - rc = crypto_shash_final(&sdesc.shash, ses->server->ntlmv2_hash); - - return rc; -} - -static int -find_domain_name(struct cifsSesInfo *ses) -{ - int rc = 0; - unsigned int attrsize; - unsigned int type; - unsigned char *blobptr; - struct ntlmssp2_name *attrptr; - - if (ses->server->tiblob) { - blobptr = ses->server->tiblob; - attrptr = (struct ntlmssp2_name *) blobptr; - - while ((type = attrptr->type) != 0) { - blobptr += 2; /* advance attr type */ - attrsize = attrptr->length; - blobptr += 2; /* advance attr size */ - if (type == NTLMSSP_AV_NB_DOMAIN_NAME) { - if (!ses->domainName) { - ses->domainName = - kmalloc(attrptr->length + 1, - GFP_KERNEL); - if (!ses->domainName) - return -ENOMEM; - cifs_from_ucs2(ses->domainName, - (__le16 *)blobptr, - attrptr->length, - attrptr->length, - load_nls_default(), false); - } - } - blobptr += attrsize; /* advance attr value */ - attrptr = (struct ntlmssp2_name *) blobptr; - } - } else { - ses->server->tilen = 2 * sizeof(struct ntlmssp2_name); - ses->server->tiblob = kmalloc(ses->server->tilen, GFP_KERNEL); - if (!ses->server->tiblob) { - ses->server->tilen = 0; - cERROR(1, "Challenge target info allocation failure"); - return -ENOMEM; - } - memset(ses->server->tiblob, 0x0, ses->server->tilen); - attrptr = (struct ntlmssp2_name *) ses->server->tiblob; - attrptr->type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE); - } + hmac_md5_final(ses->server->ntlmv2_hash, pctxt); + kfree(pctxt); return rc; } -static int -CalcNTLMv2_response(const struct TCP_Server_Info *server, - char *v2_session_response) -{ - int rc; - struct { - struct shash_desc shash; - char ctx[crypto_shash_descsize(server->ntlmssp.hmacmd5)]; - } sdesc; - - sdesc.shash.tfm = server->ntlmssp.hmacmd5; - sdesc.shash.flags = 0x0; - - crypto_shash_setkey(server->ntlmssp.hmacmd5, server->ntlmv2_hash, - CIFS_HMAC_MD5_HASH_SIZE); - - rc = crypto_shash_init(&sdesc.shash); - if (rc) { - cERROR(1, "could not initialize master crypto API hmacmd5\n"); - return rc; - } - - memcpy(v2_session_response + CIFS_SERVER_CHALLENGE_SIZE, - server->cryptKey, CIFS_SERVER_CHALLENGE_SIZE); - crypto_shash_update(&sdesc.shash, - v2_session_response + CIFS_SERVER_CHALLENGE_SIZE, - sizeof(struct ntlmv2_resp) - CIFS_SERVER_CHALLENGE_SIZE); - - if (server->tilen) - crypto_shash_update(&sdesc.shash, - server->tiblob, server->tilen); - - rc = crypto_shash_final(&sdesc.shash, v2_session_response); - - return rc; -} - -int -setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf, +void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf, const struct nls_table *nls_cp) { - int rc = 0; + int rc; struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf; - struct { - struct shash_desc shash; - char ctx[crypto_shash_descsize(ses->server->ntlmssp.hmacmd5)]; - } sdesc; + struct HMACMD5Context context; buf->blob_signature = cpu_to_le32(0x00000101); buf->reserved = 0; buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); buf->reserved2 = 0; - - if (!ses->domainName) { - rc = find_domain_name(ses); - if (rc) { - cERROR(1, "could not get domain/server name rc %d", rc); - return rc; - } - } + buf->names[0].type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE); + buf->names[0].length = 0; + buf->names[1].type = 0; + buf->names[1].length = 0; /* calculate buf->ntlmv2_hash */ rc = calc_ntlmv2_hash(ses, nls_cp); - if (rc) { - cERROR(1, "could not get v2 hash rc %d", rc); - return rc; - } - rc = CalcNTLMv2_response(ses->server, resp_buf); - if (rc) { + if (rc) cERROR(1, "could not get v2 hash rc %d", rc); - return rc; - } - - crypto_shash_setkey(ses->server->ntlmssp.hmacmd5, - ses->server->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); - - sdesc.shash.tfm = ses->server->ntlmssp.hmacmd5; - sdesc.shash.flags = 0x0; - - rc = crypto_shash_init(&sdesc.shash); - if (rc) { - cERROR(1, "could not initialize master crypto API hmacmd5\n"); - return rc; - } - - crypto_shash_update(&sdesc.shash, resp_buf, CIFS_HMAC_MD5_HASH_SIZE); + CalcNTLMv2_response(ses, resp_buf); - rc = crypto_shash_final(&sdesc.shash, - ses->server->session_key.data.ntlmv2.key); + /* now calculate the MAC key for NTLMv2 */ + hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context); + hmac_md5_update(resp_buf, 16, &context); + hmac_md5_final(ses->server->mac_signing_key.data.ntlmv2.key, &context); - memcpy(&ses->server->session_key.data.ntlmv2.resp, resp_buf, - sizeof(struct ntlmv2_resp)); - ses->server->session_key.len = 16 + sizeof(struct ntlmv2_resp); - - return rc; + memcpy(&ses->server->mac_signing_key.data.ntlmv2.resp, resp_buf, + sizeof(struct ntlmv2_resp)); + ses->server->mac_signing_key.len = 16 + sizeof(struct ntlmv2_resp); } -int -calc_seckey(struct TCP_Server_Info *server) -{ - int rc; - unsigned char sec_key[CIFS_NTLMV2_SESSKEY_SIZE]; - struct crypto_blkcipher *tfm_arc4; - struct scatterlist sgin, sgout; - struct blkcipher_desc desc; - - get_random_bytes(sec_key, CIFS_NTLMV2_SESSKEY_SIZE); - - tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", - 0, CRYPTO_ALG_ASYNC); - if (!tfm_arc4 || IS_ERR(tfm_arc4)) { - cERROR(1, "could not allocate " "master crypto API arc4\n"); - return 1; - } - - crypto_blkcipher_setkey(tfm_arc4, - server->session_key.data.ntlmv2.key, CIFS_CPHTXT_SIZE); - sg_init_one(&sgin, sec_key, CIFS_CPHTXT_SIZE); - sg_init_one(&sgout, server->ntlmssp.ciphertext, CIFS_CPHTXT_SIZE); - rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE); - - if (!rc) - memcpy(server->session_key.data.ntlmv2.key, - sec_key, CIFS_NTLMV2_SESSKEY_SIZE); - - crypto_free_blkcipher(tfm_arc4); - - return 0; -} - -void -cifs_crypto_shash_release(struct TCP_Server_Info *server) -{ - if (server->ntlmssp.md5) - crypto_free_shash(server->ntlmssp.md5); - - if (server->ntlmssp.hmacmd5) - crypto_free_shash(server->ntlmssp.hmacmd5); -} - -int -cifs_crypto_shash_allocate(struct TCP_Server_Info *server) +void CalcNTLMv2_response(const struct cifsSesInfo *ses, + char *v2_session_response) { - server->ntlmssp.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); - if (!server->ntlmssp.hmacmd5 || - IS_ERR(server->ntlmssp.hmacmd5)) { - cERROR(1, "could not allocate master crypto API hmacmd5\n"); - return 1; - } + struct HMACMD5Context context; + /* rest of v2 struct already generated */ + memcpy(v2_session_response + 8, ses->server->cryptKey, 8); + hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context); - server->ntlmssp.md5 = crypto_alloc_shash("md5", 0, 0); - if (!server->ntlmssp.md5 || IS_ERR(server->ntlmssp.md5)) { - crypto_free_shash(server->ntlmssp.hmacmd5); - cERROR(1, "could not allocate master crypto API md5\n"); - return 1; - } + hmac_md5_update(v2_session_response+8, + sizeof(struct ntlmv2_resp) - 8, &context); - return 0; + hmac_md5_final(v2_session_response, &context); +/* cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */ } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 49563e0c172..0cdfb8c32ac 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -25,9 +25,6 @@ #include #include "cifs_fs_sb.h" #include "cifsacl.h" -#include -#include - /* * The sizes of various internal tables and strings */ @@ -100,7 +97,7 @@ enum protocolEnum { /* Netbios frames protocol not supported at this time */ }; -struct session_key { +struct mac_key { unsigned int len; union { char ntlm[CIFS_SESS_KEY_SIZE + 16]; @@ -123,14 +120,6 @@ struct cifs_cred { struct cifs_ace *aces; }; -struct ntlmssp_auth { - __u32 client_flags; - __u32 server_flags; - unsigned char ciphertext[CIFS_CPHTXT_SIZE]; - struct crypto_shash *hmacmd5; - struct crypto_shash *md5; -}; - /* ***************************************************************** * Except the CIFS PDUs themselves all the @@ -193,14 +182,11 @@ struct TCP_Server_Info { /* 16th byte of RFC1001 workstation name is always null */ char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; __u32 sequence_number; /* needed for CIFS PDU signature */ - struct session_key session_key; + struct mac_key mac_signing_key; char ntlmv2_hash[16]; unsigned long lstrp; /* when we got last response from this server */ u16 dialect; /* dialect index that server chose */ /* extended security flavors that server supports */ - unsigned int tilen; /* length of the target info blob */ - unsigned char *tiblob; /* target info blob in challenge response */ - struct ntlmssp_auth ntlmssp; /* various keys, ciphers, flags */ bool sec_kerberos; /* supports plain Kerberos */ bool sec_mskerberos; /* supports legacy MS Kerberos */ bool sec_kerberosu2u; /* supports U2U Kerberos */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 320e0fd0ba7..14d036d8db1 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -134,12 +134,6 @@ * Size of the session key (crypto key encrypted with the password */ #define CIFS_SESS_KEY_SIZE (24) -#define CIFS_CLIENT_CHALLENGE_SIZE (8) -#define CIFS_SERVER_CHALLENGE_SIZE (8) -#define CIFS_HMAC_MD5_HASH_SIZE (16) -#define CIFS_CPHTXT_SIZE (16) -#define CIFS_NTLMV2_SESSKEY_SIZE (16) -#define CIFS_NTHASH_SIZE (16) /* * Maximum user name length @@ -669,6 +663,7 @@ struct ntlmv2_resp { __le64 time; __u64 client_chal; /* random */ __u32 reserved2; + struct ntlmssp2_name names[2]; /* array of name entries could follow ending in minimum 4 byte struct */ } __attribute__((packed)); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 1378d913384..1f545081408 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -361,15 +361,15 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, __u32 *); extern int cifs_verify_signature(struct smb_hdr *, - struct TCP_Server_Info *server, + const struct mac_key *mac_key, __u32 expected_sequence_number); -extern int cifs_calculate_session_key(struct session_key *key, const char *rn, +extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn, const char *pass); -extern int setup_ntlmv2_rsp(struct cifsSesInfo *, char *, +extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *, + const struct nls_table *); +extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *); +extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *, const struct nls_table *); -extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); -extern void cifs_crypto_shash_release(struct TCP_Server_Info *); -extern int calc_seckey(struct TCP_Server_Info *); #ifdef CONFIG_CIFS_WEAK_PW_HASH extern void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, char *lnm_session_key); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4bda920d1f7..c65c3419dd3 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -604,14 +604,11 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) else rc = -EINVAL; - if (server->secType == Kerberos) { - if (!server->sec_kerberos && - !server->sec_mskerberos) - rc = -EOPNOTSUPP; - } else if (server->secType == RawNTLMSSP) { - if (!server->sec_ntlmssp) - rc = -EOPNOTSUPP; - } else + if (server->sec_kerberos || server->sec_mskerberos) + server->secType = Kerberos; + else if (server->sec_ntlmssp) + server->secType = RawNTLMSSP; + else rc = -EOPNOTSUPP; } } else diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ec0ea4a43bd..0ea52e9f906 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1708,7 +1708,6 @@ cifs_put_smb_ses(struct cifsSesInfo *ses) CIFSSMBLogoff(xid, ses); _FreeXid(xid); } - cifs_crypto_shash_release(server); sesInfoFree(ses); cifs_put_tcp_session(server); } @@ -1788,23 +1787,13 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) ses->linux_uid = volume_info->linux_uid; ses->overrideSecFlg = volume_info->secFlg; - rc = cifs_crypto_shash_allocate(server); - if (rc) { - cERROR(1, "could not setup hash structures rc %d", rc); - goto get_ses_fail; - } - server->tilen = 0; - server->tiblob = NULL; - mutex_lock(&ses->session_mutex); rc = cifs_negotiate_protocol(xid, ses); if (!rc) rc = cifs_setup_session(xid, ses, volume_info->local_nls); mutex_unlock(&ses->session_mutex); - if (rc) { - cifs_crypto_shash_release(ses->server); + if (rc) goto get_ses_fail; - } /* success, put it on the list */ write_lock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index 1db0f0746a5..49c9a4e7531 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -61,19 +61,6 @@ #define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000 #define NTLMSSP_NEGOTIATE_56 0x80000000 -/* Define AV Pair Field IDs */ -#define NTLMSSP_AV_EOL 0 -#define NTLMSSP_AV_NB_COMPUTER_NAME 1 -#define NTLMSSP_AV_NB_DOMAIN_NAME 2 -#define NTLMSSP_AV_DNS_COMPUTER_NAME 3 -#define NTLMSSP_AV_DNS_DOMAIN_NAME 4 -#define NTLMSSP_AV_DNS_TREE_NAME 5 -#define NTLMSSP_AV_FLAGS 6 -#define NTLMSSP_AV_TIMESTAMP 7 -#define NTLMSSP_AV_RESTRICTION 8 -#define NTLMSSP_AV_TARGET_NAME 9 -#define NTLMSSP_AV_CHANNEL_BINDINGS 10 - /* Although typedefs are not commonly used for structure definitions */ /* in the Linux kernel, in this particular case they are useful */ /* to more closely match the standards document for NTLMSSP from */ diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 41fc5328120..0a57cb7db5d 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -383,9 +383,6 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft, static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, struct cifsSesInfo *ses) { - unsigned int tioffset; /* challeng message target info area */ - unsigned int tilen; /* challeng message target info area length */ - CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; if (blob_len < sizeof(CHALLENGE_MESSAGE)) { @@ -408,18 +405,6 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, /* BB spec says that if AvId field of MsvAvTimestamp is populated then we must set the MIC field of the AUTHENTICATE_MESSAGE */ - tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset); - tilen = cpu_to_le16(pblob->TargetInfoArray.Length); - ses->server->tilen = tilen; - if (tilen) { - ses->server->tiblob = kmalloc(tilen, GFP_KERNEL); - if (!ses->server->tiblob) { - cERROR(1, "Challenge target info allocation failure"); - return -ENOMEM; - } - memcpy(ses->server->tiblob, bcc_ptr + tioffset, tilen); - } - return 0; } @@ -466,12 +451,10 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, struct cifsSesInfo *ses, const struct nls_table *nls_cp, bool first) { - int rc; - unsigned int size; AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; __u32 flags; unsigned char *tmp; - struct ntlmv2_resp ntlmv2_response = {}; + char ntlm_session_key[CIFS_SESS_KEY_SIZE]; memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); sec_blob->MessageType = NtLmAuthenticate; @@ -494,25 +477,19 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, sec_blob->LmChallengeResponse.Length = 0; sec_blob->LmChallengeResponse.MaximumLength = 0; - sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); - rc = setup_ntlmv2_rsp(ses, (char *)&ntlmv2_response, nls_cp); - if (rc) { - cERROR(1, "error rc: %d during ntlmssp ntlmv2 setup", rc); - goto setup_ntlmv2_ret; - } - size = sizeof(struct ntlmv2_resp); - memcpy(tmp, (char *)&ntlmv2_response, size); - tmp += size; - if (ses->server->tilen > 0) { - memcpy(tmp, ses->server->tiblob, ses->server->tilen); - tmp += ses->server->tilen; - } else - ses->server->tilen = 0; + /* calculate session key, BB what about adding similar ntlmv2 path? */ + SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key); + if (first) + cifs_calculate_mac_key(&ses->server->mac_signing_key, + ntlm_session_key, ses->password); - sec_blob->NtChallengeResponse.Length = cpu_to_le16(size + - ses->server->tilen); + memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE); + sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->NtChallengeResponse.Length = cpu_to_le16(CIFS_SESS_KEY_SIZE); sec_blob->NtChallengeResponse.MaximumLength = - cpu_to_le16(size + ses->server->tilen); + cpu_to_le16(CIFS_SESS_KEY_SIZE); + + tmp += CIFS_SESS_KEY_SIZE; if (ses->domainName == NULL) { sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); @@ -524,6 +501,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, len = cifs_strtoUCS((__le16 *)tmp, ses->domainName, MAX_USERNAME_SIZE, nls_cp); len *= 2; /* unicode is 2 bytes each */ + len += 2; /* trailing null */ sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); sec_blob->DomainName.Length = cpu_to_le16(len); sec_blob->DomainName.MaximumLength = cpu_to_le16(len); @@ -540,6 +518,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, len = cifs_strtoUCS((__le16 *)tmp, ses->userName, MAX_USERNAME_SIZE, nls_cp); len *= 2; /* unicode is 2 bytes each */ + len += 2; /* trailing null */ sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); sec_blob->UserName.Length = cpu_to_le16(len); sec_blob->UserName.MaximumLength = cpu_to_le16(len); @@ -551,26 +530,9 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, sec_blob->WorkstationName.MaximumLength = 0; tmp += 2; - if ((ses->server->ntlmssp.server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) && - !calc_seckey(ses->server)) { - memcpy(tmp, ses->server->ntlmssp.ciphertext, CIFS_CPHTXT_SIZE); - sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); - sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE); - sec_blob->SessionKey.MaximumLength = - cpu_to_le16(CIFS_CPHTXT_SIZE); - tmp += CIFS_CPHTXT_SIZE; - } else { - sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); - sec_blob->SessionKey.Length = 0; - sec_blob->SessionKey.MaximumLength = 0; - } - - ses->server->sequence_number = 0; - -setup_ntlmv2_ret: - if (ses->server->tilen > 0) - kfree(ses->server->tiblob); - + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.Length = 0; + sec_blob->SessionKey.MaximumLength = 0; return tmp - pbuffer; } @@ -584,14 +546,15 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB, return; } -static int setup_ntlmssp_auth_req(char *ntlmsspblob, +static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB, struct cifsSesInfo *ses, const struct nls_table *nls, bool first_time) { int bloblen; - bloblen = build_ntlmssp_auth_blob(ntlmsspblob, ses, nls, + bloblen = build_ntlmssp_auth_blob(&pSMB->req.SecurityBlob[0], ses, nls, first_time); + pSMB->req.SecurityBlobLength = cpu_to_le16(bloblen); return bloblen; } @@ -617,7 +580,6 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, struct key *spnego_key = NULL; __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ bool first_time; - char *ntlmsspblob; if (ses == NULL) return -EINVAL; @@ -728,7 +690,7 @@ ssetup_ntlmssp_authenticate: if (first_time) /* should this be moved into common code with similar ntlmv2 path? */ - cifs_calculate_session_key(&ses->server->session_key, + cifs_calculate_mac_key(&ses->server->mac_signing_key, ntlm_session_key, ses->password); /* copy session key */ @@ -767,21 +729,12 @@ ssetup_ntlmssp_authenticate: cpu_to_le16(sizeof(struct ntlmv2_resp)); /* calculate session key */ - rc = setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp); - if (rc) { - kfree(v2_sess_key); - goto ssetup_exit; - } + setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp); /* FIXME: calculate MAC key */ memcpy(bcc_ptr, (char *)v2_sess_key, sizeof(struct ntlmv2_resp)); bcc_ptr += sizeof(struct ntlmv2_resp); kfree(v2_sess_key); - if (ses->server->tilen > 0) { - memcpy(bcc_ptr, ses->server->tiblob, - ses->server->tilen); - bcc_ptr += ses->server->tilen; - } if (ses->capabilities & CAP_UNICODE) { if (iov[0].iov_len % 2) { *bcc_ptr = 0; @@ -812,15 +765,15 @@ ssetup_ntlmssp_authenticate: } /* bail out if key is too long */ if (msg->sesskey_len > - sizeof(ses->server->session_key.data.krb5)) { + sizeof(ses->server->mac_signing_key.data.krb5)) { cERROR(1, "Kerberos signing key too long (%u bytes)", msg->sesskey_len); rc = -EOVERFLOW; goto ssetup_exit; } if (first_time) { - ses->server->session_key.len = msg->sesskey_len; - memcpy(ses->server->session_key.data.krb5, + ses->server->mac_signing_key.len = msg->sesskey_len; + memcpy(ses->server->mac_signing_key.data.krb5, msg->data, msg->sesskey_len); } pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; @@ -862,26 +815,12 @@ ssetup_ntlmssp_authenticate: if (phase == NtLmNegotiate) { setup_ntlmssp_neg_req(pSMB, ses); iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); - iov[1].iov_base = &pSMB->req.SecurityBlob[0]; } else if (phase == NtLmAuthenticate) { int blob_len; - ntlmsspblob = kmalloc(5 * - sizeof(struct _AUTHENTICATE_MESSAGE), - GFP_KERNEL); - if (!ntlmsspblob) { - cERROR(1, "Can't allocate NTLMSSP"); - rc = -ENOMEM; - goto ssetup_exit; - } - - blob_len = setup_ntlmssp_auth_req(ntlmsspblob, - ses, - nls_cp, - first_time); + blob_len = setup_ntlmssp_auth_req(pSMB, ses, + nls_cp, + first_time); iov[1].iov_len = blob_len; - iov[1].iov_base = ntlmsspblob; - pSMB->req.SecurityBlobLength = - cpu_to_le16(blob_len); /* Make sure that we tell the server that we are using the uid that it just gave us back on the response (challenge) */ @@ -891,6 +830,7 @@ ssetup_ntlmssp_authenticate: rc = -ENOSYS; goto ssetup_exit; } + iov[1].iov_base = &pSMB->req.SecurityBlob[0]; /* unicode strings must be word aligned */ if ((iov[0].iov_len + iov[1].iov_len) % 2) { *bcc_ptr = 0; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index e0588cdf4cc..82f78c4d697 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -543,7 +543,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))) { rc = cifs_verify_signature(midQ->resp_buf, - ses->server, + &ses->server->mac_signing_key, midQ->sequence_number+1); if (rc) { cERROR(1, "Unexpected SMB signature"); @@ -731,7 +731,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))) { rc = cifs_verify_signature(out_buf, - ses->server, + &ses->server->mac_signing_key, midQ->sequence_number+1); if (rc) { cERROR(1, "Unexpected SMB signature"); @@ -981,7 +981,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))) { rc = cifs_verify_signature(out_buf, - ses->server, + &ses->server->mac_signing_key, midQ->sequence_number+1); if (rc) { cERROR(1, "Unexpected SMB signature"); -- cgit v1.2.3 From 639e7a913d81f918bfbf506e6ecd54664f787cbd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 3 Sep 2010 11:50:09 -0400 Subject: cifs: eliminate redundant xdev check in cifs_rename The VFS always checks that the source and target of a rename are on the same vfsmount, and hence have the same superblock. So, this check is redundant. Remove it and simplify the error handling. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 86a164f08a7..93f77d438d3 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1462,28 +1462,17 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry, { char *fromName = NULL; char *toName = NULL; - struct cifs_sb_info *cifs_sb_source; - struct cifs_sb_info *cifs_sb_target; + struct cifs_sb_info *cifs_sb; struct cifsTconInfo *tcon; FILE_UNIX_BASIC_INFO *info_buf_source = NULL; FILE_UNIX_BASIC_INFO *info_buf_target; int xid, rc, tmprc; - cifs_sb_target = CIFS_SB(target_dir->i_sb); - cifs_sb_source = CIFS_SB(source_dir->i_sb); - tcon = cifs_sb_source->tcon; + cifs_sb = CIFS_SB(source_dir->i_sb); + tcon = cifs_sb->tcon; xid = GetXid(); - /* - * BB: this might be allowed if same server, but different share. - * Consider adding support for this - */ - if (tcon != cifs_sb_target->tcon) { - rc = -EXDEV; - goto cifs_rename_exit; - } - /* * we already have the rename sem so we do not need to * grab it again here to protect the path integrity @@ -1519,17 +1508,16 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry, info_buf_target = info_buf_source + 1; tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName, info_buf_source, - cifs_sb_source->local_nls, - cifs_sb_source->mnt_cifs_flags & + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (tmprc != 0) goto unlink_target; - tmprc = CIFSSMBUnixQPathInfo(xid, tcon, - toName, info_buf_target, - cifs_sb_target->local_nls, - /* remap based on source sb */ - cifs_sb_source->mnt_cifs_flags & + tmprc = CIFSSMBUnixQPathInfo(xid, tcon, toName, + info_buf_target, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (tmprc == 0 && (info_buf_source->UniqueId == -- cgit v1.2.3 From 4266d9118f85b050a341992f0cfab40d392ef32c Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 8 Sep 2010 21:17:29 +0000 Subject: [CIFS] ntlmv2/ntlmssp remove-unused-function CalcNTLMv2_partial_mac_key This function is not used, so remove the definition and declaration. Reviewed-by: Jeff Layton Signed-off-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/cifsencrypt.c | 57 --------------------------------------------------- fs/cifs/cifsproto.h | 2 -- 2 files changed, 59 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 847628dfdc4..35042d8f733 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -223,63 +223,6 @@ int cifs_calculate_mac_key(struct mac_key *key, const char *rn, return 0; } -int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *ses, - const struct nls_table *nls_info) -{ - char temp_hash[16]; - struct HMACMD5Context ctx; - char *ucase_buf; - __le16 *unicode_buf; - unsigned int i, user_name_len, dom_name_len; - - if (ses == NULL) - return -EINVAL; - - E_md4hash(ses->password, temp_hash); - - hmac_md5_init_limK_to_64(temp_hash, 16, &ctx); - user_name_len = strlen(ses->userName); - if (user_name_len > MAX_USERNAME_SIZE) - return -EINVAL; - if (ses->domainName == NULL) - return -EINVAL; /* BB should we use CIFS_LINUX_DOM */ - dom_name_len = strlen(ses->domainName); - if (dom_name_len > MAX_USERNAME_SIZE) - return -EINVAL; - - ucase_buf = kmalloc((MAX_USERNAME_SIZE+1), GFP_KERNEL); - if (ucase_buf == NULL) - return -ENOMEM; - unicode_buf = kmalloc((MAX_USERNAME_SIZE+1)*4, GFP_KERNEL); - if (unicode_buf == NULL) { - kfree(ucase_buf); - return -ENOMEM; - } - - for (i = 0; i < user_name_len; i++) - ucase_buf[i] = nls_info->charset2upper[(int)ses->userName[i]]; - ucase_buf[i] = 0; - user_name_len = cifs_strtoUCS(unicode_buf, ucase_buf, - MAX_USERNAME_SIZE*2, nls_info); - unicode_buf[user_name_len] = 0; - user_name_len++; - - for (i = 0; i < dom_name_len; i++) - ucase_buf[i] = nls_info->charset2upper[(int)ses->domainName[i]]; - ucase_buf[i] = 0; - dom_name_len = cifs_strtoUCS(unicode_buf+user_name_len, ucase_buf, - MAX_USERNAME_SIZE*2, nls_info); - - unicode_buf[user_name_len + dom_name_len] = 0; - hmac_md5_update((const unsigned char *) unicode_buf, - (user_name_len+dom_name_len)*2, &ctx); - - hmac_md5_final(ses->server->ntlmv2_hash, &ctx); - kfree(ucase_buf); - kfree(unicode_buf); - return 0; -} - #ifdef CONFIG_CIFS_WEAK_PW_HASH void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, char *lnm_session_key) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 1f545081408..f399b16cb7d 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -365,8 +365,6 @@ extern int cifs_verify_signature(struct smb_hdr *, __u32 expected_sequence_number); extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn, const char *pass); -extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *, - const struct nls_table *); extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *); extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *, const struct nls_table *); -- cgit v1.2.3 From 522bbe65a2415fabce618186fc7777eb4c502989 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 3 Sep 2010 12:00:49 -0400 Subject: cifs: prevent cifsd from exiting prematurely When cifs_demultiplex_thread exits, it does a number of cleanup tasks including freeing the TCP_Server_Info struct. Much of the existing code in cifs assumes that when there is a cisfSesInfo struct, that it holds a reference to a valid TCP_Server_Info struct. We can never allow cifsd to exit when a cifsSesInfo struct is still holding a reference to the server. The server pointers will then point to freed memory. This patch eliminates a couple of questionable conditions where it does this. The idea here is to make an -EINTR return from kernel_recvmsg behave the same way as -ERESTARTSYS or -EAGAIN. If the task was signalled from cifs_put_tcp_session, then tcpStatus will be CifsExiting, and the kernel_recvmsg call will return quickly. There's also another condition where this can occur too -- if the tcpStatus is still in CifsNew, then it will also exit if the server closes the socket prematurely. I think we'll probably also need to fix that situation, but that requires a bit more consideration. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0ea52e9f906..5f68b968faa 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -400,7 +400,9 @@ incomplete_rcv: cFYI(1, "call to reconnect done"); csocket = server->ssocket; continue; - } else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) { + } else if (length == -ERESTARTSYS || + length == -EAGAIN || + length == -EINTR) { msleep(1); /* minimum sleep to prevent looping allowing socket to clear and app threads to set tcpStatus CifsNeedReconnect if server hung */ @@ -422,10 +424,6 @@ incomplete_rcv: and so simply return error to mount */ break; } - if (!try_to_freeze() && (length == -EINTR)) { - cFYI(1, "cifsd thread killed"); - break; - } cFYI(1, "Reconnect after unexpected peek error %d", length); cifs_reconnect(server); @@ -522,8 +520,7 @@ incomplete_rcv: total_read += length) { length = kernel_recvmsg(csocket, &smb_msg, &iov, 1, pdu_length - total_read, 0); - if ((server->tcpStatus == CifsExiting) || - (length == -EINTR)) { + if (server->tcpStatus == CifsExiting) { /* then will exit */ reconnect = 2; break; @@ -534,8 +531,9 @@ incomplete_rcv: /* Now we will reread sock */ reconnect = 1; break; - } else if ((length == -ERESTARTSYS) || - (length == -EAGAIN)) { + } else if (length == -ERESTARTSYS || + length == -EAGAIN || + length == -EINTR) { msleep(1); /* minimum sleep to prevent looping, allowing socket to clear and app threads to set tcpStatus -- cgit v1.2.3 From 7332f2a6217ee6925f83ef0e725013067ed316ba Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 3 Sep 2010 12:00:49 -0400 Subject: cifs: eliminate some more premature cifsd exits If the tcpStatus is still CifsNew, the main cifs_demultiplex_loop can break out prematurely in some cases. This is wrong as we will almost always have other structures with pointers to the TCP_Server_Info. If the main loop breaks under any other condition other than tcpStatus == CifsExiting, then it'll face a use-after-free situation. I don't see any reason to treat a CifsNew tcpStatus differently than CifsGood. I believe we'll still want to attempt to reconnect in either case. What should happen in those situations is that the MIDs get marked as MID_RETRY_NEEDED. This will make CIFSSMBNegotiate return -EAGAIN, and then the caller can retry the whole thing on a newly reconnected socket. If that fails again in the same way, the caller of cifs_get_smb_ses should tear down the TCP_Server_Info struct. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 41 ++++++++++++----------------------------- 1 file changed, 12 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5f68b968faa..5fde83f0c75 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -416,14 +416,6 @@ incomplete_rcv: } else continue; } else if (length <= 0) { - if (server->tcpStatus == CifsNew) { - cFYI(1, "tcp session abend after SMBnegprot"); - /* some servers kill the TCP session rather than - returning an SMB negprot error, in which - case reconnecting here is not going to help, - and so simply return error to mount */ - break; - } cFYI(1, "Reconnect after unexpected peek error %d", length); cifs_reconnect(server); @@ -464,27 +456,18 @@ incomplete_rcv: an error on SMB negprot response */ cFYI(1, "Negative RFC1002 Session Response Error 0x%x)", pdu_length); - if (server->tcpStatus == CifsNew) { - /* if nack on negprot (rather than - ret of smb negprot error) reconnecting - not going to help, ret error to mount */ - break; - } else { - /* give server a second to - clean up before reconnect attempt */ - msleep(1000); - /* always try 445 first on reconnect - since we get NACK on some if we ever - connected to port 139 (the NACK is - since we do not begin with RFC1001 - session initialize frame) */ - server->addr.sockAddr.sin_port = - htons(CIFS_PORT); - cifs_reconnect(server); - csocket = server->ssocket; - wake_up(&server->response_q); - continue; - } + /* give server a second to clean up */ + msleep(1000); + /* always try 445 first on reconnect since we get NACK + * on some if we ever connected to port 139 (the NACK + * is since we do not begin with RFC1001 session + * initialize frame) + */ + server->addr.sockAddr.sin_port = htons(CIFS_PORT); + cifs_reconnect(server); + csocket = server->ssocket; + wake_up(&server->response_q); + continue; } else if (temp != (char) 0) { cERROR(1, "Unknown RFC 1002 frame"); cifs_dump_mem(" Received Data: ", (char *)smb_buffer, -- cgit v1.2.3 From 32670396e7fc6e4f37451a69339968985461a374 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 3 Sep 2010 12:00:50 -0400 Subject: cifs: prevent possible memory corruption in cifs_demultiplex_thread cifs_demultiplex_thread sets the addr.sockAddr.sin_port without any regard for the socket family. While it may be that the error in question here never occurs on an IPv6 socket, it's probably best to be safe and set the port properly if it ever does. Break the port setting code out of cifs_fill_sockaddr and into a new function, and call that from cifs_demultiplex_thread. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 3 ++- fs/cifs/connect.c | 3 ++- fs/cifs/netmisc.c | 22 +++++++++++++--------- 3 files changed, 17 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index f399b16cb7d..1d60c655e3e 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -87,8 +87,9 @@ extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); +extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port); extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, - unsigned short int port); + const unsigned short int port); extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr); extern void header_assemble(struct smb_hdr *, char /* command */ , const struct cifsTconInfo *, int /* length of diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5fde83f0c75..67dad54fbfa 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -463,7 +463,8 @@ incomplete_rcv: * is since we do not begin with RFC1001 session * initialize frame) */ - server->addr.sockAddr.sin_port = htons(CIFS_PORT); + cifs_set_port((struct sockaddr *) + &server->addr.sockAddr, CIFS_PORT); cifs_reconnect(server); csocket = server->ssocket; wake_up(&server->response_q); diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index f97851119e6..9aad47a2d62 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -206,26 +206,30 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len) } int -cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, - const unsigned short int port) +cifs_set_port(struct sockaddr *addr, const unsigned short int port) { - if (!cifs_convert_address(dst, src, len)) - return 0; - - switch (dst->sa_family) { + switch (addr->sa_family) { case AF_INET: - ((struct sockaddr_in *)dst)->sin_port = htons(port); + ((struct sockaddr_in *)addr)->sin_port = htons(port); break; case AF_INET6: - ((struct sockaddr_in6 *)dst)->sin6_port = htons(port); + ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); break; default: return 0; } - return 1; } +int +cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, + const unsigned short int port) +{ + if (!cifs_convert_address(dst, src, len)) + return 0; + return cifs_set_port(dst, port); +} + /***************************************************************************** convert a NT status code to a dos class/code *****************************************************************************/ -- cgit v1.2.3 From 7a801ac6f5067539ceb5fad0fe90ec49fc156e47 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Thu, 9 Sep 2010 16:37:33 -0700 Subject: O_DIRECT: fix the splitting up of contiguous I/O commit c2c6ca4 (direct-io: do not merge logically non-contiguous requests) introduced a bug whereby all O_DIRECT I/Os were submitted a page at a time to the block layer. The problem is that the code expected dio->block_in_file to correspond to the current page in the dio. In fact, it corresponds to the previous page submitted via submit_page_section. This was purely an oversight, as the dio->cur_page_fs_offset field was introduced for just this purpose. This patch simply uses the correct variable when calculating whether there is a mismatch between contiguous logical blocks and contiguous physical blocks (as described in the comments). I also switched the if conditional following this check to an else if, to ensure that we never call dio_bio_submit twice for the same dio (in theory, this should not happen, anyway). I've tested this by running blktrace and verifying that a 64KB I/O was submitted as a single I/O. I also ran the patched kernel through xfstests' aio tests using xfs, ext4 (with 1k and 4k block sizes) and btrfs and verified that there were no regressions as compared to an unpatched kernel. Signed-off-by: Jeff Moyer Acked-by: Josef Bacik Cc: Christoph Hellwig Cc: Chris Mason Cc: [2.6.35.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 51f270b479b..48d74c7391d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -634,7 +634,7 @@ static int dio_send_cur_page(struct dio *dio) int ret = 0; if (dio->bio) { - loff_t cur_offset = dio->block_in_file << dio->blkbits; + loff_t cur_offset = dio->cur_page_fs_offset; loff_t bio_next_offset = dio->logical_offset_in_bio + dio->bio->bi_size; @@ -659,7 +659,7 @@ static int dio_send_cur_page(struct dio *dio) * Submit now if the underlying fs is about to perform a * metadata read */ - if (dio->boundary) + else if (dio->boundary) dio_bio_submit(dio); } -- cgit v1.2.3 From ed430fec756ad65f7cfba24f8ad17c3d5a403290 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 9 Sep 2010 16:37:36 -0700 Subject: proc: export uncached bit properly in /proc/kpageflags Fix the left-over old ifdef for PG_uncached in /proc/kpageflags. Now it's used by x86, too. Signed-off-by: Takashi Iwai Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index 180cf5a0bd6..3b8b4566033 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -146,7 +146,7 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); #endif -#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR +#ifdef CONFIG_ARCH_USES_PG_UNCACHED u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); #endif -- cgit v1.2.3 From ee3aebdd8f5f8eac41c25c80ceee3d728f920f3b Mon Sep 17 00:00:00 2001 From: Jan Sembera Date: Thu, 9 Sep 2010 16:37:54 -0700 Subject: binfmt_misc: fix binfmt_misc priority Commit 74641f584da ("alpha: binfmt_aout fix") (May 2009) introduced a regression - binfmt_misc is now consulted after binfmt_elf, which will unfortunately break ia32el. ia32 ELF binaries on ia64 used to be matched using binfmt_misc and executed using wrapper. As 32bit binaries are now matched by binfmt_elf before bindmt_misc kicks in, the wrapper is ignored. The fix increases precedence of binfmt_misc to the original state. Signed-off-by: Jan Sembera Cc: Ivan Kokshaysky Cc: Al Viro Cc: Richard Henderson [2.6.everything.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index a7528b91393..fd0cc0bf9a4 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -724,7 +724,7 @@ static int __init init_misc_binfmt(void) { int err = register_filesystem(&bm_fs_type); if (!err) { - err = register_binfmt(&misc_format); + err = insert_binfmt(&misc_format); if (err) unregister_filesystem(&bm_fs_type); } -- cgit v1.2.3 From 3ab04d5cf9736b7a4e9dfcf28285d8663b01aa0e Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Thu, 9 Sep 2010 16:38:12 -0700 Subject: vfs: take O_NONBLOCK out of the O_* uniqueness test O_NONBLOCK on parisc has a dual value: #define O_NONBLOCK 000200004 /* HPUX has separate NDELAY & NONBLOCK */ It is caught by the O_* bits uniqueness check and leads to a parisc compile error. The fix would be to take O_NONBLOCK out. Signed-off-by: Wu Fengguang Signed-off-by: James Bottomley Cc: Jamie Lokier Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fcntl.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index 6769fd0f35b..f8cc34f542c 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -769,11 +769,15 @@ EXPORT_SYMBOL(kill_fasync); static int __init fcntl_init(void) { - /* please add new bits here to ensure allocation uniqueness */ - BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( + /* + * Please add new bits here to ensure allocation uniqueness. + * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY + * is defined as O_NONBLOCK on some platforms and not on others. + */ + BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | - O_TRUNC | O_APPEND | O_NONBLOCK | + O_TRUNC | O_APPEND | /* O_NONBLOCK | */ __O_SYNC | O_DSYNC | FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | O_NOATIME | O_CLOEXEC | -- cgit v1.2.3 From eee743fd7eac9f2ea69ad06d093dfb5a12538fe5 Mon Sep 17 00:00:00 2001 From: "Jorge Boncompte [DTI2]" Date: Thu, 9 Sep 2010 16:38:19 -0700 Subject: minix: fix regression in minix_mkdir() Commit 9eed1fb721c ("minix: replace inode uid,gid,mode init with helper") broke directory creation on minix filesystems. Fix it by passing the needed mode flag to inode init helper. Signed-off-by: Jorge Boncompte [DTI2] Cc: Dmitry Monakhov Cc: Al Viro Cc: [2.6.35.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/minix/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/minix/namei.c b/fs/minix/namei.c index e20ee85955d..f3f3578393a 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -115,7 +115,7 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode) inode_inc_link_count(dir); - inode = minix_new_inode(dir, mode, &err); + inode = minix_new_inode(dir, S_IFDIR | mode, &err); if (!inode) goto out_dir; -- cgit v1.2.3 From a122eb2fdfd78b58c6dd992d6f4b1aaef667eef9 Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Mon, 6 Sep 2010 18:24:57 -0400 Subject: xfs: prevent reading uninitialized stack memory The XFS_IOC_FSGETXATTR ioctl allows unprivileged users to read 12 bytes of uninitialized stack memory, because the fsxattr struct declared on the stack in xfs_ioc_fsgetxattr() does not alter (or zero) the 12-byte fsx_pad member before copying it back to the user. This patch takes care of it. Signed-off-by: Dan Rosenberg Reviewed-by: Eric Sandeen Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 4fec427b83e..3b9e626f7cd 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -785,6 +785,8 @@ xfs_ioc_fsgetxattr( { struct fsxattr fa; + memset(&fa, 0, sizeof(struct fsxattr)); + xfs_ilock(ip, XFS_ILOCK_SHARED); fa.fsx_xflags = xfs_ip2xflags(ip); fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; -- cgit v1.2.3 From 1b528181b2ffa14721fb28ad1bd539fe1732c583 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 7 Sep 2010 19:35:49 -0700 Subject: setup_arg_pages: diagnose excessive argument size The CONFIG_STACK_GROWSDOWN variant of setup_arg_pages() does not check the size of the argument/environment area on the stack. When it is unworkably large, shift_arg_pages() hits its BUG_ON. This is exploitable with a very large RLIMIT_STACK limit, to create a crash pretty easily. Check that the initial stack is not too large to make it possible to map in any executable. We're not checking that the actual executable (or intepreter, for binfmt_elf) will fit. So those mappings might clobber part of the initial stack mapping. But that is just userland lossage that userland made happen, not a kernel problem. Signed-off-by: Roland McGrath Reviewed-by: KOSAKI Motohiro Signed-off-by: Linus Torvalds --- fs/exec.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 2d945528274..1b63237fc6d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -594,6 +594,11 @@ int setup_arg_pages(struct linux_binprm *bprm, #else stack_top = arch_align_stack(stack_top); stack_top = PAGE_ALIGN(stack_top); + + if (unlikely(stack_top < mmap_min_addr) || + unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) + return -ENOMEM; + stack_shift = vma->vm_end - stack_top; bprm->p -= stack_shift; -- cgit v1.2.3 From 7993bc1f4663c0db67bb8f0d98e6678145b387cd Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 7 Sep 2010 19:36:28 -0700 Subject: execve: improve interactivity with large arguments This adds a preemption point during the copying of the argument and environment strings for execve, in copy_strings(). There is already a preemption point in the count() loop, so this doesn't add any new points in the abstract sense. When the total argument+environment strings are very large, the time spent copying them can be much more than a normal user time slice. So this change improves the interactivity of the rest of the system when one process is doing an execve with very large arguments. Signed-off-by: Roland McGrath Reviewed-by: KOSAKI Motohiro Signed-off-by: Linus Torvalds --- fs/exec.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 1b63237fc6d..6f2d777431a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -419,6 +419,8 @@ static int copy_strings(int argc, const char __user *const __user *argv, while (len > 0) { int offset, bytes_to_copy; + cond_resched(); + offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; -- cgit v1.2.3 From 9aea5a65aa7a1af9a4236dfaeb0088f1624f9919 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 7 Sep 2010 19:37:06 -0700 Subject: execve: make responsive to SIGKILL with large arguments An execve with a very large total of argument/environment strings can take a really long time in the execve system call. It runs uninterruptibly to count and copy all the strings. This change makes it abort the exec quickly if sent a SIGKILL. Note that this is the conservative change, to interrupt only for SIGKILL, by using fatal_signal_pending(). It would be perfectly correct semantics to let any signal interrupt the string-copying in execve, i.e. use signal_pending() instead of fatal_signal_pending(). We'll save that change for later, since it could have user-visible consequences, such as having a timer set too quickly make it so that an execve can never complete, though it always happened to work before. Signed-off-by: Roland McGrath Reviewed-by: KOSAKI Motohiro Signed-off-by: Linus Torvalds --- fs/exec.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 6f2d777431a..828dd2461d6 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -376,6 +376,9 @@ static int count(const char __user * const __user * argv, int max) argv++; if (i++ >= max) return -E2BIG; + + if (fatal_signal_pending(current)) + return -ERESTARTNOHAND; cond_resched(); } } @@ -419,6 +422,10 @@ static int copy_strings(int argc, const char __user *const __user *argv, while (len > 0) { int offset, bytes_to_copy; + if (fatal_signal_pending(current)) { + ret = -ERESTARTNOHAND; + goto out; + } cond_resched(); offset = pos % PAGE_SIZE; -- cgit v1.2.3 From 51749e47e191db8e588ad5cebea731caf7b705d7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 8 Sep 2010 09:00:22 +0000 Subject: xfs: log IO completion workqueue is a high priority queue The workqueue implementation in 2.6.36-rcX has changed, resulting in the workqueues no longer having dedicated threads for work processing. This has caused severe livelocks under heavy parallel create workloads because the log IO completions have been getting held up behind metadata IO completions. Hence log commits would stall, memory allocation would stall because pages could not be cleaned, and lock contention on the AIL during inode IO completion processing was being seen to slow everything down even further. By making the log Io completion workqueue a high priority workqueue, they are queued ahead of all data/metadata IO completions and processed before the data/metadata completions. Hence the log never gets stalled, and operations needed to clean memory can continue as quickly as possible. This avoids the livelock conditions and allos the system to keep running under heavy load as per normal. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index d72cf2bb054..286e36e21da 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1932,7 +1932,8 @@ xfs_buf_init(void) if (!xfs_buf_zone) goto out; - xfslogd_workqueue = create_workqueue("xfslogd"); + xfslogd_workqueue = alloc_workqueue("xfslogd", + WQ_RESCUER | WQ_HIGHPRI, 1); if (!xfslogd_workqueue) goto out_free_buf_zone; -- cgit v1.2.3 From 5e64b0d9e86ffff8b299556341d85319117539e9 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 7 Sep 2010 13:30:05 +0800 Subject: ocfs2/lockdep: Move ip_xattr_sem out of ocfs2_xattr_get_nolock. As the name shows, we shouldn't have any lock in ocfs2_xattr_get_nolock. so lift ip_xattr_sem to the caller. This should be safe for us since the only 2 callers are: 1. ocfs2_xattr_get which will lock the resources. 2. ocfs2_mknod which don't need this locking. And this also resolves the following lockdep warning. ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.35+ #5 ------------------------------------------------------- reflink/30027 is trying to acquire lock: (&oi->ip_alloc_sem){+.+.+.}, at: [] ocfs2_reflink_ioctl+0x69a/0x1226 [ocfs2] but task is already holding lock: (&oi->ip_xattr_sem){++++..}, at: [] ocfs2_reflink_ioctl+0x68b/0x1226 [ocfs2] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&oi->ip_xattr_sem){++++..}: [] __lock_acquire+0x79a/0x7f1 [] lock_acquire+0xc6/0xed [] down_read+0x34/0x47 [] ocfs2_xattr_get_nolock+0xa0/0x4e6 [ocfs2] [] ocfs2_get_acl_nolock+0x5c/0x132 [ocfs2] [] ocfs2_init_acl+0x60/0x243 [ocfs2] [] ocfs2_mknod+0xae8/0xfea [ocfs2] [] ocfs2_create+0x9d/0x105 [ocfs2] [] vfs_create+0x9b/0xf4 [] do_last+0x2fd/0x5be [] do_filp_open+0x1fb/0x572 [] do_sys_open+0x5a/0xe7 [] sys_open+0x1b/0x1d [] system_call_fastpath+0x16/0x1b -> #2 (jbd2_handle){+.+...}: [] __lock_acquire+0x79a/0x7f1 [] lock_acquire+0xc6/0xed [] start_this_handle+0x4a3/0x4bc [jbd2] [] jbd2__journal_start+0xba/0xee [jbd2] [] jbd2_journal_start+0xe/0x10 [jbd2] [] ocfs2_start_trans+0xb7/0x19b [ocfs2] [] ocfs2_mknod+0x73e/0xfea [ocfs2] [] ocfs2_create+0x9d/0x105 [ocfs2] [] vfs_create+0x9b/0xf4 [] do_last+0x2fd/0x5be [] do_filp_open+0x1fb/0x572 [] do_sys_open+0x5a/0xe7 [] sys_open+0x1b/0x1d [] system_call_fastpath+0x16/0x1b -> #1 (&journal->j_trans_barrier){.+.+..}: [] __lock_acquire+0x79a/0x7f1 [] lock_release_non_nested+0x1e5/0x24b [] lock_release+0x158/0x17a [] __mutex_unlock_slowpath+0xbf/0x11b [] mutex_unlock+0x9/0xb [] ocfs2_free_ac_resource+0x31/0x67 [ocfs2] [] ocfs2_free_alloc_context+0x11/0x1d [ocfs2] [] ocfs2_write_begin_nolock+0x141e/0x159b [ocfs2] [] ocfs2_write_begin+0x11e/0x1e7 [ocfs2] [] generic_file_buffered_write+0x10c/0x210 [] ocfs2_file_aio_write+0x4cc/0x6d3 [ocfs2] [] do_sync_write+0xc2/0x106 [] vfs_write+0xae/0x131 [] sys_write+0x47/0x6f [] system_call_fastpath+0x16/0x1b -> #0 (&oi->ip_alloc_sem){+.+.+.}: [] validate_chain+0x727/0xd68 [] __lock_acquire+0x79a/0x7f1 [] lock_acquire+0xc6/0xed [] down_write+0x31/0x52 [] ocfs2_reflink_ioctl+0x69a/0x1226 [ocfs2] [] ocfs2_ioctl+0x61a/0x656 [ocfs2] [] vfs_ioctl+0x2a/0x9d [] do_vfs_ioctl+0x45d/0x4ae [] sys_ioctl+0x57/0x7a [] system_call_fastpath+0x16/0x1b Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/xattr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index d03469f6180..06fa5e77c40 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1286,13 +1286,11 @@ int ocfs2_xattr_get_nolock(struct inode *inode, xis.inode_bh = xbs.inode_bh = di_bh; di = (struct ocfs2_dinode *)di_bh->b_data; - down_read(&oi->ip_xattr_sem); ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer, buffer_size, &xis); if (ret == -ENODATA && di->i_xattr_loc) ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, buffer_size, &xbs); - up_read(&oi->ip_xattr_sem); return ret; } @@ -1316,8 +1314,10 @@ static int ocfs2_xattr_get(struct inode *inode, mlog_errno(ret); return ret; } + down_read(&OCFS2_I(inode)->ip_xattr_sem); ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, name, buffer, buffer_size); + up_read(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock(inode, 0); -- cgit v1.2.3 From 07eaac9438b13ec0b863111698b91ccec8f3b8d4 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 7 Sep 2010 13:30:06 +0800 Subject: ocfs2: Fix lockdep warning in reflink. This patch change mutex_lock to a new subclass and add a new inode lock subclass for the target inode which caused this lockdep warning. ============================================= [ INFO: possible recursive locking detected ] 2.6.35+ #5 --------------------------------------------- reflink/11086 is trying to acquire lock: (Meta){+++++.}, at: [] ocfs2_reflink_ioctl+0x898/0x1229 [ocfs2] but task is already holding lock: (Meta){+++++.}, at: [] ocfs2_reflink_ioctl+0x5d3/0x1229 [ocfs2] other info that might help us debug this: 6 locks held by reflink/11086: #0: (&sb->s_type->i_mutex_key#15/1){+.+.+.}, at: [] lookup_create+0x26/0x97 #1: (&sb->s_type->i_mutex_key#15){+.+.+.}, at: [] ocfs2_reflink_ioctl+0x4d3/0x1229 [ocfs2] #2: (Meta){+++++.}, at: [] ocfs2_reflink_ioctl+0x5d3/0x1229 [ocfs2] #3: (&oi->ip_xattr_sem){+.+.+.}, at: [] ocfs2_reflink_ioctl+0x68b/0x1229 [ocfs2] #4: (&oi->ip_alloc_sem){+.+.+.}, at: [] ocfs2_reflink_ioctl+0x69a/0x1229 [ocfs2] #5: (&sb->s_type->i_mutex_key#15/2){+.+...}, at: [] ocfs2_reflink_ioctl+0x882/0x1229 [ocfs2] stack backtrace: Pid: 11086, comm: reflink Not tainted 2.6.35+ #5 Call Trace: [] validate_chain+0x56e/0xd68 [] ? mark_held_locks+0x49/0x69 [] __lock_acquire+0x79a/0x7f1 [] lock_acquire+0xc6/0xed [] ? ocfs2_reflink_ioctl+0x898/0x1229 [ocfs2] [] __ocfs2_cluster_lock+0x975/0xa0d [ocfs2] [] ? ocfs2_reflink_ioctl+0x898/0x1229 [ocfs2] [] ? ocfs2_wait_for_recovery+0x15/0x8a [ocfs2] [] ocfs2_inode_lock_full_nested+0x1ac/0xdc5 [ocfs2] [] ? ocfs2_reflink_ioctl+0x898/0x1229 [ocfs2] [] ? trace_hardirqs_on_caller+0x10b/0x12f [] ? debug_mutex_free_waiter+0x4f/0x53 [] ocfs2_reflink_ioctl+0x898/0x1229 [ocfs2] [] ? ocfs2_file_lock_res_init+0x66/0x78 [ocfs2] [] ? might_fault+0x40/0x8d [] ocfs2_ioctl+0x61a/0x656 [ocfs2] [] ? mntput_no_expire+0x1d/0xb0 [] ? path_put+0x2c/0x31 [] vfs_ioctl+0x2a/0x9d [] do_vfs_ioctl+0x45d/0x4ae [] ? _raw_spin_unlock+0x26/0x2a [] ? sysret_check+0x27/0x62 [] sys_ioctl+0x57/0x7a [] system_call_fastpath+0x16/0x1b Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/dlmglue.h | 1 + fs/ocfs2/refcounttree.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index d1ce48e1b3d..1d596d8c4a4 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -84,6 +84,7 @@ enum { OI_LS_PARENT, OI_LS_RENAME1, OI_LS_RENAME2, + OI_LS_REFLINK_TARGET, }; int ocfs2_dlm_init(struct ocfs2_super *osb); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 0afeda83120..efdd7560740 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4201,8 +4201,9 @@ static int __ocfs2_reflink(struct dentry *old_dentry, goto out; } - mutex_lock(&new_inode->i_mutex); - ret = ocfs2_inode_lock(new_inode, &new_bh, 1); + mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD); + ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1, + OI_LS_REFLINK_TARGET); if (ret) { mlog_errno(ret); goto out_unlock; -- cgit v1.2.3 From 0f4da216b8c3c35c90ecd18e1899c6f125957c2b Mon Sep 17 00:00:00 2001 From: Tristan Ye Date: Wed, 8 Sep 2010 17:12:38 +0800 Subject: Ocfs2: Re-access the journal after ocfs2_insert_extent() in dxdir codes. In ocfs2_dx_dir_rebalance(), we need to rejournal_acess the blocks after calling ocfs2_insert_extent() since growing an extent tree may trigger ocfs2_extend_trans(), which makes previous journal_access meaningless. Signed-off-by: Tristan Ye Signed-off-by: Joel Becker --- fs/ocfs2/dir.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index f04ebcfffc4..c49f6de0e7a 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3931,6 +3931,15 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, goto out_commit; } + cpos = split_hash; + ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle, + data_ac, meta_ac, new_dx_leaves, + num_dx_leaves); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + for (i = 0; i < num_dx_leaves; i++) { ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), orig_dx_leaves[i], @@ -3939,15 +3948,14 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, mlog_errno(ret); goto out_commit; } - } - cpos = split_hash; - ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle, - data_ac, meta_ac, new_dx_leaves, - num_dx_leaves); - if (ret) { - mlog_errno(ret); - goto out_commit; + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), + new_dx_leaves[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } } ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf, -- cgit v1.2.3 From 228ac6357718df2d5c8d70210fa51b2225aab5ee Mon Sep 17 00:00:00 2001 From: Tristan Ye Date: Fri, 10 Sep 2010 10:16:33 +0800 Subject: Ocfs2: Handle empty list in lockres_seq_start() for dlmdebug.c This patch tries to handle the case in which list 'dlm->tracking_list' is empty, to avoid accessing an invalid pointer. It fixes the following oops: http://oss.oracle.com/bugzilla/show_bug.cgi?id=1287 Signed-off-by: Tristan Ye Signed-off-by: Joel Becker --- fs/ocfs2/dlm/dlmdebug.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 5efdd37dfe4..901ca52bf86 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -636,8 +636,14 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos) spin_lock(&dlm->track_lock); if (oldres) track_list = &oldres->tracking; - else + else { track_list = &dlm->tracking_list; + if (list_empty(track_list)) { + dl = NULL; + spin_unlock(&dlm->track_lock); + goto bail; + } + } list_for_each_entry(res, track_list, tracking) { if (&res->tracking == &dlm->tracking_list) @@ -660,6 +666,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos) } else dl = NULL; +bail: /* passed to seq_show */ return dl; } -- cgit v1.2.3 From ca04d9c3ec721e474f00992efc1b1afb625507f5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 26 Aug 2010 16:12:01 -0700 Subject: ceph: fix null pointer deref on anon root dentry release When we release a root dentry, particularly after a splice, the parent (actually our) inode was evaluating to NULL and was getting dereferenced by ceph_snap(). This is reproduced by something as simple as mount -t ceph monhost:/a/b mnt mount -t ceph monhost:/a mnt2 ls mnt2 A splice_dentry() would kill the old 'b' inode's root dentry, and we'd crash while releasing it. Fix by checking for both the ROOT and NULL cases explicitly. We only need to invalidate the parent dir when we have a correct parent to invalidate. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 6e4f43ff23e..a1986eb5204 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1021,11 +1021,15 @@ out_touch: static void ceph_dentry_release(struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); - struct inode *parent_inode = dentry->d_parent->d_inode; - u64 snapid = ceph_snap(parent_inode); + struct inode *parent_inode = NULL; + u64 snapid = CEPH_NOSNAP; + if (!IS_ROOT(dentry)) { + parent_inode = dentry->d_parent->d_inode; + if (parent_inode) + snapid = ceph_snap(parent_inode); + } dout("dentry_release %p parent %p\n", dentry, parent_inode); - if (parent_inode && snapid != CEPH_SNAPDIR) { struct ceph_inode_info *ci = ceph_inode(parent_inode); -- cgit v1.2.3 From 3d4401d9d0aef5c40706350685ddea3df6708496 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 3 Sep 2010 12:57:11 -0700 Subject: ceph: fix pagelist kunmap tail A wrong parameter was passed to the kunmap. Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/pagelist.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c index b6859f47d36..46a368b6dce 100644 --- a/fs/ceph/pagelist.c +++ b/fs/ceph/pagelist.c @@ -5,10 +5,18 @@ #include "pagelist.h" +static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) +{ + struct page *page = list_entry(pl->head.prev, struct page, + lru); + kunmap(page); +} + int ceph_pagelist_release(struct ceph_pagelist *pl) { if (pl->mapped_tail) - kunmap(pl->mapped_tail); + ceph_pagelist_unmap_tail(pl); + while (!list_empty(&pl->head)) { struct page *page = list_first_entry(&pl->head, struct page, lru); @@ -26,7 +34,7 @@ static int ceph_pagelist_addpage(struct ceph_pagelist *pl) pl->room += PAGE_SIZE; list_add_tail(&page->lru, &pl->head); if (pl->mapped_tail) - kunmap(pl->mapped_tail); + ceph_pagelist_unmap_tail(pl); pl->mapped_tail = kmap(page); return 0; } -- cgit v1.2.3 From 3612abbd5df6baa9ca3e0777c6c8646e202d3f66 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 7 Sep 2010 15:59:27 -0700 Subject: ceph: fix reconnect encoding for old servers Fix the reconnect encoding to encode the cap record when the MDS does not have the FLOCK capability (i.e., pre v0.22). Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f091b135178..fad95f8f260 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2374,6 +2374,8 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, num_fcntl_locks, num_flock_locks); unlock_kernel(); + } else { + err = ceph_pagelist_append(pagelist, &rec, reclen); } out_free: -- cgit v1.2.3 From a77d9f7dce7600058d56f0670ed29d77abffcde2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 11 Sep 2010 10:55:25 -0700 Subject: ceph: fix file offset wrapping at 4GB on 32-bit archs Cast the value before shifting so that we don't run out of bits with a 32-bit unsigned long. This fixes wrapping of high file offsets into the low 4GB of a file on disk, and the subsequent data corruption for large files. Signed-off-by: Sage Weil --- fs/ceph/addr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 4cfce1ee31f..50461b8c23a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -766,7 +766,8 @@ get_more_pages: /* ok */ if (locked_pages == 0) { /* prepare async write request */ - offset = page->index << PAGE_CACHE_SHIFT; + offset = (unsigned long long)page->index + << PAGE_CACHE_SHIFT; len = wsize; req = ceph_osdc_new_request(&client->osdc, &ci->i_layout, -- cgit v1.2.3 From b1bde04c6d9a120dec602cc8a70b8a7f21600883 Mon Sep 17 00:00:00 2001 From: Fabio Olive Leite Date: Sun, 12 Sep 2010 19:55:25 -0400 Subject: Remove incorrect do_vfs_lock message The do_vfs_lock function on fs/nfs/file.c is only called if NLM is not being used, via the -onolock mount option. Therefore it cannot really be "out of sync with lock manager" when the local locking function called returns an error, as there will be no corresponding call to the NLM. For details, simply check the if/else on do_setlk and do_unlk on fs/nfs/file.c. Signed-Off-By: Fabio Olive Leite Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index eb51bd6201d..05bf3c0dc75 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -723,10 +723,6 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl) default: BUG(); } - if (res < 0) - dprintk(KERN_WARNING "%s: VFS is out of sync with lock manager" - " - error %d!\n", - __func__, res); return res; } -- cgit v1.2.3 From b20d37ca9561711c6a3c4b859c2855f49565e061 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 12 Sep 2010 19:55:26 -0400 Subject: NFS: Fix a typo in nfs_sockaddr_match_ipaddr6 Reported-by: Ben Greear Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- fs/nfs/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 4e7df2adb21..e7340729af8 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -275,7 +275,7 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, sin1->sin6_scope_id != sin2->sin6_scope_id) return 0; - return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr); + return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); } #else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, -- cgit v1.2.3 From fbf3fdd2443965d9ba6fb4b5fecd1f6e0847218f Mon Sep 17 00:00:00 2001 From: Menyhart Zoltan Date: Sun, 12 Sep 2010 19:55:26 -0400 Subject: statfs() gives ESTALE error Hi, An NFS client executes a statfs("file", &buff) call. "file" exists / existed, the client has read / written it, but it has already closed it. user_path(pathname, &path) looks up "file" successfully in the directory-cache and restarts the aging timer of the directory-entry. Even if "file" has already been removed from the server, because the lookupcache=positive option I use, keeps the entries valid for a while. nfs_statfs() returns ESTALE if "file" has already been removed from the server. If the user application repeats the statfs("file", &buff) call, we are stuck: "file" remains young forever in the directory-cache. Signed-off-by: Zoltan Menyhart Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- fs/nfs/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ec3966e4706..f4cbf0c306c 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -431,7 +431,15 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) goto out_err; error = server->nfs_client->rpc_ops->statfs(server, fh, &res); + if (unlikely(error == -ESTALE)) { + struct dentry *pd_dentry; + pd_dentry = dget_parent(dentry); + if (pd_dentry != NULL) { + nfs_zap_caches(pd_dentry->d_inode); + dput(pd_dentry); + } + } nfs_free_fattr(res.fattr); if (error < 0) goto out_err; -- cgit v1.2.3 From 827e3457022d0bb0b1bb8a0eb88501876fe7dcf0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 12 Sep 2010 19:57:50 -0400 Subject: SUNRPC: Fix the NFSv4 and RPCSEC_GSS Kconfig dependencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The NFSv4 client's callback server calls svc_gss_principal(), which is defined in the auth_rpcgss.ko The NFSv4 server has the same dependency, and in addition calls svcauth_gss_flavor(), gss_mech_get_by_pseudoflavor(), gss_pseudoflavor_to_service() and gss_mech_put() from the same module. The module auth_rpcgss itself has no dependencies aside from sunrpc, so we only need to select RPCSEC_GSS. Reported-by: Uwe Kleine-König Signed-off-by: Trond Myklebust --- fs/nfs/Kconfig | 1 + fs/nfsd/Kconfig | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 6c2aad49d73..f7e13db613c 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -63,6 +63,7 @@ config NFS_V3_ACL config NFS_V4 bool "NFS client support for NFS version 4" depends on NFS_FS + select SUNRPC_GSS help This option enables support for version 4 of the NFS protocol (RFC 3530) in the kernel's NFS client. diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 95932f523ae..4264377552e 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -69,6 +69,7 @@ config NFSD_V4 depends on NFSD && PROC_FS && EXPERIMENTAL select NFSD_V3 select FS_POSIX_ACL + select SUNRPC_GSS help This option enables support in your system's NFS server for version 4 of the NFS protocol (RFC 3530). -- cgit v1.2.3 From 62b2be591a9b12c550308ef7718a31abfc815b50 Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Tue, 24 Aug 2010 18:13:59 +0000 Subject: fs/9p, net/9p: memory leak fixes Four memory leak fixes in the 9P code. Signed-off-by: Latchesar Ionkov Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 2 ++ net/9p/client.c | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index c7c23eab944..84159cf9c52 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1128,6 +1128,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb); generic_fillattr(dentry->d_inode, stat); + p9stat_free(st); kfree(st); return 0; } @@ -1489,6 +1490,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) retval = strnlen(buffer, buflen); done: + p9stat_free(st); kfree(st); return retval; } diff --git a/net/9p/client.c b/net/9p/client.c index dc6f2f26d02..9eb72505308 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -331,8 +331,10 @@ static void p9_tag_cleanup(struct p9_client *c) } } - if (c->tagpool) + if (c->tagpool) { + p9_idpool_put(0, c->tagpool); /* free reserved tag 0 */ p9_idpool_destroy(c->tagpool); + } /* free requests associated with tags */ for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) { @@ -944,6 +946,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames, int16_t nwqids, count; err = 0; + wqids = NULL; clnt = oldfid->clnt; if (clone) { fid = p9_fid_create(clnt); @@ -994,9 +997,11 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames, else fid->qid = oldfid->qid; + kfree(wqids); return fid; clunk_fid: + kfree(wqids); p9_client_clunk(fid); fid = NULL; -- cgit v1.2.3 From 5c25f347a7b00b2ebe0a55c4a3cfe4c3e1e8725e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 24 Aug 2010 10:30:49 +0000 Subject: fs/9p: Fix error handling in v9fs_get_sb This was introduced by 7cadb63d58a932041afa3f957d5cbb6ce69dcee5 Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_super.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index f9311077de6..1d12ba0ed3d 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -122,6 +122,10 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, fid = v9fs_session_init(v9ses, dev_name, data); if (IS_ERR(fid)) { retval = PTR_ERR(fid); + /* + * we need to call session_close to tear down some + * of the data structure setup by session_init + */ goto close_session; } @@ -144,7 +148,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, retval = -ENOMEM; goto release_sb; } - sb->s_root = root; if (v9fs_proto_dotl(v9ses)) { @@ -152,7 +155,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); if (IS_ERR(st)) { retval = PTR_ERR(st); - goto clunk_fid; + goto release_sb; } v9fs_stat2inode_dotl(st, root->d_inode); @@ -162,7 +165,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, st = p9_client_stat(fid); if (IS_ERR(st)) { retval = PTR_ERR(st); - goto clunk_fid; + goto release_sb; } root->d_inode->i_ino = v9fs_qid2ino(&st->qid); @@ -174,19 +177,24 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, v9fs_fid_add(root, fid); -P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); + P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); simple_set_mnt(mnt, sb); return 0; clunk_fid: p9_client_clunk(fid); - close_session: v9fs_session_close(v9ses); kfree(v9ses); return retval; - release_sb: + /* + * we will do the session_close and root dentry release + * in the below call. But we need to clunk fid, because we haven't + * attached the fid to dentry so it won't get clunked + * automatically. + */ + p9_client_clunk(fid); deactivate_locked_super(sb); return retval; } -- cgit v1.2.3 From 62726a7ab3a6a3624256172af055ff0a38c6ffa2 Mon Sep 17 00:00:00 2001 From: jvrao Date: Wed, 25 Aug 2010 16:26:21 +0000 Subject: 9p: Check for NULL fid in v9fs_dir_release() NULL fid should be handled in cases where we endup calling v9fs_dir_release() before even we instantiate the fid in filp. Signed-off-by: Venkateswararao Jujjuri Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_dir.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 16c8a2a98c1..899f168fd19 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -292,9 +292,11 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) fid = filp->private_data; P9_DPRINTK(P9_DEBUG_VFS, - "inode: %p filp: %p fid: %d\n", inode, filp, fid->fid); + "v9fs_dir_release: inode: %p filp: %p fid: %d\n", + inode, filp, fid ? fid->fid : -1); filemap_write_and_wait(inode->i_mapping); - p9_client_clunk(fid); + if (fid) + p9_client_clunk(fid); return 0; } -- cgit v1.2.3 From 3c30750ffafbc32af040b09f777b67aa2486b063 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 30 Aug 2010 16:04:35 +0000 Subject: fs/9p: Use the correct dentry operations We should use the cached dentry operation only if caching mode is enabled Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 84159cf9c52..a6990bbf605 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -730,7 +730,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode, P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } - dentry->d_op = &v9fs_cached_dentry_operations; + if (v9ses->cache) + dentry->d_op = &v9fs_cached_dentry_operations; + else + dentry->d_op = &v9fs_dentry_operations; d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) -- cgit v1.2.3 From 1d76e3135733a06aa12bb35891c05f306b27b2d6 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 30 Aug 2010 17:43:07 +0000 Subject: fs/9p: Don't use dotl version of mknod for dotu inode operations We should not use dotlversion for the dotu inode operations Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index a6990bbf605..9e670d52764 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1947,7 +1947,7 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = { .unlink = v9fs_vfs_unlink, .mkdir = v9fs_vfs_mkdir, .rmdir = v9fs_vfs_rmdir, - .mknod = v9fs_vfs_mknod_dotl, + .mknod = v9fs_vfs_mknod, .rename = v9fs_vfs_rename, .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, -- cgit v1.2.3 From 467c525109d5d542d7d416b0c11bdd54610fe2f4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 13 Sep 2010 11:39:20 -0700 Subject: ceph: fix dn offset during readdir_prepopulate When adding the readdir results to the cache, ceph_set_dentry_offset was clobbered our just-set offset. This can cause the readdir result offsets to get out of sync with the server. Add an argument to the helper so that it does not. This bug was introduced by 1cd3935bedccf592d44343890251452a6dd74fc4. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e7cca414da0..62377ec37ed 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -845,7 +845,7 @@ static void ceph_set_dentry_offset(struct dentry *dn) * the caller) if we fail. */ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, - bool *prehash) + bool *prehash, bool set_offset) { struct dentry *realdn; @@ -877,7 +877,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, } if ((!prehash || *prehash) && d_unhashed(dn)) d_rehash(dn); - ceph_set_dentry_offset(dn); + if (set_offset) + ceph_set_dentry_offset(dn); out: return dn; } @@ -1062,7 +1063,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, d_delete(dn); goto done; } - dn = splice_dentry(dn, in, &have_lease); + dn = splice_dentry(dn, in, &have_lease, true); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1105,7 +1106,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, goto done; } dout(" linking snapped dir %p to dn %p\n", in, dn); - dn = splice_dentry(dn, in, NULL); + dn = splice_dentry(dn, in, NULL, true); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1237,7 +1238,7 @@ retry_lookup: err = PTR_ERR(in); goto out; } - dn = splice_dentry(dn, in, NULL); + dn = splice_dentry(dn, in, NULL, false); if (IS_ERR(dn)) dn = NULL; } -- cgit v1.2.3 From 8bef9239ee1a42eb37d3f83bacf6a75f019c028d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 14 Sep 2010 15:45:44 -0700 Subject: ceph: correctly set 'follows' in flushsnap messages The 'follows' should match the seq for the snap context for the given snap cap, which is the context under which we have been dirtying and writing data and metadata. The snapshot that _contains_ those updates thus _follows_ that context's seq #. Signed-off-by: Sage Weil --- fs/ceph/snap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 4868b9dcac5..9e836afba34 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -467,7 +467,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) INIT_LIST_HEAD(&capsnap->ci_item); INIT_LIST_HEAD(&capsnap->flushing_item); - capsnap->follows = snapc->seq - 1; + capsnap->follows = snapc->seq; capsnap->issued = __ceph_caps_issued(ci, NULL); capsnap->dirty = dirty; -- cgit v1.2.3 From cfc0bf6640dfd0f43bf8bfec5a475284809baa4d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 14 Sep 2010 15:50:59 -0700 Subject: ceph: stop sending FLUSHSNAPs when we hit a dirty capsnap Stop sending FLUSHSNAP messages when we hit a capsnap that has dirty_pages or is still writing. We'll send the newer capsnaps only after the older ones complete. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index a2069b6680a..9fbe9019155 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1227,7 +1227,7 @@ retry: * pages to be written out. */ if (capsnap->dirty_pages || capsnap->writing) - continue; + break; /* * if cap writeback already occurred, we should have dropped @@ -1276,8 +1276,8 @@ retry: &session->s_cap_snaps_flushing); spin_unlock(&inode->i_lock); - dout("flush_snaps %p cap_snap %p follows %lld size %llu\n", - inode, capsnap, next_follows, capsnap->size); + dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", + inode, capsnap, capsnap->follows, capsnap->flush_tid); send_cap_msg(session, ceph_vino(inode).ino, 0, CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, -- cgit v1.2.3 From 460cf3411b858ad509d5255e0dfaf862a83c0299 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 14 Sep 2010 11:38:24 -0400 Subject: cifs: fix potential double put of TCP session reference cifs_get_smb_ses must be called on a server pointer on which it holds an active reference. It first does a search for an existing SMB session. If it finds one, it'll put the server reference and then try to ensure that the negprot is done, etc. If it encounters an error at that point then it'll return an error. There's a potential problem here though. When cifs_get_smb_ses returns an error, the caller will also put the TCP server reference leading to a double-put. Fix this by having cifs_get_smb_ses only put the server reference if it found an existing session that it could use and isn't returning an error. Cc: stable@kernel.org Reviewed-by: Suresh Jayaraman Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 67dad54fbfa..88c84a38bcc 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1706,9 +1706,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) if (ses) { cFYI(1, "Existing smb sess found (status=%d)", ses->status); - /* existing SMB ses has a server reference already */ - cifs_put_tcp_session(server); - mutex_lock(&ses->session_mutex); rc = cifs_negotiate_protocol(xid, ses); if (rc) { @@ -1731,6 +1728,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) } } mutex_unlock(&ses->session_mutex); + + /* existing SMB ses has a server reference already */ + cifs_put_tcp_session(server); FreeXid(xid); return ses; } -- cgit v1.2.3 From 75e1c70fc31490ef8a373ea2a4bea2524099b478 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 10 Sep 2010 14:16:00 -0700 Subject: aio: check for multiplication overflow in do_io_submit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tavis Ormandy pointed out that do_io_submit does not do proper bounds checking on the passed-in iocb array:        if (unlikely(nr < 0))                return -EINVAL;        if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(iocbpp)))))                return -EFAULT;                      ^^^^^^^^^^^^^^^^^^ The attached patch checks for overflow, and if it is detected, the number of iocbs submitted is scaled down to a number that will fit in the long.  This is an ok thing to do, as sys_io_submit is documented as returning the number of iocbs submitted, so callers should handle a return value of less than the 'nr' argument passed in. Reported-by: Tavis Ormandy Signed-off-by: Jeff Moyer Signed-off-by: Linus Torvalds --- fs/aio.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 3006b5bc33d..1320b2a05fb 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1659,6 +1659,9 @@ long do_io_submit(aio_context_t ctx_id, long nr, if (unlikely(nr < 0)) return -EINVAL; + if (unlikely(nr > LONG_MAX/sizeof(*iocbpp))) + nr = LONG_MAX/sizeof(*iocbpp); + if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp))))) return -EFAULT; -- cgit v1.2.3 From ae00d4f37f4df56821331deb1028748110dd6dc9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 16 Sep 2010 16:26:51 -0700 Subject: ceph: fix cap_snap and realm split The cap_snap creation/queueing relies on both the current i_head_snapc _and_ the i_snap_realm pointers being correct, so that the new cap_snap can properly reference the old context and the new i_head_snapc can be updated to reference the new snaprealm's context. To fix this, we: - move inodes completely to the new (split) realm so that i_snap_realm is correct, and - generate the new snapc's _before_ queueing the cap_snaps in ceph_update_snap_trace(). Signed-off-by: Sage Weil --- fs/ceph/addr.c | 4 +-- fs/ceph/snap.c | 88 +++++++++++++++++++-------------------------------------- fs/ceph/super.h | 2 ++ 3 files changed, 33 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 50461b8c23a..efbc604001c 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -411,8 +411,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (i_size < page_off + len) len = i_size - page_off; - dout("writepage %p page %p index %lu on %llu~%u\n", - inode, page, page->index, page_off, len); + dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", + inode, page, page->index, page_off, len, snapc); writeback_stat = atomic_long_inc_return(&client->writeback_count); if (writeback_stat > diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 9e836afba34..9e6eef14b7d 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -119,6 +119,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm( INIT_LIST_HEAD(&realm->children); INIT_LIST_HEAD(&realm->child_item); INIT_LIST_HEAD(&realm->empty_item); + INIT_LIST_HEAD(&realm->dirty_item); INIT_LIST_HEAD(&realm->inodes_with_caps); spin_lock_init(&realm->inodes_with_caps_lock); __insert_snap_realm(&mdsc->snap_realms, realm); @@ -604,6 +605,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm; int invalidate = 0; int err = -ENOMEM; + LIST_HEAD(dirty_realms); dout("update_snap_trace deletion=%d\n", deletion); more: @@ -626,24 +628,6 @@ more: } } - if (le64_to_cpu(ri->seq) > realm->seq) { - dout("update_snap_trace updating %llx %p %lld -> %lld\n", - realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); - /* - * if the realm seq has changed, queue a cap_snap for every - * inode with open caps. we do this _before_ we update - * the realm info so that we prepare for writeback under the - * _previous_ snap context. - * - * ...unless it's a snap deletion! - */ - if (!deletion) - queue_realm_cap_snaps(realm); - } else { - dout("update_snap_trace %llx %p seq %lld unchanged\n", - realm->ino, realm, realm->seq); - } - /* ensure the parent is correct */ err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); if (err < 0) @@ -651,6 +635,8 @@ more: invalidate += err; if (le64_to_cpu(ri->seq) > realm->seq) { + dout("update_snap_trace updating %llx %p %lld -> %lld\n", + realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); /* update realm parameters, snap lists */ realm->seq = le64_to_cpu(ri->seq); realm->created = le64_to_cpu(ri->created); @@ -668,9 +654,17 @@ more: if (err < 0) goto fail; + /* queue realm for cap_snap creation */ + list_add(&realm->dirty_item, &dirty_realms); + invalidate = 1; } else if (!realm->cached_context) { + dout("update_snap_trace %llx %p seq %lld new\n", + realm->ino, realm, realm->seq); invalidate = 1; + } else { + dout("update_snap_trace %llx %p seq %lld unchanged\n", + realm->ino, realm, realm->seq); } dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, @@ -683,6 +677,14 @@ more: if (invalidate) rebuild_snap_realms(realm); + /* + * queue cap snaps _after_ we've built the new snap contexts, + * so that i_head_snapc can be set appropriately. + */ + list_for_each_entry(realm, &dirty_realms, dirty_item) { + queue_realm_cap_snaps(realm); + } + __cleanup_empty_realms(mdsc); return 0; @@ -816,6 +818,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, }; struct inode *inode = ceph_find_inode(sb, vino); struct ceph_inode_info *ci; + struct ceph_snap_realm *oldrealm; if (!inode) continue; @@ -841,18 +844,19 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, dout(" will move %p to split realm %llx %p\n", inode, realm->ino, realm); /* - * Remove the inode from the realm's inode - * list, but don't add it to the new realm - * yet. We don't want the cap_snap to be - * queued (again) by ceph_update_snap_trace() - * below. Queue it _now_, under the old context. + * Move the inode to the new realm */ spin_lock(&realm->inodes_with_caps_lock); list_del_init(&ci->i_snap_realm_item); + list_add(&ci->i_snap_realm_item, + &realm->inodes_with_caps); + oldrealm = ci->i_snap_realm; + ci->i_snap_realm = realm; spin_unlock(&realm->inodes_with_caps_lock); spin_unlock(&inode->i_lock); - ceph_queue_cap_snap(ci); + ceph_get_snap_realm(mdsc, realm); + ceph_put_snap_realm(mdsc, oldrealm); iput(inode); continue; @@ -880,43 +884,9 @@ skip_inode: ceph_update_snap_trace(mdsc, p, e, op == CEPH_SNAP_OP_DESTROY); - if (op == CEPH_SNAP_OP_SPLIT) { - /* - * ok, _now_ add the inodes into the new realm. - */ - for (i = 0; i < num_split_inos; i++) { - struct ceph_vino vino = { - .ino = le64_to_cpu(split_inos[i]), - .snap = CEPH_NOSNAP, - }; - struct inode *inode = ceph_find_inode(sb, vino); - struct ceph_inode_info *ci; - - if (!inode) - continue; - ci = ceph_inode(inode); - spin_lock(&inode->i_lock); - if (list_empty(&ci->i_snap_realm_item)) { - struct ceph_snap_realm *oldrealm = - ci->i_snap_realm; - - dout(" moving %p to split realm %llx %p\n", - inode, realm->ino, realm); - spin_lock(&realm->inodes_with_caps_lock); - list_add(&ci->i_snap_realm_item, - &realm->inodes_with_caps); - ci->i_snap_realm = realm; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_get_snap_realm(mdsc, realm); - ceph_put_snap_realm(mdsc, oldrealm); - } - spin_unlock(&inode->i_lock); - iput(inode); - } - + if (op == CEPH_SNAP_OP_SPLIT) /* we took a reference when we created the realm, above */ ceph_put_snap_realm(mdsc, realm); - } __cleanup_empty_realms(mdsc); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c33897ae572..c80bfbe27b0 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -690,6 +690,8 @@ struct ceph_snap_realm { struct list_head empty_item; /* if i have ref==0 */ + struct list_head dirty_item; /* if realm needs new context */ + /* the current set of snaps for this realm */ struct ceph_snap_context *cached_context; -- cgit v1.2.3 From 5f4874903df3562b9d5649fc1cf7b8c6bb238e42 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 9 Sep 2010 14:45:00 +0100 Subject: GFS2: gfs2_logd should be using interruptible waits Looks like this crept in, in a recent update. Reported-by: Krzysztof Urbaniak Signed-off-by: Steven Whitehouse --- fs/gfs2/log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index cde1248a622..ac750bd31a6 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -932,7 +932,7 @@ int gfs2_logd(void *data) do { prepare_to_wait(&sdp->sd_logd_waitq, &wait, - TASK_UNINTERRUPTIBLE); + TASK_INTERRUPTIBLE); if (!gfs2_ail_flush_reqd(sdp) && !gfs2_jrnl_flush_reqd(sdp) && !kthread_should_stop()) -- cgit v1.2.3 From e835124c2be289515b918f2688ced4249e2de566 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 17 Sep 2010 08:03:08 -0700 Subject: ceph: only send one flushsnap per cap_snap per mds session Sending multiple flushsnap messages is problematic because we ignore the response if the tid doesn't match, and the server may only respond to each one once. It's also a waste. So, skip cap_snaps that are already on the flushing list, unless the caller tells us to resend (because we are reconnecting). Signed-off-by: Sage Weil --- fs/ceph/caps.c | 19 +++++++++++++++---- fs/ceph/snap.c | 2 +- fs/ceph/super.h | 3 ++- 3 files changed, 18 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 9fbe9019155..b01c316a814 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1195,10 +1195,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, * asynchronously back to the MDS once sync writes complete and dirty * data is written out. * + * Unless @again is true, skip cap_snaps that were already sent to + * the MDS (i.e., during this session). + * * Called under i_lock. Takes s_mutex as needed. */ void __ceph_flush_snaps(struct ceph_inode_info *ci, - struct ceph_mds_session **psession) + struct ceph_mds_session **psession, + int again) __releases(ci->vfs_inode->i_lock) __acquires(ci->vfs_inode->i_lock) { @@ -1240,6 +1244,13 @@ retry: dout("no auth cap (migrating?), doing nothing\n"); goto out; } + + /* only flush each capsnap once */ + if (!again && !list_empty(&capsnap->flushing_item)) { + dout("already flushed %p, skipping\n", capsnap); + continue; + } + mds = ci->i_auth_cap->session->s_mds; mseq = ci->i_auth_cap->mseq; @@ -1314,7 +1325,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) struct inode *inode = &ci->vfs_inode; spin_lock(&inode->i_lock); - __ceph_flush_snaps(ci, NULL); + __ceph_flush_snaps(ci, NULL, 0); spin_unlock(&inode->i_lock); } @@ -1477,7 +1488,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, /* flush snaps first time around only */ if (!list_empty(&ci->i_cap_snaps)) - __ceph_flush_snaps(ci, &session); + __ceph_flush_snaps(ci, &session, 0); goto retry_locked; retry: spin_lock(&inode->i_lock); @@ -1894,7 +1905,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, if (cap && cap->session == session) { dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, cap, capsnap); - __ceph_flush_snaps(ci, &session); + __ceph_flush_snaps(ci, &session, 1); } else { pr_err("%p auth cap %p not mds%d ???\n", inode, cap, session->s_mds); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 9e6eef14b7d..190b6c4a6f2 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -717,7 +717,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) igrab(inode); spin_unlock(&mdsc->snap_flush_lock); spin_lock(&inode->i_lock); - __ceph_flush_snaps(ci, &session); + __ceph_flush_snaps(ci, &session, 0); spin_unlock(&inode->i_lock); iput(inode); spin_lock(&mdsc->snap_flush_lock); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c80bfbe27b0..b87638e84c4 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -828,7 +828,8 @@ extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc); extern void __ceph_flush_snaps(struct ceph_inode_info *ci, - struct ceph_mds_session **psession); + struct ceph_mds_session **psession, + int again); extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session); extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); -- cgit v1.2.3 From a43fb73101eaf6db0b33d22c152b338ab8b3edbb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 17 Sep 2010 09:54:08 -0700 Subject: ceph: check mapping to determine if FILE_CACHE cap is used See if the i_data mapping has any pages to determine if the FILE_CACHE capability is currently in use, instead of assuming it is any time the rdcache_gen value is set (i.e., issued -> used). This allows the MDS RECALL_STATE process work for inodes that have cached pages. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b01c316a814..73c153092f7 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -814,7 +814,7 @@ int __ceph_caps_used(struct ceph_inode_info *ci) used |= CEPH_CAP_PIN; if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; - if (ci->i_rdcache_ref || ci->i_rdcache_gen) + if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; -- cgit v1.2.3 From be4f104dfd3b5e3ae262bff607965cfc38027dec Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 17 Sep 2010 12:30:31 -0700 Subject: ceph: select CRYPTO We select CRYPTO_AES, but not CRYPTO. Signed-off-by: Sage Weil --- fs/ceph/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index bc87b9c1d27..0fcd2640c23 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -3,6 +3,7 @@ config CEPH_FS depends on INET && EXPERIMENTAL select LIBCRC32C select CRYPTO_AES + select CRYPTO help Choose Y or M here to include support for mounting the experimental Ceph distributed file system. Ceph is an extremely -- cgit v1.2.3 From 50aff040363d31f87e94f38f1710973d99489951 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 21 Aug 2010 14:40:20 +0800 Subject: ocfs2/net: fix uninitialized ret in o2net_send_message_vec() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mmotm/fs/ocfs2/cluster/tcp.c: In function ‘o2net_send_message_vec’: mmotm/fs/ocfs2/cluster/tcp.c:980:6: warning: ‘ret’ may be used uninitialized in this function It seems a real bug introduced by commit 9af0b38ff3 (ocfs2/net: Use wait_event() in o2net_send_message_vec()). cc: Sunil Mushran Signed-off-by: Wu Fengguang Signed-off-by: Joel Becker --- fs/ocfs2/cluster/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 1361997cf20..cbe2f057cc2 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -977,7 +977,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn, int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, size_t caller_veclen, u8 target_node, int *status) { - int ret; + int ret = 0; struct o2net_msg *msg = NULL; size_t veclen, caller_bytes = 0; struct kvec *vec = NULL; -- cgit v1.2.3 From 112d421df2fddc0278584b084f4fcfedd144c5f4 Mon Sep 17 00:00:00 2001 From: Jan Harkes Date: Fri, 17 Sep 2010 23:26:01 -0400 Subject: Coda: mount hangs because of missed REQ_WRITE rename Coda's REQ_* defines were renamed to avoid clashes with the block layer (commit 4aeefdc69f7b: "coda: fixup clash with block layer REQ_* defines"). However one was missed and response messages are no longer matched with requests and waiting threads are no longer woken up. This patch fixes this. Signed-off-by: Jan Harkes [ Also fixed up whitespace while at it -Linus ] Signed-off-by: Linus Torvalds --- fs/coda/psdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index de89645777c..116af7546cf 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -184,8 +184,8 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, } /* adjust outsize. is this useful ?? */ - req->uc_outSize = nbytes; - req->uc_flags |= REQ_WRITE; + req->uc_outSize = nbytes; + req->uc_flags |= CODA_REQ_WRITE; count = nbytes; /* Convert filedescriptor into a file handle */ -- cgit v1.2.3 From 371d217ee1ff8b418b8f73fb2a34990f951ec2d4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Sep 2010 11:49:01 +0200 Subject: char: Mark /dev/zero and /dev/kmem as not capable of writeback These devices don't do any writeback but their device inodes still can get dirty so mark bdi appropriately so that bdi code does the right thing and files inodes to lists of bdi carrying the device inodes. Cc: stable@kernel.org Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/char/mem.c | 3 ++- fs/char_dev.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/drivers/char/mem.c b/drivers/char/mem.c index a398ecdbd75..1f528fad351 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -788,10 +788,11 @@ static const struct file_operations zero_fops = { /* * capabilities for /dev/zero * - permits private mappings, "copies" are taken of the source of zeros + * - no writeback happens */ static struct backing_dev_info zero_bdi = { .name = "char/mem", - .capabilities = BDI_CAP_MAP_COPY, + .capabilities = BDI_CAP_MAP_COPY | BDI_CAP_NO_ACCT_AND_WRITEBACK, }; static const struct file_operations full_fops = { diff --git a/fs/char_dev.c b/fs/char_dev.c index f80a4f25123..143d393881c 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -40,7 +40,9 @@ struct backing_dev_info directly_mappable_cdev_bdi = { #endif /* permit direct mmap, for read, write or exec */ BDI_CAP_MAP_DIRECT | - BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP), + BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP | + /* no writeback happens */ + BDI_CAP_NO_ACCT_AND_WRITEBACK), }; static struct kobj_map *cdev_map; -- cgit v1.2.3 From 692ebd17c2905313fff3c504c249c6a0faad16ec Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Sep 2010 11:51:01 +0200 Subject: bdi: Fix warnings in __mark_inode_dirty for /dev/zero and friends Inodes of devices such as /dev/zero can get dirty for example via utime(2) syscall or due to atime update. Backing device of such inodes (zero_bdi, etc.) is however unable to handle dirty inodes and thus __mark_inode_dirty complains. In fact, inode should be rather dirtied against backing device of the filesystem holding it. This is generally a good rule except for filesystems such as 'bdev' or 'mtd_inodefs'. Inodes in these pseudofilesystems are referenced from ordinary filesystem inodes and carry mapping with real data of the device. Thus for these inodes we have to use inode->i_mapping->backing_dev_info as we did so far. We distinguish these filesystems by checking whether sb->s_bdi points to a non-trivial backing device or not. Example: Assume we have an ext3 filesystem on /dev/sda1 mounted on /. There's a device inode A described by a path "/dev/sdb" on this filesystem. This inode will be dirtied against backing device "8:0" after this patch. bdev filesystem contains block device inode B coupled with our inode A. When someone modifies a page of /dev/sdb, it's B that gets dirtied and the dirtying happens against the backing device "8:16". Thus both inodes get filed to a correct bdi list. Cc: stable@kernel.org Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 81e086d8aa5..5581122bd2c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -52,8 +52,6 @@ struct wb_writeback_work { #define CREATE_TRACE_POINTS #include -#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) - /* * We don't actually have pdflush, but this one is exported though /proc... */ @@ -71,6 +69,27 @@ int writeback_in_progress(struct backing_dev_info *bdi) return test_bit(BDI_writeback_running, &bdi->state); } +static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; + + /* + * For inodes on standard filesystems, we use superblock's bdi. For + * inodes on virtual filesystems, we want to use inode mapping's bdi + * because they can possibly point to something useful (think about + * block_dev filesystem). + */ + if (sb->s_bdi && sb->s_bdi != &noop_backing_dev_info) { + /* Some device inodes could play dirty tricks. Catch them... */ + WARN(bdi != sb->s_bdi && bdi_cap_writeback_dirty(bdi), + "Dirtiable inode bdi %s != sb bdi %s\n", + bdi->name, sb->s_bdi->name); + return sb->s_bdi; + } + return bdi; +} + static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { -- cgit v1.2.3 From 767b68e96993e29e3480d7ecdd9c4b84667c5762 Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Wed, 22 Sep 2010 14:32:56 -0400 Subject: Prevent freeing uninitialized pointer in compat_do_readv_writev In 32-bit compatibility mode, the error handling for compat_do_readv_writev() may free an uninitialized pointer, potentially leading to all sorts of ugly memory corruption. This is reliably triggerable by unprivileged users by invoking the readv()/writev() syscalls with an invalid iovec pointer. The below patch fixes this to emulate the non-compat version. Introduced by commit b83733639a49 ("compat: factor out compat_rw_copy_check_uvector from compat_do_readv_writev") Signed-off-by: Dan Rosenberg Cc: stable@kernel.org (2.6.35) Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index 718c7062aec..0644a154672 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1153,7 +1153,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, { compat_ssize_t tot_len; struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov; + struct iovec *iov = iovstack; ssize_t ret; io_fn_t fn; iov_fn_t fnv; -- cgit v1.2.3 From c227e69028473c7c7994a9b0a2cc0034f3f7e0fe Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 22 Sep 2010 13:04:54 -0700 Subject: /proc/vmcore: fix seeking Commit 73296bc611 ("procfs: Use generic_file_llseek in /proc/vmcore") broke seeking on /proc/vmcore. This changes it back to use default_llseek in order to restore the original behaviour. The problem with generic_file_llseek is that it only allows seeks up to inode->i_sb->s_maxbytes, which is zero on procfs and some other virtual file systems. We should merge generic_file_llseek and default_llseek some day and clean this up in a proper way, but for 2.6.35/36, reverting vmcore is the safer solution. Signed-off-by: Arnd Bergmann Cc: Frederic Weisbecker Reported-by: CAI Qian Tested-by: CAI Qian Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 91c817ff02c..2367fb3f70b 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -163,7 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, static const struct file_operations proc_vmcore_operations = { .read = read_vmcore, - .llseek = generic_file_llseek, + .llseek = default_llseek, }; static struct vmcore* __init get_new_element(void) -- cgit v1.2.3 From a0c42bac79731276c9b2f28d54f9e658fcf843a2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Sep 2010 13:05:03 -0700 Subject: aio: do not return ERESTARTSYS as a result of AIO OCFS2 can return ERESTARTSYS from its write function when the process is signalled while waiting for a cluster lock (and the filesystem is mounted with intr mount option). Generally, it seems reasonable to allow filesystems to return this error code from its IO functions. As we must not leak ERESTARTSYS (and similar error codes) to userspace as a result of an AIO operation, we have to properly convert it to EINTR inside AIO code (restarting the syscall isn't really an option because other AIO could have been already submitted by the same io_submit syscall). Signed-off-by: Jan Kara Reviewed-by: Jeff Moyer Cc: Christoph Hellwig Cc: Zach Brown Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 1320b2a05fb..250b0a73c8a 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -712,8 +712,16 @@ static ssize_t aio_run_iocb(struct kiocb *iocb) */ ret = retry(iocb); - if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) + if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) { + /* + * There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || + ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)) + ret = -EINTR; aio_complete(iocb, ret, 0); + } out: spin_lock_irq(&ctx->ctx_lock); -- cgit v1.2.3 From 1c2499ae87f828eabddf6483b0dfc11da1100c07 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 22 Sep 2010 13:05:06 -0700 Subject: /proc/pid/smaps: fix dirty pages accounting Currently, /proc//smaps has wrong dirty pages accounting. Shared_Dirty and Private_Dirty output only pte dirty pages and ignore PG_dirty page flag. It is difference against documentation, but also inconsistent against Referenced field. (Referenced checks both pte and page flags) This patch fixes it. Test program: large-array.c --------------------------------------------------- #include #include #include #include char array[1*1024*1024*1024L]; int main(void) { memset(array, 1, sizeof(array)); pause(); return 0; } --------------------------------------------------- Test case: 1. run ./large-array 2. cat /proc/`pidof large-array`/smaps 3. swapoff -a 4. cat /proc/`pidof large-array`/smaps again Test result: 00601000-40601000 rw-p 00000000 00:00 0 Size: 1048576 kB Rss: 1048576 kB Pss: 1048576 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 218992 kB <-- showed pages as clean incorrectly Private_Dirty: 829584 kB Referenced: 388364 kB Swap: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB 00601000-40601000 rw-p 00000000 00:00 0 Size: 1048576 kB Rss: 1048576 kB Pss: 1048576 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 1048576 kB <-- fixed Referenced: 388480 kB Swap: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Signed-off-by: KOSAKI Motohiro Acked-by: Hugh Dickins Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 271afc48b9a..1dbca4e8cc1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -363,13 +363,13 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, mss->referenced += PAGE_SIZE; mapcount = page_mapcount(page); if (mapcount >= 2) { - if (pte_dirty(ptent)) + if (pte_dirty(ptent) || PageDirty(page)) mss->shared_dirty += PAGE_SIZE; else mss->shared_clean += PAGE_SIZE; mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; } else { - if (pte_dirty(ptent)) + if (pte_dirty(ptent) || PageDirty(page)) mss->private_dirty += PAGE_SIZE; else mss->private_clean += PAGE_SIZE; -- cgit v1.2.3 From 12828061cdacfb1db3eb03fd71952d5ebc555bbb Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 13 Sep 2010 14:00:23 +0800 Subject: ocfs2: update ctime when changing the file's permission by setfacl In commit 30e2bab, ext3 fixed it. So change it accordingly in ocfs2. Steps to reproduce: # touch aaa # stat -c %Z aaa 1283760364 # setfacl -m 'u::x,g::x,o::x' aaa # stat -c %Z aaa 1283760364 Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/acl.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index a76e0aa5cd3..391915093fe 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -209,7 +209,10 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh, } inode->i_mode = new_mode; + inode->i_ctime = CURRENT_TIME; di->i_mode = cpu_to_le16(inode->i_mode); + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); ocfs2_journal_dirty(handle, di_bh); -- cgit v1.2.3 From 47dea423799d98c53793237ab386a94976f305d5 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 13 Sep 2010 15:13:50 +0800 Subject: ocfs2: Use cpu_to_le16 for e_leaf_clusters in ocfs2_bg_discontig_add_extent. e_leaf_clusters is a le16, so use cpu_to_le16 instead of cpu_to_le32. What's more, we change 'clusters' to unsigned int to signify that the size of 'clusters' isn't important here. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/suballoc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 8a286f54dca..849c2f0e0a0 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -357,7 +357,7 @@ out: static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, struct ocfs2_group_desc *bg, struct ocfs2_chain_list *cl, - u64 p_blkno, u32 clusters) + u64 p_blkno, unsigned int clusters) { struct ocfs2_extent_list *el = &bg->bg_list; struct ocfs2_extent_rec *rec; @@ -369,7 +369,7 @@ static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, rec->e_blkno = cpu_to_le64(p_blkno); rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc)); - rec->e_leaf_clusters = cpu_to_le32(clusters); + rec->e_leaf_clusters = cpu_to_le16(clusters); le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc)); le16_add_cpu(&bg->bg_free_bits_count, clusters * le16_to_cpu(cl->cl_bpc)); -- cgit v1.2.3 From 4a452de4fdfe4dbb27e491904d8bfaf1262bdff4 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sun, 19 Sep 2010 13:42:28 +0800 Subject: ocfs2: Move 'wanted' into parens of ocfs2_resmap_resv_bits. The first time I read the function ocfs2_resmap_resv_bits, I consider about what 'wanted' will be used and consider about the comments. Then I find it is only used if the reservation is empty. ;) So we'd better move it to the parens so that it make the code more readable, what's more, ocfs2_resmap_resv_bits is used so frequently and we should save some cpus. Acked-by: Mark Fasheh Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/reservations.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index d8b6e4259b8..3e78db361bc 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -732,25 +732,23 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, struct ocfs2_alloc_reservation *resv, int *cstart, int *clen) { - unsigned int wanted = *clen; - if (resv == NULL || ocfs2_resmap_disabled(resmap)) return -ENOSPC; spin_lock(&resv_lock); - /* - * We don't want to over-allocate for temporary - * windows. Otherwise, we run the risk of fragmenting the - * allocation space. - */ - wanted = ocfs2_resv_window_bits(resmap, resv); - if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen) - wanted = *clen; - if (ocfs2_resv_empty(resv)) { - mlog(0, "empty reservation, find new window\n"); + /* + * We don't want to over-allocate for temporary + * windows. Otherwise, we run the risk of fragmenting the + * allocation space. + */ + unsigned int wanted = ocfs2_resv_window_bits(resmap, resv); + if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen) + wanted = *clen; + + mlog(0, "empty reservation, find new window\n"); /* * Try to get a window here. If it works, we must fall * through and test the bitmap . This avoids some -- cgit v1.2.3 From 0000b862027d624ac564609b87c1aa4d14dd1e46 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sun, 19 Sep 2010 13:42:29 +0800 Subject: ocfs2: Sync inode flags with ext2. We sync our inode flags with ext2 and define them by hex values. But actually in commit 3669567(4 years ago), all these values are moved to include/linux/fs.h. So we'd better also use them as what ext2 did. So sync our inode flags with ext2 by using FS_*. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/ocfs2_fs.h | 37 +++++++++++++++++++++++++------------ fs/ocfs2/ocfs2_ioctl.h | 8 ++++---- 2 files changed, 29 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 33f1c9a8258..fa31d05e41b 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -235,18 +235,31 @@ #define OCFS2_HAS_REFCOUNT_FL (0x0010) /* Inode attributes, keep in sync with EXT2 */ -#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */ -#define OCFS2_UNRM_FL (0x00000002) /* Undelete */ -#define OCFS2_COMPR_FL (0x00000004) /* Compress file */ -#define OCFS2_SYNC_FL (0x00000008) /* Synchronous updates */ -#define OCFS2_IMMUTABLE_FL (0x00000010) /* Immutable file */ -#define OCFS2_APPEND_FL (0x00000020) /* writes to file may only append */ -#define OCFS2_NODUMP_FL (0x00000040) /* do not dump file */ -#define OCFS2_NOATIME_FL (0x00000080) /* do not update atime */ -#define OCFS2_DIRSYNC_FL (0x00010000) /* dirsync behaviour (directories only) */ - -#define OCFS2_FL_VISIBLE (0x000100FF) /* User visible flags */ -#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ +#define OCFS2_SECRM_FL FS_SECRM_FL /* Secure deletion */ +#define OCFS2_UNRM_FL FS_UNRM_FL /* Undelete */ +#define OCFS2_COMPR_FL FS_COMPR_FL /* Compress file */ +#define OCFS2_SYNC_FL FS_SYNC_FL /* Synchronous updates */ +#define OCFS2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */ +#define OCFS2_APPEND_FL FS_APPEND_FL /* writes to file may only append */ +#define OCFS2_NODUMP_FL FS_NODUMP_FL /* do not dump file */ +#define OCFS2_NOATIME_FL FS_NOATIME_FL /* do not update atime */ +/* Reserved for compression usage... */ +#define OCFS2_DIRTY_FL FS_DIRTY_FL +#define OCFS2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */ +#define OCFS2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */ +#define OCFS2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */ +/* End compression flags --- maybe not all used */ +#define OCFS2_BTREE_FL FS_BTREE_FL /* btree format dir */ +#define OCFS2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */ +#define OCFS2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */ +#define OCFS2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */ +#define OCFS2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */ +#define OCFS2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */ +#define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/ +#define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */ + +#define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */ +#define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */ /* * Extent record flags (e_node.leaf.flags) diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h index 2d3420af1a8..5d241505690 100644 --- a/fs/ocfs2/ocfs2_ioctl.h +++ b/fs/ocfs2/ocfs2_ioctl.h @@ -23,10 +23,10 @@ /* * ioctl commands */ -#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) -#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long) -#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int) -#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) +#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS +#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS +#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS /* * Space reservation / allocation / free ioctls and argument structure -- cgit v1.2.3 From 5dad6c39d156fbbde0b0ef170d9173feffdeb546 Mon Sep 17 00:00:00 2001 From: Srinivas Eeda Date: Tue, 21 Sep 2010 16:27:26 -0700 Subject: o2dlm: force free mles during dlm exit While umounting, a block mle doesn't get freed if dlm is shutdown after master request is received but before assert master. This results in unclean shutdown of dlm domain. This patch frees all mles that lie around after other nodes were notified about exiting the dlm and marking dlm state as leaving. Only block mles are expected to be around, so we log ERROR for other mles but still free them. Signed-off-by: Srinivas Eeda Signed-off-by: Joel Becker --- fs/ocfs2/dlm/dlmcommon.h | 1 + fs/ocfs2/dlm/dlmdomain.c | 1 + fs/ocfs2/dlm/dlmmaster.c | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 4b6ae2c13b4..765298908f1 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -1030,6 +1030,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node); +void dlm_force_free_mles(struct dlm_ctxt *dlm); int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); int __dlm_lockres_has_locks(struct dlm_lock_resource *res); int __dlm_lockres_unused(struct dlm_lock_resource *res); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 153abb5abef..11a5c87fd7f 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -693,6 +693,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) dlm_mark_domain_leaving(dlm); dlm_leave_domain(dlm); + dlm_force_free_mles(dlm); dlm_complete_dlm_shutdown(dlm); } dlm_put(dlm); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index ffb4c68dafa..f564b0e5f80 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -3433,3 +3433,43 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm, wake_up(&res->wq); wake_up(&dlm->migration_wq); } + +void dlm_force_free_mles(struct dlm_ctxt *dlm) +{ + int i; + struct hlist_head *bucket; + struct dlm_master_list_entry *mle; + struct hlist_node *tmp, *list; + + /* + * We notified all other nodes that we are exiting the domain and + * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still + * around we force free them and wake any processes that are waiting + * on the mles + */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + + BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING); + BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES)); + + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = dlm_master_hash(dlm, i); + hlist_for_each_safe(list, tmp, bucket) { + mle = hlist_entry(list, struct dlm_master_list_entry, + master_hash_node); + if (mle->type != DLM_MLE_BLOCK) { + mlog(ML_ERROR, "bad mle: %p\n", mle); + dlm_print_one_mle(mle); + } + atomic_set(&mle->woken, 1); + wake_up(&mle->wq); + + __dlm_unlink_mle(dlm, mle); + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); + } + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); +} -- cgit v1.2.3 From 80168676ebfe4af51407d30f336d67f082d45201 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 24 Sep 2010 18:13:44 +1000 Subject: xfs: force background CIL push under sustained load I have been seeing occasional pauses in transaction throughput up to 30s long under heavy parallel workloads. The only notable thing was that the xfsaild was trying to be active during the pauses, but making no progress. It was running exactly 20 times a second (on the 50ms no-progress backoff), and the number of pushbuf events was constant across this time as well. IOWs, the xfsaild appeared to be stuck on buffers that it could not push out. Further investigation indicated that it was trying to push out inode buffers that were pinned and/or locked. The xfsbufd was also getting woken at the same frequency (by the xfsaild, no doubt) to push out delayed write buffers. The xfsbufd was not making any progress because all the buffers in the delwri queue were pinned. This scan- and-make-no-progress dance went one in the trace for some seconds, before the xfssyncd came along an issued a log force, and then things started going again. However, I noticed something strange about the log force - there were way too many IO's issued. 516 log buffers were written, to be exact. That added up to 129MB of log IO, which got me very interested because it's almost exactly 25% of the size of the log. He delayed logging code is suppose to aggregate the minimum of 25% of the log or 8MB worth of changes before flushing. That's what really puzzled me - why did a log force write 129MB instead of only 8MB? Essentially what has happened is that no CIL pushes had occurred since the previous tail push which cleared out 25% of the log space. That caused all the new transactions to block because there wasn't log space for them, but they kick the xfsaild to push the tail. However, the xfsaild was not making progress because there were buffers it could not lock and flush, and the xfsbufd could not flush them because they were pinned. As a result, both the xfsaild and the xfsbufd could not move the tail of the log forward without the CIL first committing. The cause of the problem was that the background CIL push, which should happen when 8MB of aggregated changes have been committed, is being held off by the concurrent transaction commit load. The background push does a down_write_trylock() which will fail if there is a concurrent transaction commit holding the push lock in read mode. With 8 CPUs all doing transactions as fast as they can, there was enough concurrent transaction commits to hold off the background push until tail-pushing could no longer free log space, and the halt would occur. It should be noted that there is no reason why it would halt at 25% of log space used by a single CIL checkpoint. This bug could definitely violate the "no transaction should be larger than half the log" requirement and hence result in corruption if the system crashed under heavy load. This sort of bug is exactly the reason why delayed logging was tagged as experimental.... The fix is to start blocking background pushes once the threshold has been exceeded. Rework the threshold calculations to keep the amount of log space a CIL checkpoint can use to below that of the AIL push threshold to avoid the problem completely. Signed-off-by: Dave Chinner Reviewed-by: Alex Elder Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_log_cil.c | 12 +++++++++--- fs/xfs/xfs_log_priv.h | 37 +++++++++++++++++++++---------------- 2 files changed, 30 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ed575fb4b49..7e206fc1fa3 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -405,9 +405,15 @@ xlog_cil_push( new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); - /* lock out transaction commit, but don't block on background push */ + /* + * Lock out transaction commit, but don't block for background pushes + * unless we are well over the CIL space limit. See the definition of + * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic + * used here. + */ if (!down_write_trylock(&cil->xc_ctx_lock)) { - if (!push_seq) + if (!push_seq && + cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log)) goto out_free_ticket; down_write(&cil->xc_ctx_lock); } @@ -422,7 +428,7 @@ xlog_cil_push( goto out_skip; /* check for a previously pushed seqeunce */ - if (push_seq < cil->xc_ctx->sequence) + if (push_seq && push_seq < cil->xc_ctx->sequence) goto out_skip; /* diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index ced52b98b32..edcdfe01617 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -426,13 +426,13 @@ struct xfs_cil { }; /* - * The amount of log space we should the CIL to aggregate is difficult to size. - * Whatever we chose we have to make we can get a reservation for the log space - * effectively, that it is large enough to capture sufficient relogging to - * reduce log buffer IO significantly, but it is not too large for the log or - * induces too much latency when writing out through the iclogs. We track both - * space consumed and the number of vectors in the checkpoint context, so we - * need to decide which to use for limiting. + * The amount of log space we allow the CIL to aggregate is difficult to size. + * Whatever we choose, we have to make sure we can get a reservation for the + * log space effectively, that it is large enough to capture sufficient + * relogging to reduce log buffer IO significantly, but it is not too large for + * the log or induces too much latency when writing out through the iclogs. We + * track both space consumed and the number of vectors in the checkpoint + * context, so we need to decide which to use for limiting. * * Every log buffer we write out during a push needs a header reserved, which * is at least one sector and more for v2 logs. Hence we need a reservation of @@ -459,16 +459,21 @@ struct xfs_cil { * checkpoint transaction ticket is specific to the checkpoint context, rather * than the CIL itself. * - * With dynamic reservations, we can basically make up arbitrary limits for the - * checkpoint size so long as they don't violate any other size rules. Hence - * the initial maximum size for the checkpoint transaction will be set to a - * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit - * right now based on the latency of writing out a large amount of data through - * the circular iclog buffers. + * With dynamic reservations, we can effectively make up arbitrary limits for + * the checkpoint size so long as they don't violate any other size rules. + * Recovery imposes a rule that no transaction exceed half the log, so we are + * limited by that. Furthermore, the log transaction reservation subsystem + * tries to keep 25% of the log free, so we need to keep below that limit or we + * risk running out of free log space to start any new transactions. + * + * In order to keep background CIL push efficient, we will set a lower + * threshold at which background pushing is attempted without blocking current + * transaction commits. A separate, higher bound defines when CIL pushes are + * enforced to ensure we stay within our maximum checkpoint size bounds. + * threshold, yet give us plenty of space for aggregation on large logs. */ - -#define XLOG_CIL_SPACE_LIMIT(log) \ - (min((log->l_logsize >> 2), (8 * 1024 * 1024))) +#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) +#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4)) /* * The reservation head lsn is not made up of a cycle number and block number. -- cgit v1.2.3 From 522440ed55d2cc8855ea5f82bc067e0483b2e1be Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 29 Sep 2010 09:49:54 -0400 Subject: cifs: set backing_dev_info on new S_ISREG inodes Testing on very recent kernel (2.6.36-rc6) made this warning pop: WARNING: at fs/fs-writeback.c:87 inode_to_bdi+0x65/0x70() Hardware name: Dirtiable inode bdi default != sb bdi cifs ...the following patch fixes it and seems to be the obviously correct thing to do for cifs. Cc: stable@kernel.org Acked-by: Dave Kleikamp Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 93f77d438d3..53cce8cc222 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -801,6 +801,8 @@ retry_iget5_locked: inode->i_flags |= S_NOATIME | S_NOCMTIME; if (inode->i_state & I_NEW) { inode->i_ino = hash; + if (S_ISREG(inode->i_mode)) + inode->i_data.backing_dev_info = sb->s_bdi; #ifdef CONFIG_CIFS_FSCACHE /* initialize per-inode cache cookie pointer */ CIFS_I(inode)->fscache = NULL; -- cgit v1.2.3 From 1fc8a117865b54590acd773a55fbac9221b018f0 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Wed, 29 Sep 2010 17:33:05 -0700 Subject: ocfs2: Don't walk off the end of fast symlinks. ocfs2 fast symlinks are NUL terminated strings stored inline in the inode data area. However, disk corruption or a local attacker could, in theory, remove that NUL. Because we're using strlen() (my fault, introduced in a731d1 when removing vfs_follow_link()), we could walk off the end of that string. Signed-off-by: Joel Becker Cc: stable@kernel.org --- fs/ocfs2/symlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 32499d213fc..9975457c981 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -128,7 +128,7 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry, } /* Fast symlinks can't be large */ - len = strlen(target); + len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb)); link = kzalloc(len + 1, GFP_NOFS); if (!link) { status = -ENOMEM; -- cgit v1.2.3 From f569599ae70f0899035f8d5876a7939f629c5976 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 29 Sep 2010 15:27:08 -0400 Subject: cifs: prevent infinite recursion in cifs_reconnect_tcon cifs_reconnect_tcon is called from smb_init. After a successful reconnect, cifs_reconnect_tcon will call reset_cifs_unix_caps. That function will, in turn call CIFSSMBQFSUnixInfo and CIFSSMBSetFSUnixInfo. Those functions also call smb_init. It's possible for the session and tcon reconnect to succeed, and then for another cifs_reconnect to occur before CIFSSMBQFSUnixInfo or CIFSSMBSetFSUnixInfo to be called. That'll cause those functions to call smb_init and cifs_reconnect_tcon again, ad infinitum... Break the infinite recursion by having those functions use a new smb_init variant that doesn't attempt to perform a reconnect. Reported-and-Tested-by: Michal Suchanek Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index c65c3419dd3..7e83b356cc9 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -232,7 +232,7 @@ static int small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, void **request_buf) { - int rc = 0; + int rc; rc = cifs_reconnect_tcon(tcon, smb_command); if (rc) @@ -250,7 +250,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, if (tcon != NULL) cifs_stats_inc(&tcon->num_smbs_sent); - return rc; + return 0; } int @@ -281,16 +281,9 @@ small_smb_init_no_tc(const int smb_command, const int wct, /* If the return code is zero, this function must fill in request_buf pointer */ static int -smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, - void **request_buf /* returned */ , - void **response_buf /* returned */ ) +__smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, + void **request_buf, void **response_buf) { - int rc = 0; - - rc = cifs_reconnect_tcon(tcon, smb_command); - if (rc) - return rc; - *request_buf = cifs_buf_get(); if (*request_buf == NULL) { /* BB should we add a retry in here if not a writepage? */ @@ -309,7 +302,31 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, if (tcon != NULL) cifs_stats_inc(&tcon->num_smbs_sent); - return rc; + return 0; +} + +/* If the return code is zero, this function must fill in request_buf pointer */ +static int +smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, + void **request_buf, void **response_buf) +{ + int rc; + + rc = cifs_reconnect_tcon(tcon, smb_command); + if (rc) + return rc; + + return __smb_init(smb_command, wct, tcon, request_buf, response_buf); +} + +static int +smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon, + void **request_buf, void **response_buf) +{ + if (tcon->ses->need_reconnect || tcon->need_reconnect) + return -EHOSTDOWN; + + return __smb_init(smb_command, wct, tcon, request_buf, response_buf); } static int validate_t2(struct smb_t2_rsp *pSMB) @@ -4534,8 +4551,8 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon) cFYI(1, "In QFSUnixInfo"); QFSUnixRetry: - rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, - (void **) &pSMBr); + rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon, + (void **) &pSMB, (void **) &pSMBr); if (rc) return rc; @@ -4604,8 +4621,8 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap) cFYI(1, "In SETFSUnixInfo"); SETFSUnixRetry: /* BB switch to small buf init to save memory */ - rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, - (void **) &pSMBr); + rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon, + (void **) &pSMB, (void **) &pSMBr); if (rc) return rc; -- cgit v1.2.3 From 3036e7b490bf7878c6dae952eec5fb87b1106589 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 30 Sep 2010 15:15:33 -0700 Subject: proc: make /proc/pid/limits world readable Having the limits file world readable will ease the task of system management on systems where root privileges might be restricted. Having admin restricted with root priviledges, he/she could not check other users process' limits. Also it'd align with most of the /proc stat files. Signed-off-by: Jiri Olsa Acked-by: Neil Horman Cc: Eugene Teo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index a1c43e7c8a7..8e4addaa542 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2675,7 +2675,7 @@ static const struct pid_entry tgid_base_stuff[] = { INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), - INF("limits", S_IRUSR, proc_pid_limits), + INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif @@ -3011,7 +3011,7 @@ static const struct pid_entry tid_base_stuff[] = { INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), - INF("limits", S_IRUSR, proc_pid_limits), + INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif -- cgit v1.2.3 From 3f259d092c7a2fdf217823e8f1838530adb0cdb0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 Sep 2010 15:15:37 -0700 Subject: reiserfs: fix dependency inversion between inode and reiserfs mutexes The reiserfs mutex already depends on the inode mutex, so we can't lock the inode mutex in reiserfs_unpack() without using the safe locking API, because reiserfs_unpack() is always called with the reiserfs mutex locked. This fixes: ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.35c #13 ------------------------------------------------------- lilo/1606 is trying to acquire lock: (&sb->s_type->i_mutex_key#8){+.+.+.}, at: [] reiserfs_unpack+0x60/0x110 [reiserfs] but task is already holding lock: (&REISERFS_SB(s)->lock){+.+.+.}, at: [] reiserfs_write_lock+0x28/0x40 [reiserfs] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&REISERFS_SB(s)->lock){+.+.+.}: [] lock_acquire+0x67/0x80 [] __mutex_lock_common+0x4d/0x410 [] mutex_lock_nested+0x18/0x20 [] reiserfs_write_lock+0x28/0x40 [reiserfs] [] reiserfs_lookup_privroot+0x2a/0x90 [reiserfs] [] reiserfs_fill_super+0x941/0xe60 [reiserfs] [] get_sb_bdev+0x117/0x170 [] get_super_block+0x21/0x30 [reiserfs] [] vfs_kern_mount+0x6a/0x1b0 [] do_kern_mount+0x39/0xe0 [] do_mount+0x340/0x790 [] sys_mount+0x84/0xb0 [] syscall_call+0x7/0xb -> #0 (&sb->s_type->i_mutex_key#8){+.+.+.}: [] __lock_acquire+0x1026/0x1180 [] lock_acquire+0x67/0x80 [] __mutex_lock_common+0x4d/0x410 [] mutex_lock_nested+0x18/0x20 [] reiserfs_unpack+0x60/0x110 [reiserfs] [] reiserfs_ioctl+0x272/0x320 [reiserfs] [] vfs_ioctl+0x28/0xa0 [] do_vfs_ioctl+0x32d/0x5c0 [] sys_ioctl+0x63/0x70 [] syscall_call+0x7/0xb other info that might help us debug this: 1 lock held by lilo/1606: #0: (&REISERFS_SB(s)->lock){+.+.+.}, at: [] reiserfs_write_lock+0x28/0x40 [reiserfs] stack backtrace: Pid: 1606, comm: lilo Not tainted 2.6.35c #13 Call Trace: [] __lock_acquire+0x1026/0x1180 [] lock_acquire+0x67/0x80 [] __mutex_lock_common+0x4d/0x410 [] mutex_lock_nested+0x18/0x20 [] reiserfs_unpack+0x60/0x110 [reiserfs] [] reiserfs_ioctl+0x272/0x320 [reiserfs] [] vfs_ioctl+0x28/0xa0 [] do_vfs_ioctl+0x32d/0x5c0 [] sys_ioctl+0x63/0x70 [] syscall_call+0x7/0xb Reported-by: Jarek Poplawski Tested-by: Jarek Poplawski Signed-off-by: Frederic Weisbecker Cc: Jeff Mahoney Cc: [2.6.32 and later] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index f53505de071..679d5029f50 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -188,7 +188,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) /* we need to make sure nobody is changing the file size beneath ** us */ - mutex_lock(&inode->i_mutex); + reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); reiserfs_write_lock(inode->i_sb); write_from = inode->i_size & (blocksize - 1); -- cgit v1.2.3 From 9d8117e72bf453dd9d85e0cd322ce4a0f8bccbc0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 Sep 2010 15:15:38 -0700 Subject: reiserfs: fix unwanted reiserfs lock recursion Prevent from recursively locking the reiserfs lock in reiserfs_unpack() because we may call journal_begin() that requires the lock to be taken only once, otherwise it won't be able to release the lock while taking other mutexes, ending up in inverted dependencies between the journal mutex and the reiserfs lock for example. This fixes: ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.35.4.4a #3 ------------------------------------------------------- lilo/1620 is trying to acquire lock: (&journal->j_mutex){+.+...}, at: [] do_journal_begin_r+0x7f/0x340 [reiserfs] but task is already holding lock: (&REISERFS_SB(s)->lock){+.+.+.}, at: [] reiserfs_write_lock+0x28/0x40 [reiserfs] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&REISERFS_SB(s)->lock){+.+.+.}: [] lock_acquire+0x67/0x80 [] __mutex_lock_common+0x4d/0x410 [] mutex_lock_nested+0x18/0x20 [] reiserfs_write_lock+0x28/0x40 [reiserfs] [] do_journal_begin_r+0x86/0x340 [reiserfs] [] journal_begin+0x77/0x140 [reiserfs] [] reiserfs_remount+0x224/0x530 [reiserfs] [] do_remount_sb+0x60/0x110 [] do_mount+0x625/0x790 [] sys_mount+0x84/0xb0 [] syscall_call+0x7/0xb -> #0 (&journal->j_mutex){+.+...}: [] __lock_acquire+0x1026/0x1180 [] lock_acquire+0x67/0x80 [] __mutex_lock_common+0x4d/0x410 [] mutex_lock_nested+0x18/0x20 [] do_journal_begin_r+0x7f/0x340 [reiserfs] [] journal_begin+0x77/0x140 [reiserfs] [] reiserfs_persistent_transaction+0x41/0x90 [reiserfs] [] reiserfs_get_block+0x22c/0x1530 [reiserfs] [] __block_prepare_write+0x1bb/0x3a0 [] block_prepare_write+0x26/0x40 [] reiserfs_prepare_write+0x88/0x170 [reiserfs] [] reiserfs_unpack+0xe6/0x120 [reiserfs] [] reiserfs_ioctl+0x272/0x320 [reiserfs] [] vfs_ioctl+0x28/0xa0 [] do_vfs_ioctl+0x32d/0x5c0 [] sys_ioctl+0x63/0x70 [] syscall_call+0x7/0xb other info that might help us debug this: 2 locks held by lilo/1620: #0: (&sb->s_type->i_mutex_key#8){+.+.+.}, at: [] reiserfs_unpack+0x6a/0x120 [reiserfs] #1: (&REISERFS_SB(s)->lock){+.+.+.}, at: [] reiserfs_write_lock+0x28/0x40 [reiserfs] stack backtrace: Pid: 1620, comm: lilo Not tainted 2.6.35.4.4a #3 Call Trace: [] __lock_acquire+0x1026/0x1180 [] lock_acquire+0x67/0x80 [] __mutex_lock_common+0x4d/0x410 [] mutex_lock_nested+0x18/0x20 [] do_journal_begin_r+0x7f/0x340 [reiserfs] [] journal_begin+0x77/0x140 [reiserfs] [] reiserfs_persistent_transaction+0x41/0x90 [reiserfs] [] reiserfs_get_block+0x22c/0x1530 [reiserfs] [] __block_prepare_write+0x1bb/0x3a0 [] block_prepare_write+0x26/0x40 [] reiserfs_prepare_write+0x88/0x170 [reiserfs] [] reiserfs_unpack+0xe6/0x120 [reiserfs] [] reiserfs_ioctl+0x272/0x320 [reiserfs] [] vfs_ioctl+0x28/0xa0 [] do_vfs_ioctl+0x32d/0x5c0 [] sys_ioctl+0x63/0x70 [] syscall_call+0x7/0xb Reported-by: Jarek Poplawski Tested-by: Jarek Poplawski Signed-off-by: Frederic Weisbecker Cc: Jeff Mahoney Cc: All since 2.6.32 Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/ioctl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 679d5029f50..5cbb81e134a 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -170,6 +170,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page, int reiserfs_unpack(struct inode *inode, struct file *filp) { int retval = 0; + int depth; int index; struct page *page; struct address_space *mapping; @@ -189,7 +190,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) ** us */ reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); - reiserfs_write_lock(inode->i_sb); + depth = reiserfs_write_lock_once(inode->i_sb); write_from = inode->i_size & (blocksize - 1); /* if we are on a block boundary, we are already unpacked. */ @@ -224,6 +225,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) out: mutex_unlock(&inode->i_mutex); - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, depth); return retval; } -- cgit v1.2.3 From 0157443c56bcc50be4933ebdff3ece723dfd535c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 30 Sep 2010 22:06:21 +0200 Subject: fuse: Initialize total_len in fuse_retrieve() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fs/fuse/dev.c:1357: warning: ‘total_len’ may be used uninitialized in this function Initialize total_len to zero, else its value will be undefined. Signed-off-by: Geert Uytterhoeven Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index d367af1514e..cde755cca56 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1354,7 +1354,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, loff_t file_size; unsigned int num; unsigned int offset; - size_t total_len; + size_t total_len = 0; req = fuse_get_req(fc); if (IS_ERR(req)) -- cgit v1.2.3 From aaead25b954879e1a708ff2f3602f494c18d20b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Oct 2010 14:25:33 +0200 Subject: writeback: always use sb->s_bdi for writeback purposes We currently use struct backing_dev_info for various different purposes. Originally it was introduced to describe a backing device which includes an unplug and congestion function and various bits of readahead information and VM-relevant flags. We're also using for tracking dirty inodes for writeback. To make writeback properly find all inodes we need to only access the per-filesystem backing_device pointed to by the superblock in ->s_bdi inside the writeback code, and not the instances pointeded to by inode->i_mapping->backing_dev which can be overriden by special devices or might not be set at all by some filesystems. Long term we should split out the writeback-relevant bits of struct backing_device_info (which includes more than the current bdi_writeback) and only point to it from the superblock while leaving the traditional backing device as a separate structure that can be overriden by devices. The one exception for now is the block device filesystem which really wants different writeback contexts for it's different (internal) inodes to handle the writeout more efficiently. For now we do this with a hack in fs-writeback.c because we're so late in the cycle, but in the future I plan to replace this with a superblock method that allows for multiple writeback contexts per filesystem. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5581122bd2c..ab38fef1c9a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -72,22 +72,11 @@ int writeback_in_progress(struct backing_dev_info *bdi) static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) { struct super_block *sb = inode->i_sb; - struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; - /* - * For inodes on standard filesystems, we use superblock's bdi. For - * inodes on virtual filesystems, we want to use inode mapping's bdi - * because they can possibly point to something useful (think about - * block_dev filesystem). - */ - if (sb->s_bdi && sb->s_bdi != &noop_backing_dev_info) { - /* Some device inodes could play dirty tricks. Catch them... */ - WARN(bdi != sb->s_bdi && bdi_cap_writeback_dirty(bdi), - "Dirtiable inode bdi %s != sb bdi %s\n", - bdi->name, sb->s_bdi->name); - return sb->s_bdi; - } - return bdi; + if (strcmp(sb->s_type->name, "bdev") == 0) + return inode->i_mapping->backing_dev_info; + + return sb->s_bdi; } static void bdi_queue_work(struct backing_dev_info *bdi, -- cgit v1.2.3 From 54b5187b5a1ad6573ade8b18e065dda92501fc52 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 7 Oct 2010 15:26:08 -0700 Subject: ocfs2/cluster: Add heartbeat mode configfs parameter Add heartbeat mode parameter to the configfs tree. This will be used to set/show the heartbeat mode. The user is free to toggle the mode between local and global as long as there is no active heartbeat region. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 72 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 41d5f1f92d5..4d36459a834 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -77,7 +77,19 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type); #define O2HB_DEFAULT_BLOCK_BITS 9 +enum o2hb_heartbeat_modes { + O2HB_HEARTBEAT_LOCAL = 0, + O2HB_HEARTBEAT_GLOBAL, + O2HB_HEARTBEAT_NUM_MODES, +}; + +char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { + "local", /* O2HB_HEARTBEAT_LOCAL */ + "global", /* O2HB_HEARTBEAT_GLOBAL */ +}; + unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; +unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; /* Only sets a new threshold if there are no active regions. * @@ -94,6 +106,22 @@ static void o2hb_dead_threshold_set(unsigned int threshold) } } +static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode) +{ + int ret = -1; + + if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) { + spin_lock(&o2hb_live_lock); + if (list_empty(&o2hb_all_regions)) { + o2hb_heartbeat_mode = hb_mode; + ret = 0; + } + spin_unlock(&o2hb_live_lock); + } + + return ret; +} + struct o2hb_node_event { struct list_head hn_item; enum o2hb_callback_type hn_event_type; @@ -1688,6 +1716,41 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group return count; } +static +ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group, + char *page) +{ + return sprintf(page, "%s\n", + o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]); +} + +static +ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, + const char *page, size_t count) +{ + unsigned int i; + int ret; + size_t len; + + len = (page[count - 1] == '\n') ? count - 1 : count; + if (!len) + return -EINVAL; + + for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) { + if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len)) + continue; + + ret = o2hb_global_hearbeat_mode_set(i); + if (!ret) + printk(KERN_NOTICE "ocfs2: Heartbeat mode set to %s\n", + o2hb_heartbeat_mode_desc[i]); + return count; + } + + return -EINVAL; + +} + static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { .attr = { .ca_owner = THIS_MODULE, .ca_name = "dead_threshold", @@ -1696,8 +1759,17 @@ static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold .store = o2hb_heartbeat_group_threshold_store, }; +static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "mode", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_heartbeat_group_mode_show, + .store = o2hb_heartbeat_group_mode_store, +}; + static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { &o2hb_heartbeat_group_attr_threshold.attr, + &o2hb_heartbeat_group_attr_mode.attr, NULL, }; -- cgit v1.2.3 From 98f486f23bc5b6a6fa90e1a0707b7e9fe0e7f3e4 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Sat, 9 Oct 2010 10:24:46 -0700 Subject: ocfs2: Add an incompat feature flag OCFS2_FEATURE_INCOMPAT_CLUSTERINFO OCFS2_FEATURE_INCOMPAT_CLUSTERINFO allows us to use sb->s_cluster_info for both userspace and o2cb cluster stacks. It also allows us to extend cluster info to include stack flags. This patch also adds stackflags to sb->s_clusterinfo. It also introduces a clusterinfo flag OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT to denote the enabled global heartbeat mode. This incompat flag can be set/cleared using tunefs.ocfs2 --fs-features. The clusterinfo flag is set/cleared using tunefs.ocfs2 --update-cluster-stack. Signed-off-by: Sunil Mushran --- fs/ocfs2/ocfs2.h | 31 +++++++++++++++++++++++++++++-- fs/ocfs2/ocfs2_fs.h | 40 ++++++++++++++++++++++++++++++++++------ fs/ocfs2/super.c | 4 +++- 3 files changed, 66 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index c67003b6b5a..d5496a792bd 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -368,6 +368,8 @@ struct ocfs2_super struct ocfs2_alloc_stats alloc_stats; char dev_str[20]; /* "major,minor" of the device */ + u8 osb_stackflags; + char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; struct ocfs2_cluster_connection *cconn; struct ocfs2_lock_res osb_super_lockres; @@ -601,10 +603,35 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) return ret; } -static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) +static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb) { return (osb->s_feature_incompat & - OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK); + (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK | + OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)); +} + +static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) +{ + if (ocfs2_clusterinfo_valid(osb) && + memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + return 1; + return 0; +} + +static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb) +{ + if (ocfs2_clusterinfo_valid(osb) && + !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + return 1; + return 0; +} + +static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) +{ + return ocfs2_o2cb_stack(osb) && + (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT); } static inline int ocfs2_mount_local(struct ocfs2_super *osb) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index fa31d05e41b..d5b1d99abc3 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -101,7 +101,8 @@ | OCFS2_FEATURE_INCOMPAT_META_ECC \ | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ - | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) + | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \ + | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) @@ -169,6 +170,13 @@ /* Discontigous block groups */ #define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 +/* + * Incompat bit to indicate useable clusterinfo with stackflags for all + * cluster stacks (userspace adnd o2cb). If this bit is set, + * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set. + */ +#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000 + /* * backup superblock flag is used to indicate that this volume * has backup superblocks. @@ -292,10 +300,13 @@ #define OCFS2_VOL_UUID_LEN 16 #define OCFS2_MAX_VOL_LABEL_LEN 64 -/* The alternate, userspace stack fields */ +/* The cluster stack fields */ #define OCFS2_STACK_LABEL_LEN 4 #define OCFS2_CLUSTER_NAME_LEN 16 +/* Classic (historically speaking) cluster stack */ +#define OCFS2_CLASSIC_CLUSTER_STACK "o2cb" + /* Journal limits (in bytes) */ #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) @@ -305,6 +316,11 @@ */ #define OCFS2_MIN_XATTR_INLINE_SIZE 256 +/* + * Cluster info flags (ocfs2_cluster_info.ci_stackflags) + */ +#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01) + struct ocfs2_system_inode_info { char *si_name; int si_iflags; @@ -566,9 +582,21 @@ struct ocfs2_slot_map_extended { */ }; +/* + * ci_stackflags is only valid if the incompat bit + * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set. + */ struct ocfs2_cluster_info { /*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; - __le32 ci_reserved; + union { + __le32 ci_reserved; + struct { + __u8 ci_stackflags; + __u8 ci_reserved1; + __u8 ci_reserved2; + __u8 ci_reserved3; + }; + }; /*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; /*18*/ }; @@ -605,9 +633,9 @@ struct ocfs2_super_block { * group header */ /*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ /*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ -/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace - stack. Only valid - with INCOMPAT flag. */ +/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either + userspace or clusterinfo + INCOMPAT flag set. */ /*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size for this fs*/ __le16 s_reserved0; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index fa1be1b304d..75543173939 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2149,7 +2149,9 @@ static int ocfs2_initialize_super(struct super_block *sb, goto bail; } - if (ocfs2_userspace_stack(osb)) { + if (ocfs2_clusterinfo_valid(osb)) { + osb->osb_stackflags = + OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; memcpy(osb->osb_cluster_stack, OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, OCFS2_STACK_LABEL_LEN); -- cgit v1.2.3 From 2c442719e90a44a6982c033d69df4aae4b167cfa Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 7 Oct 2010 15:23:50 -0700 Subject: ocfs2: Add support for heartbeat=global mount option Adds support for heartbeat=global mount option. It ensures that the heartbeat mode passed matches the one enabled on disk. Signed-off-by: Sunil Mushran --- fs/ocfs2/ocfs2.h | 4 +++- fs/ocfs2/ocfs2_fs.h | 1 + fs/ocfs2/super.c | 55 +++++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 45 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index d5496a792bd..481387b90b2 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -243,7 +243,7 @@ enum ocfs2_local_alloc_state enum ocfs2_mount_options { - OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */ + OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */ OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ @@ -256,6 +256,8 @@ enum ocfs2_mount_options control lists */ OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ + OCFS2_MOUNT_HB_NONE = 1 << 12, /* No heartbeat */ + OCFS2_MOUNT_HB_GLOBAL = 1 << 13, /* Global heartbeat */ }; #define OCFS2_OSB_SOFT_RO 0x0001 diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index d5b1d99abc3..28ff536b4f8 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -376,6 +376,7 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { /* Parameter passed from mount.ocfs2 to module */ #define OCFS2_HB_NONE "heartbeat=none" #define OCFS2_HB_LOCAL "heartbeat=local" +#define OCFS2_HB_GLOBAL "heartbeat=global" /* * OCFS2 directory file types. Only the low 3 bits are used. The diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 75543173939..4e009ad303a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -162,6 +162,7 @@ enum { Opt_nointr, Opt_hb_none, Opt_hb_local, + Opt_hb_global, Opt_data_ordered, Opt_data_writeback, Opt_atime_quantum, @@ -190,6 +191,7 @@ static const match_table_t tokens = { {Opt_nointr, "nointr"}, {Opt_hb_none, OCFS2_HB_NONE}, {Opt_hb_local, OCFS2_HB_LOCAL}, + {Opt_hb_global, OCFS2_HB_GLOBAL}, {Opt_data_ordered, "data=ordered"}, {Opt_data_writeback, "data=writeback"}, {Opt_atime_quantum, "atime_quantum=%u"}, @@ -608,6 +610,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) int ret = 0; struct mount_options parsed_options; struct ocfs2_super *osb = OCFS2_SB(sb); + u32 tmp; lock_kernel(); @@ -617,8 +620,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto out; } - if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) != - (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL)) { + tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | + OCFS2_MOUNT_HB_NONE; + if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { ret = -EINVAL; mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); goto out; @@ -809,23 +813,29 @@ bail: static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) { - if (ocfs2_mount_local(osb)) { - if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { + u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL; + + if (osb->s_mount_opt & hb_enabled) { + if (ocfs2_mount_local(osb)) { mlog(ML_ERROR, "Cannot heartbeat on a locally " "mounted device.\n"); return -EINVAL; } - } - - if (ocfs2_userspace_stack(osb)) { - if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { + if (ocfs2_userspace_stack(osb)) { mlog(ML_ERROR, "Userspace stack expected, but " "o2cb heartbeat arguments passed to mount\n"); return -EINVAL; } + if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) && + !ocfs2_cluster_o2cb_global_heartbeat(osb)) || + ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) && + ocfs2_cluster_o2cb_global_heartbeat(osb))) { + mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n"); + return -EINVAL; + } } - if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { + if (!(osb->s_mount_opt & hb_enabled)) { if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && !ocfs2_userspace_stack(osb)) { mlog(ML_ERROR, "Heartbeat has to be started to mount " @@ -1291,6 +1301,7 @@ static int ocfs2_parse_options(struct super_block *sb, { int status; char *p; + u32 tmp; mlog_entry("remount: %d, options: \"%s\"\n", is_remount, options ? options : "(none)"); @@ -1322,7 +1333,10 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; break; case Opt_hb_none: - mopt->mount_opt &= ~OCFS2_MOUNT_HB_LOCAL; + mopt->mount_opt |= OCFS2_MOUNT_HB_NONE; + break; + case Opt_hb_global: + mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL; break; case Opt_barrier: if (match_int(&args[0], &option)) { @@ -1477,6 +1491,15 @@ static int ocfs2_parse_options(struct super_block *sb, } } + /* Ensure only one heartbeat mode */ + tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | + OCFS2_MOUNT_HB_NONE); + if (hweight32(tmp) != 1) { + mlog(ML_ERROR, "Invalid heartbeat mount options\n"); + status = 0; + goto bail; + } + status = 1; bail: @@ -1490,10 +1513,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) unsigned long opts = osb->s_mount_opt; unsigned int local_alloc_megs; - if (opts & OCFS2_MOUNT_HB_LOCAL) - seq_printf(s, ",_netdev,heartbeat=local"); - else - seq_printf(s, ",heartbeat=none"); + if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) { + seq_printf(s, ",_netdev"); + if (opts & OCFS2_MOUNT_HB_LOCAL) + seq_printf(s, ",%s", OCFS2_HB_LOCAL); + else + seq_printf(s, ",%s", OCFS2_HB_GLOBAL); + } else + seq_printf(s, ",%s", OCFS2_HB_NONE); if (opts & OCFS2_MOUNT_NOINTR) seq_printf(s, ",nointr"); -- cgit v1.2.3 From b1365d0bd14b912cceb424cbeed9fe939a9038e3 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 17:55:34 -0700 Subject: ocfs2/dlm: Expose dlm_protocol in dlm_state Add dlm_protocol to the list of info shown by the debugfs file, dlm_state. Signed-off-by: Sunil Mushran --- fs/ocfs2/dlm/dlmdebug.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 901ca52bf86..f693ab812f3 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -782,7 +782,9 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ out += snprintf(db->buf + out, db->len - out, - "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key); + "Domain: %s Key: 0x%08x Protocol: %d.%d\n", + dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor); /* Thread Pid: xxx Node: xxx State: xxxxx */ out += snprintf(db->buf + out, db->len - out, -- cgit v1.2.3 From b3c85c4cdf77154acc940dd0f14d1fb99cbbaf75 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 7 Oct 2010 14:31:06 -0700 Subject: ocfs2/cluster: Get all heartbeat regions Export function in o2hb to get a list of heartbeat regions. It also adds an upper limit to the length of the heartbeat region name. o2hb_global_heartbeat_active() currently disables global heartbeat. It will be enabled in a later patch after all the code is added. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 34 ++++++++++++++++++++++++++++++++++ fs/ocfs2/cluster/heartbeat.h | 4 ++++ 2 files changed, 38 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 4d36459a834..3415e58ff77 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1623,6 +1623,9 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g if (reg == NULL) return ERR_PTR(-ENOMEM); + if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + config_item_init_type_name(®->hr_item, name, &o2hb_region_type); spin_lock(&o2hb_live_lock); @@ -2035,3 +2038,34 @@ void o2hb_stop_all_regions(void) spin_unlock(&o2hb_live_lock); } EXPORT_SYMBOL_GPL(o2hb_stop_all_regions); + +int o2hb_get_all_regions(char *region_uuids, u8 max_regions) +{ + struct o2hb_region *reg; + int numregs = 0; + char *p; + + spin_lock(&o2hb_live_lock); + + p = region_uuids; + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + mlog(0, "Region: %s\n", config_item_name(®->hr_item)); + if (numregs < max_regions) { + memcpy(p, config_item_name(®->hr_item), + O2HB_MAX_REGION_NAME_LEN); + p += O2HB_MAX_REGION_NAME_LEN; + } + numregs++; + } + + spin_unlock(&o2hb_live_lock); + + return numregs; +} +EXPORT_SYMBOL_GPL(o2hb_get_all_regions); + +int o2hb_global_heartbeat_active(void) +{ + return 0; +} +EXPORT_SYMBOL(o2hb_global_heartbeat_active); diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 2f1649253b4..00ad8e8fea5 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -31,6 +31,8 @@ #define O2HB_REGION_TIMEOUT_MS 2000 +#define O2HB_MAX_REGION_NAME_LEN 32 + /* number of changes to be seen as live */ #define O2HB_LIVE_THRESHOLD 2 /* number of equal samples to be seen as dead */ @@ -81,5 +83,7 @@ int o2hb_check_node_heartbeating(u8 node_num); int o2hb_check_node_heartbeating_from_callback(u8 node_num); int o2hb_check_local_node_heartbeating(void); void o2hb_stop_all_regions(void); +int o2hb_get_all_regions(char *region_uuids, u8 numregions); +int o2hb_global_heartbeat_active(void); #endif /* O2CLUSTER_HEARTBEAT_H */ -- cgit v1.2.3 From ea2034416b54700e30371f2ad6517cbb94674083 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Sat, 9 Oct 2010 10:26:23 -0700 Subject: ocfs2/dlm: Add message DLM_QUERY_REGION Adds new dlm message DLM_QUERY_REGION that sends the names of all active heartbeat regions. This message is only sent in the global heartbeat mode. If the regions in the joining node do not fully match the ones in the active nodes, the join domain request is rejected. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/ocfs2_nodemanager.h | 6 + fs/ocfs2/dlm/dlmcommon.h | 12 +- fs/ocfs2/dlm/dlmdomain.c | 218 +++++++++++++++++++++++++++++++++++ 3 files changed, 235 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h index 5b9854bad57..49b594325be 100644 --- a/fs/ocfs2/cluster/ocfs2_nodemanager.h +++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h @@ -36,4 +36,10 @@ /* host name, group name, cluster name all 64 bytes */ #define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN +/* + * Maximum number of global heartbeat regions allowed. + * **CAUTION** Changing this number will break dlm compatibility. + */ +#define O2NM_MAX_REGIONS 32 + #endif /* _OCFS2_NODEMANAGER_H */ diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 765298908f1..aa506d3e2ae 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -445,7 +445,8 @@ enum { DLM_LOCK_REQUEST_MSG, /* 515 */ DLM_RECO_DATA_DONE_MSG, /* 516 */ DLM_BEGIN_RECO_MSG, /* 517 */ - DLM_FINALIZE_RECO_MSG /* 518 */ + DLM_FINALIZE_RECO_MSG, /* 518 */ + DLM_QUERY_REGION, /* 519 */ }; struct dlm_reco_node_data @@ -727,6 +728,15 @@ struct dlm_cancel_join u8 domain[O2NM_MAX_NAME_LEN]; }; +struct dlm_query_region { + u8 qr_node; + u8 qr_numregions; + u8 qr_namelen; + u8 pad1; + u8 qr_domain[O2NM_MAX_NAME_LEN]; + u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS]; +}; + struct dlm_exit_domain { u8 node_idx; diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 11a5c87fd7f..49650756dfe 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -128,6 +128,9 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); * will have a negotiated version with the same major number and a minor * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should * be used to determine what a running domain is actually using. + * + * New in version 1.1: + * - Message DLM_QUERY_REGION added to support global heartbeat */ static const struct dlm_protocol_version dlm_protocol = { .pv_major = 1, @@ -142,6 +145,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data); static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data); +static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data); static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data); static int dlm_protocol_compare(struct dlm_protocol_version *existing, @@ -921,6 +926,203 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, return 0; } +static int dlm_match_regions(struct dlm_ctxt *dlm, + struct dlm_query_region *qr) +{ + char *local = NULL, *remote = qr->qr_regions; + char *l, *r; + int localnr, i, j, foundit; + int status = 0; + + if (!o2hb_global_heartbeat_active()) { + if (qr->qr_numregions) { + mlog(ML_ERROR, "Domain %s: Joining node %d has global " + "heartbeat enabled but local node %d does not\n", + qr->qr_domain, qr->qr_node, dlm->node_num); + status = -EINVAL; + } + goto bail; + } + + if (o2hb_global_heartbeat_active() && !qr->qr_numregions) { + mlog(ML_ERROR, "Domain %s: Local node %d has global " + "heartbeat enabled but joining node %d does not\n", + qr->qr_domain, dlm->node_num, qr->qr_node); + status = -EINVAL; + goto bail; + } + + r = remote; + for (i = 0; i < qr->qr_numregions; ++i) { + mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r); + r += O2HB_MAX_REGION_NAME_LEN; + } + + local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); + if (!local) { + status = -ENOMEM; + goto bail; + } + + localnr = o2hb_get_all_regions(local, O2NM_MAX_REGIONS); + + /* compare local regions with remote */ + l = local; + for (i = 0; i < localnr; ++i) { + foundit = 0; + r = remote; + for (j = 0; j <= qr->qr_numregions; ++j) { + if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) { + foundit = 1; + break; + } + r += O2HB_MAX_REGION_NAME_LEN; + } + if (!foundit) { + status = -EINVAL; + mlog(ML_ERROR, "Domain %s: Region '%.*s' registered " + "in local node %d but not in joining node %d\n", + qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l, + dlm->node_num, qr->qr_node); + goto bail; + } + l += O2HB_MAX_REGION_NAME_LEN; + } + + /* compare remote with local regions */ + r = remote; + for (i = 0; i < qr->qr_numregions; ++i) { + foundit = 0; + l = local; + for (j = 0; j < localnr; ++j) { + if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) { + foundit = 1; + break; + } + l += O2HB_MAX_REGION_NAME_LEN; + } + if (!foundit) { + status = -EINVAL; + mlog(ML_ERROR, "Domain %s: Region '%.*s' registered " + "in joining node %d but not in local node %d\n", + qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r, + qr->qr_node, dlm->node_num); + goto bail; + } + r += O2HB_MAX_REGION_NAME_LEN; + } + +bail: + kfree(local); + + return status; +} + +static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map) +{ + struct dlm_query_region *qr = NULL; + int status, ret = 0, i; + char *p; + + if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + goto bail; + + qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL); + if (!qr) { + ret = -ENOMEM; + mlog_errno(ret); + goto bail; + } + + qr->qr_node = dlm->node_num; + qr->qr_namelen = strlen(dlm->name); + memcpy(qr->qr_domain, dlm->name, qr->qr_namelen); + /* if local hb, the numregions will be zero */ + if (o2hb_global_heartbeat_active()) + qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions, + O2NM_MAX_REGIONS); + + p = qr->qr_regions; + for (i = 0; i < qr->qr_numregions; ++i, p += O2HB_MAX_REGION_NAME_LEN) + mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, p); + + i = -1; + while ((i = find_next_bit(node_map, O2NM_MAX_NODES, + i + 1)) < O2NM_MAX_NODES) { + if (i == dlm->node_num) + continue; + + mlog(0, "Sending regions to node %d\n", i); + + ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr, + sizeof(struct dlm_query_region), + i, &status); + if (ret >= 0) + ret = status; + if (ret) { + mlog(ML_ERROR, "Region mismatch %d, node %d\n", + ret, i); + break; + } + } + +bail: + kfree(qr); + return ret; +} + +static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data) +{ + struct dlm_query_region *qr; + struct dlm_ctxt *dlm = NULL; + int status = 0; + int locked = 0; + + qr = (struct dlm_query_region *) msg->buf; + + mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node, + qr->qr_domain); + + status = -EINVAL; + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen); + if (!dlm) { + mlog(ML_ERROR, "Node %d queried hb regions on domain %s " + "before join domain\n", qr->qr_node, qr->qr_domain); + goto bail; + } + + spin_lock(&dlm->spinlock); + locked = 1; + if (dlm->joining_node != qr->qr_node) { + mlog(ML_ERROR, "Node %d queried hb regions on domain %s " + "but joining node is %d\n", qr->qr_node, qr->qr_domain, + dlm->joining_node); + goto bail; + } + + /* Support for global heartbeat was added in 1.1 */ + if (dlm->dlm_locking_proto.pv_major == 1 && + dlm->dlm_locking_proto.pv_minor == 0) { + mlog(ML_ERROR, "Node %d queried hb regions on domain %s " + "but active dlm protocol is %d.%d\n", qr->qr_node, + qr->qr_domain, dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor); + goto bail; + } + + status = dlm_match_regions(dlm, qr); + +bail: + if (locked) + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + + return status; +} + static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data) { @@ -1241,6 +1443,15 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) set_bit(dlm->node_num, dlm->domain_map); spin_unlock(&dlm->spinlock); + /* Support for global heartbeat was added in 1.1 */ + if (dlm_protocol.pv_major > 1 || dlm_protocol.pv_minor > 0) { + status = dlm_send_regions(dlm, ctxt->yes_resp_map); + if (status) { + mlog_errno(status); + goto bail; + } + } + dlm_send_join_asserts(dlm, ctxt->yes_resp_map); /* Joined state *must* be set before the joining node @@ -1807,6 +2018,13 @@ static int dlm_register_net_handlers(void) sizeof(struct dlm_cancel_join), dlm_cancel_join_handler, NULL, NULL, &dlm_join_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY, + sizeof(struct dlm_query_region), + dlm_query_region_handler, + NULL, NULL, &dlm_join_handlers); bail: if (status < 0) -- cgit v1.2.3 From 5f3c6d9c615770708df464c170316f83cf437242 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 17:55:29 -0700 Subject: ocfs2: Print message if user mounts without starting global heartbeat In global heartbeat mode, the heartbeat is started by the user. This patch prints an error if the user attempts to mount a volume without starting the heartbeat. Signed-off-by: Sunil Mushran --- fs/ocfs2/stack_o2cb.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 0d3049f696c..19965b00c43 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -283,6 +283,8 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) /* for now we only have one cluster/node, make sure we see it * in the heartbeat universe */ if (!o2hb_check_local_node_heartbeating()) { + if (o2hb_global_heartbeat_active()) + mlog(ML_ERROR, "Global heartbeat not started\n"); rc = -EINVAL; goto out; } -- cgit v1.2.3 From 18cfdf1b1a8e83b09e4185c02396257ce7e7bee3 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 7 Oct 2010 16:47:03 -0700 Subject: ocfs2/dlm: Add message DLM_QUERY_NODEINFO Adds new dlm message DLM_QUERY_NODEINFO that sends the attributes of all registered nodes. This message is sent if the negotiated dlm protocol is 1.1 or higher. If the information of the joining node does not match that of any existing nodes, the join domain request is rejected. Signed-off-by: Sunil Mushran --- fs/ocfs2/dlm/dlmcommon.h | 17 +++++ fs/ocfs2/dlm/dlmdomain.c | 182 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 198 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index aa506d3e2ae..b36d0bf77a5 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -447,6 +447,7 @@ enum { DLM_BEGIN_RECO_MSG, /* 517 */ DLM_FINALIZE_RECO_MSG, /* 518 */ DLM_QUERY_REGION, /* 519 */ + DLM_QUERY_NODEINFO, /* 520 */ }; struct dlm_reco_node_data @@ -737,6 +738,22 @@ struct dlm_query_region { u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS]; }; +struct dlm_node_info { + u8 ni_nodenum; + u8 pad1; + u16 ni_ipv4_port; + u32 ni_ipv4_address; +}; + +struct dlm_query_nodeinfo { + u8 qn_nodenum; + u8 qn_numnodes; + u8 qn_namelen; + u8 pad1; + u8 qn_domain[O2NM_MAX_NAME_LEN]; + struct dlm_node_info qn_nodes[O2NM_MAX_NODES]; +}; + struct dlm_exit_domain { u8 node_idx; diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 49650756dfe..78d428f5e10 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -131,6 +131,7 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); * * New in version 1.1: * - Message DLM_QUERY_REGION added to support global heartbeat + * - Message DLM_QUERY_NODEINFO added to allow online node removes */ static const struct dlm_protocol_version dlm_protocol = { .pv_major = 1, @@ -1123,6 +1124,173 @@ bail: return status; } +static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn) +{ + struct o2nm_node *local; + struct dlm_node_info *remote; + int i, j; + int status = 0; + + for (j = 0; j < qn->qn_numnodes; ++j) + mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[j].ni_nodenum, + &(qn->qn_nodes[j].ni_ipv4_address), + ntohs(qn->qn_nodes[j].ni_ipv4_port)); + + for (i = 0; i < O2NM_MAX_NODES && !status; ++i) { + local = o2nm_get_node_by_num(i); + remote = NULL; + for (j = 0; j < qn->qn_numnodes; ++j) { + if (qn->qn_nodes[j].ni_nodenum == i) { + remote = &(qn->qn_nodes[j]); + break; + } + } + + if (!local && !remote) + continue; + + if ((local && !remote) || (!local && remote)) + status = -EINVAL; + + if (!status && + ((remote->ni_nodenum != local->nd_num) || + (remote->ni_ipv4_port != local->nd_ipv4_port) || + (remote->ni_ipv4_address != local->nd_ipv4_address))) + status = -EINVAL; + + if (status) { + if (remote && !local) + mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) " + "registered in joining node %d but not in " + "local node %d\n", qn->qn_domain, + remote->ni_nodenum, + &(remote->ni_ipv4_address), + ntohs(remote->ni_ipv4_port), + qn->qn_nodenum, dlm->node_num); + if (local && !remote) + mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) " + "registered in local node %d but not in " + "joining node %d\n", qn->qn_domain, + local->nd_num, &(local->nd_ipv4_address), + ntohs(local->nd_ipv4_port), + dlm->node_num, qn->qn_nodenum); + BUG_ON((!local && !remote)); + } + + if (local) + o2nm_node_put(local); + } + + return status; +} + +static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map) +{ + struct dlm_query_nodeinfo *qn = NULL; + struct o2nm_node *node; + int ret = 0, status, count, i; + + if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + goto bail; + + qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL); + if (!qn) { + ret = -ENOMEM; + mlog_errno(ret); + goto bail; + } + + for (i = 0, count = 0; i < O2NM_MAX_NODES; ++i) { + node = o2nm_get_node_by_num(i); + if (!node) + continue; + qn->qn_nodes[count].ni_nodenum = node->nd_num; + qn->qn_nodes[count].ni_ipv4_port = node->nd_ipv4_port; + qn->qn_nodes[count].ni_ipv4_address = node->nd_ipv4_address; + mlog(0, "Node %3d, %pI4:%u\n", node->nd_num, + &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port)); + ++count; + o2nm_node_put(node); + } + + qn->qn_nodenum = dlm->node_num; + qn->qn_numnodes = count; + qn->qn_namelen = strlen(dlm->name); + memcpy(qn->qn_domain, dlm->name, qn->qn_namelen); + + i = -1; + while ((i = find_next_bit(node_map, O2NM_MAX_NODES, + i + 1)) < O2NM_MAX_NODES) { + if (i == dlm->node_num) + continue; + + mlog(0, "Sending nodeinfo to node %d\n", i); + + ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY, + qn, sizeof(struct dlm_query_nodeinfo), + i, &status); + if (ret >= 0) + ret = status; + if (ret) { + mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, i); + break; + } + } + +bail: + kfree(qn); + return ret; +} + +static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data) +{ + struct dlm_query_nodeinfo *qn; + struct dlm_ctxt *dlm = NULL; + int locked = 0, status = -EINVAL; + + qn = (struct dlm_query_nodeinfo *) msg->buf; + + mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum, + qn->qn_domain); + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen); + if (!dlm) { + mlog(ML_ERROR, "Node %d queried nodes on domain %s before " + "join domain\n", qn->qn_nodenum, qn->qn_domain); + goto bail; + } + + spin_lock(&dlm->spinlock); + locked = 1; + if (dlm->joining_node != qn->qn_nodenum) { + mlog(ML_ERROR, "Node %d queried nodes on domain %s but " + "joining node is %d\n", qn->qn_nodenum, qn->qn_domain, + dlm->joining_node); + goto bail; + } + + /* Support for node query was added in 1.1 */ + if (dlm->dlm_locking_proto.pv_major == 1 && + dlm->dlm_locking_proto.pv_minor == 0) { + mlog(ML_ERROR, "Node %d queried nodes on domain %s " + "but active dlm protocol is %d.%d\n", qn->qn_nodenum, + qn->qn_domain, dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor); + goto bail; + } + + status = dlm_match_nodes(dlm, qn); + +bail: + if (locked) + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + + return status; +} + static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data) { @@ -1443,8 +1611,13 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) set_bit(dlm->node_num, dlm->domain_map); spin_unlock(&dlm->spinlock); - /* Support for global heartbeat was added in 1.1 */ + /* Support for global heartbeat and node info was added in 1.1 */ if (dlm_protocol.pv_major > 1 || dlm_protocol.pv_minor > 0) { + status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map); + if (status) { + mlog_errno(status); + goto bail; + } status = dlm_send_regions(dlm, ctxt->yes_resp_map); if (status) { mlog_errno(status); @@ -2026,6 +2199,13 @@ static int dlm_register_net_handlers(void) dlm_query_region_handler, NULL, NULL, &dlm_join_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY, + sizeof(struct dlm_query_nodeinfo), + dlm_query_nodeinfo_handler, + NULL, NULL, &dlm_join_handlers); bail: if (status < 0) dlm_unregister_net_handlers(); -- cgit v1.2.3 From 18c50cb0d3c293eabd6c2ef89c43f2a968e709ed Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 18:26:59 -0700 Subject: ocfs2/cluster: Print messages when adding/removing heartbeat regions Prints messages when the user adds or removes heartbeat regions in global heartbeat mode. These messages are useful when debugging cluster related issues. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 3415e58ff77..12bb12ba864 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1476,6 +1476,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, else ret = -EIO; + if (hb_task && o2hb_global_heartbeat_active()) + printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n", + config_item_name(®->hr_item)); + out: if (filp) fput(filp); @@ -1659,6 +1663,9 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, wake_up(&o2hb_steady_queue); } + if (o2hb_global_heartbeat_active()) + printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n", + config_item_name(®->hr_item)); config_item_put(item); } @@ -1745,7 +1752,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, ret = o2hb_global_hearbeat_mode_set(i); if (!ret) - printk(KERN_NOTICE "ocfs2: Heartbeat mode set to %s\n", + printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n", o2hb_heartbeat_mode_desc[i]); return count; } -- cgit v1.2.3 From 39a298563e0619b1b6e2e0974e58801de780621c Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 7 Oct 2010 17:30:17 -0700 Subject: ocfs2/cluster: Print messages when adding/removing nodes Prints messages when the user adds or removes nodes. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/masklog.h | 3 ++- fs/ocfs2/cluster/nodemanager.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index fd96e2a2fa5..ea2ed9f56c9 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -119,7 +119,8 @@ #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ #define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ -#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */ +#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */ +#define ML_CLUSTER 0x0000001000000000ULL /* cluster stack */ #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index ed0c9f367fe..bb240647ca5 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -711,6 +711,8 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group, config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); spin_lock_init(&node->nd_lock); + mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name); + return &node->nd_item; } @@ -744,6 +746,9 @@ static void o2nm_node_group_drop_item(struct config_group *group, } write_unlock(&cluster->cl_nodes_lock); + mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n", + config_item_name(&node->nd_item)); + config_item_put(item); } -- cgit v1.2.3 From 0e105d37c2adb19cb777aa6701a866f211764a30 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 7 Oct 2010 17:00:16 -0700 Subject: ocfs2/cluster: Check slots for unconfigured live nodes o2hb currently checks slots for configured nodes only. This patch makes it check the slots for the live nodes too to take care of a race in which a node is removed from the configuration but not from the live map. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 38 +++++++++++++++++++++++++++++++------- fs/ocfs2/cluster/tcp.c | 5 +++++ 2 files changed, 36 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 12bb12ba864..a8f10649674 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -541,6 +541,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event, { assert_spin_locked(&o2hb_live_lock); + BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB)); + event->hn_event_type = type; event->hn_node = node; event->hn_node_num = node_num; @@ -593,14 +595,22 @@ static int o2hb_check_slot(struct o2hb_region *reg, u64 cputime; unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; unsigned int slot_dead_ms; + int tmp; memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); - /* Is this correct? Do we assume that the node doesn't exist - * if we're not configured for him? */ + /* + * If a node is no longer configured but is still in the livemap, we + * may need to clear that bit from the livemap. + */ node = o2nm_get_node_by_num(slot->ds_node_num); - if (!node) - return 0; + if (!node) { + spin_lock(&o2hb_live_lock); + tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap); + spin_unlock(&o2hb_live_lock); + if (!tmp) + return 0; + } if (!o2hb_verify_crc(reg, hb_block)) { /* all paths from here will drop o2hb_live_lock for @@ -717,8 +727,9 @@ fire_callbacks: if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); - o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, - slot->ds_node_num); + /* node can be null */ + o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, + node, slot->ds_node_num); changed = 1; } @@ -738,7 +749,8 @@ out: o2hb_run_event_list(&event); - o2nm_node_put(node); + if (node) + o2nm_node_put(node); return changed; } @@ -765,6 +777,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { int i, ret, highest_node, change = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct o2hb_bio_wait_ctxt write_wc; ret = o2nm_configured_node_map(configured_nodes, @@ -774,6 +787,17 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) return ret; } + /* + * If a node is not configured but is in the livemap, we still need + * to read the slot so as to be able to remove it from the livemap. + */ + o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + i = -1; + while ((i = find_next_bit(live_node_bitmap, + O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { + set_bit(i, configured_nodes); + } + highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); if (highest_node >= O2NM_MAX_NODES) { mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index cbe2f057cc2..9aa426e4212 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1696,6 +1696,9 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num, { o2quo_hb_down(node_num); + if (!node) + return; + if (node_num != o2nm_this_node()) o2net_disconnect_node(node); @@ -1709,6 +1712,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, o2quo_hb_up(node_num); + BUG_ON(!node); + /* ensure an immediate connect attempt */ nn->nn_last_connect_attempt = jiffies - (msecs_to_jiffies(o2net_reconnect_delay()) + 1); -- cgit v1.2.3 From 8ca8b0bbd841b6bcd8ac05e51b0143aa61cfeff3 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 7 Oct 2010 17:01:27 -0700 Subject: ocfs2/cluster: Reorganize o2hb debugfs init o2hb debugfs handling is reorganized to allow for easy expansion. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 101 +++++++++++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index a8f10649674..16e49765c85 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -62,8 +62,19 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; static LIST_HEAD(o2hb_node_events); static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); +#define O2HB_DB_TYPE_LIVENODES 0 +struct o2hb_debug_buf { + int db_type; + int db_size; + int db_len; + void *db_data; +}; + +static struct o2hb_debug_buf *o2hb_db_livenodes; + #define O2HB_DEBUG_DIR "o2hb" #define O2HB_DEBUG_LIVENODES "livenodes" + static struct dentry *o2hb_debug_dir; static struct dentry *o2hb_debug_livenodes; @@ -969,21 +980,35 @@ static int o2hb_thread(void *data) #ifdef CONFIG_DEBUG_FS static int o2hb_debug_open(struct inode *inode, struct file *file) { + struct o2hb_debug_buf *db = inode->i_private; unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; char *buf = NULL; int i = -1; int out = 0; + /* max_nodes should be the largest bitmap we pass here */ + BUG_ON(sizeof(map) < db->db_size); + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) goto bail; - o2hb_fill_node_map(map, sizeof(map)); + switch (db->db_type) { + case O2HB_DB_TYPE_LIVENODES: + spin_lock(&o2hb_live_lock); + memcpy(map, db->db_data, db->db_size); + spin_unlock(&o2hb_live_lock); + break; + + default: + goto done; + } - while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) + while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len) out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); out += snprintf(buf + out, PAGE_SIZE - out, "\n"); +done: i_size_write(inode, out); file->private_data = buf; @@ -1030,10 +1055,56 @@ static const struct file_operations o2hb_debug_fops = { void o2hb_exit(void) { - if (o2hb_debug_livenodes) - debugfs_remove(o2hb_debug_livenodes); - if (o2hb_debug_dir) - debugfs_remove(o2hb_debug_dir); + kfree(o2hb_db_livenodes); + debugfs_remove(o2hb_debug_livenodes); + debugfs_remove(o2hb_debug_dir); +} + +static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, + struct o2hb_debug_buf **db, int db_len, + int type, int size, int len, void *data) +{ + *db = kmalloc(db_len, GFP_KERNEL); + if (!*db) + return NULL; + + (*db)->db_type = type; + (*db)->db_size = size; + (*db)->db_len = len; + (*db)->db_data = data; + + return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, + &o2hb_debug_fops); +} + +static int o2hb_debug_init(void) +{ + int ret = -ENOMEM; + + o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); + if (!o2hb_debug_dir) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES, + o2hb_debug_dir, + &o2hb_db_livenodes, + sizeof(*o2hb_db_livenodes), + O2HB_DB_TYPE_LIVENODES, + sizeof(o2hb_live_node_bitmap), + O2NM_MAX_NODES, + o2hb_live_node_bitmap); + if (!o2hb_debug_livenodes) { + mlog_errno(ret); + goto bail; + } + ret = 0; +bail: + if (ret) + o2hb_exit(); + + return ret; } int o2hb_init(void) @@ -1050,23 +1121,7 @@ int o2hb_init(void) memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); - o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); - if (!o2hb_debug_dir) { - mlog_errno(-ENOMEM); - return -ENOMEM; - } - - o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES, - S_IFREG|S_IRUSR, - o2hb_debug_dir, NULL, - &o2hb_debug_fops); - if (!o2hb_debug_livenodes) { - mlog_errno(-ENOMEM); - debugfs_remove(o2hb_debug_dir); - return -ENOMEM; - } - - return 0; + return o2hb_debug_init(); } /* if we're already in a callback then we're already serialized by the sem */ -- cgit v1.2.3 From 823a637ae933fde8fdb280612dd3ff9912e301e3 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 17:55:21 -0700 Subject: ocfs2/cluster: Maintain live node bitmap per heartbeat region Currently we track a global livenode bitmap that keeps track of all nodes that are heartbeating in all regions. This patch adds the ability to track the livenode bitmap on a per region basis. We will use this facility in a later patch to allow us to withstand the loss of a minority number of regions. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 16e49765c85..188f50269b8 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -174,6 +174,9 @@ struct o2hb_region { struct block_device *hr_bdev; struct o2hb_disk_slot *hr_slots; + /* live node map of this region */ + unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + /* let the person setting up hb wait for it to return until it * has reached a 'steady' state. This will be fixed when we have * a more complete api that doesn't lead to this sort of fragility. */ @@ -688,6 +691,8 @@ fire_callbacks: mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n", slot->ds_node_num, (long long)slot->ds_last_generation); + set_bit(slot->ds_node_num, reg->hr_live_node_bitmap); + /* first on the list generates a callback */ if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { set_bit(slot->ds_node_num, o2hb_live_node_bitmap); @@ -733,6 +738,8 @@ fire_callbacks: mlog(ML_HEARTBEAT, "Node %d left my region\n", slot->ds_node_num); + clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap); + /* last off the live_slot generates a callback */ list_del_init(&slot->ds_live_item); if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { -- cgit v1.2.3 From 536f0741f324f116d8b059295999945a2dac56bc Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 7 Oct 2010 17:03:07 -0700 Subject: ocfs2/cluster: Track number of global heartbeat regions In global heartbeat mode, we have a upper limit for the number of active regions. This patch adds the facility to track the number of active global heartbeat regions and fails to start heartbeat if the number exceeds the maximum. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 188f50269b8..d66b17c000d 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -62,6 +62,12 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; static LIST_HEAD(o2hb_node_events); static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); +/* + * In global heartbeat, we maintain a series of region bitmaps. + * - o2hb_region_bitmap allows us to limit the region number to max region. + */ +static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; + #define O2HB_DB_TYPE_LIVENODES 0 struct o2hb_debug_buf { int db_type; @@ -176,6 +182,7 @@ struct o2hb_region { /* live node map of this region */ unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned int hr_region_num; /* let the person setting up hb wait for it to return until it * has reached a 'steady' state. This will be fixed when we have @@ -1127,6 +1134,7 @@ int o2hb_init(void) INIT_LIST_HEAD(&o2hb_node_events); memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); + memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); return o2hb_debug_init(); } @@ -1716,12 +1724,22 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - config_item_init_type_name(®->hr_item, name, &o2hb_region_type); - spin_lock(&o2hb_live_lock); + reg->hr_region_num = 0; + if (o2hb_global_heartbeat_active()) { + reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap, + O2NM_MAX_REGIONS); + if (reg->hr_region_num >= O2NM_MAX_REGIONS) { + spin_unlock(&o2hb_live_lock); + return ERR_PTR(-EFBIG); + } + set_bit(reg->hr_region_num, o2hb_region_bitmap); + } list_add_tail(®->hr_all_item, &o2hb_all_regions); spin_unlock(&o2hb_live_lock); + config_item_init_type_name(®->hr_item, name, &o2hb_region_type); + return ®->hr_item; } @@ -1733,6 +1751,8 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, /* stop the thread when the user removes the region dir */ spin_lock(&o2hb_live_lock); + if (o2hb_global_heartbeat_active()) + clear_bit(reg->hr_region_num, o2hb_region_bitmap); hb_task = reg->hr_task; reg->hr_task = NULL; spin_unlock(&o2hb_live_lock); -- cgit v1.2.3 From e7d656baf6607a0775f4ca85464a4ead306741e5 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 17:55:18 -0700 Subject: ocfs2/cluster: Track bitmap of live heartbeat regions A heartbeat region becomes live (or active) after a fixed number of (steady) iterations. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index d66b17c000d..2a7cd17e96f 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -65,8 +65,10 @@ static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); /* * In global heartbeat, we maintain a series of region bitmaps. * - o2hb_region_bitmap allows us to limit the region number to max region. + * - o2hb_live_region_bitmap tracks live regions (seen steady iterations). */ static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; #define O2HB_DB_TYPE_LIVENODES 0 struct o2hb_debug_buf { @@ -1135,6 +1137,7 @@ int o2hb_init(void) memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); + memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); return o2hb_debug_init(); } @@ -1563,6 +1566,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, /* Ok, we were woken. Make sure it wasn't by drop_item() */ spin_lock(&o2hb_live_lock); hb_task = reg->hr_task; + if (o2hb_global_heartbeat_active()) + set_bit(reg->hr_region_num, o2hb_live_region_bitmap); spin_unlock(&o2hb_live_lock); if (hb_task) @@ -1751,8 +1756,10 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, /* stop the thread when the user removes the region dir */ spin_lock(&o2hb_live_lock); - if (o2hb_global_heartbeat_active()) + if (o2hb_global_heartbeat_active()) { clear_bit(reg->hr_region_num, o2hb_region_bitmap); + clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); + } hb_task = reg->hr_task; reg->hr_task = NULL; spin_unlock(&o2hb_live_lock); -- cgit v1.2.3 From 43182d2a799865872041b6e4d8387131e9462f56 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 17:55:16 -0700 Subject: ocfs2/cluster: Maintain bitmap of quorum regions o2hb allows online adding of regions. However, a newly added region is not used in quorum calculations unless it has been added on all nodes. This patch tracks a bitmap of such quorum regions. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 2a7cd17e96f..62a8af27134 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -66,9 +66,12 @@ static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); * In global heartbeat, we maintain a series of region bitmaps. * - o2hb_region_bitmap allows us to limit the region number to max region. * - o2hb_live_region_bitmap tracks live regions (seen steady iterations). + * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes + * heartbeat on it. */ static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; #define O2HB_DB_TYPE_LIVENODES 0 struct o2hb_debug_buf { @@ -607,6 +610,35 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) o2nm_node_put(node); } +static void o2hb_set_quorum_device(struct o2hb_region *reg, + struct o2hb_disk_slot *slot) +{ + assert_spin_locked(&o2hb_live_lock); + + if (!o2hb_global_heartbeat_active()) + return; + + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + return; + + /* + * A region can be added to the quorum only when it sees all + * live nodes heartbeat on it. In other words, the region has been + * added to all nodes. + */ + if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, + sizeof(o2hb_live_node_bitmap))) + return; + + if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD) + return; + + printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n", + config_item_name(®->hr_item)); + + set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); +} + static int o2hb_check_slot(struct o2hb_region *reg, struct o2hb_disk_slot *slot) { @@ -772,6 +804,8 @@ fire_callbacks: slot->ds_equal_samples = 0; } out: + o2hb_set_quorum_device(reg, slot); + spin_unlock(&o2hb_live_lock); o2hb_run_event_list(&event); @@ -1138,6 +1172,7 @@ int o2hb_init(void) memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); + memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); return o2hb_debug_init(); } -- cgit v1.2.3 From b1c5ebfbe398b3360614a4788c02061cd153e60a Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 7 Oct 2010 17:05:52 -0700 Subject: ocfs2/cluster: Maintain bitmap of failed regions In global heartbeat mode, we track the bitmap of regions that have seen heartbeat timeouts. We fence if the number of such regions is greater than or equal to half the number of quorum regions. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 62a8af27134..f890656127f 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -68,10 +68,12 @@ static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); * - o2hb_live_region_bitmap tracks live regions (seen steady iterations). * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes * heartbeat on it. + * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts. */ static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; #define O2HB_DB_TYPE_LIVENODES 0 struct o2hb_debug_buf { @@ -217,8 +219,19 @@ struct o2hb_bio_wait_ctxt { int wc_error; }; +static int o2hb_pop_count(void *map, int count) +{ + int i = -1, pop = 0; + + while ((i = find_next_bit(map, count, i + 1)) < count) + pop++; + return pop; +} + static void o2hb_write_timeout(struct work_struct *work) { + int failed, quorum; + unsigned long flags; struct o2hb_region *reg = container_of(work, struct o2hb_region, hr_write_timeout_work.work); @@ -226,6 +239,28 @@ static void o2hb_write_timeout(struct work_struct *work) mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " "milliseconds\n", reg->hr_dev_name, jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); + + if (o2hb_global_heartbeat_active()) { + spin_lock_irqsave(&o2hb_live_lock, flags); + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); + failed = o2hb_pop_count(&o2hb_failed_region_bitmap, + O2NM_MAX_REGIONS); + quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS); + spin_unlock_irqrestore(&o2hb_live_lock, flags); + + mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n", + quorum, failed); + + /* + * Fence if the number of failed regions >= half the number + * of quorum regions + */ + if ((failed << 1) < quorum) + return; + } + o2quo_disk_timeout(); } @@ -234,6 +269,11 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg) mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS); + if (o2hb_global_heartbeat_active()) { + spin_lock(&o2hb_live_lock); + clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap); + spin_unlock(&o2hb_live_lock); + } cancel_delayed_work(®->hr_write_timeout_work); reg->hr_last_timeout_start = jiffies; schedule_delayed_work(®->hr_write_timeout_work, @@ -1173,6 +1213,7 @@ int o2hb_init(void) memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); + memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); return o2hb_debug_init(); } -- cgit v1.2.3 From a6de013654b4839c8609e26241ebd9eb1ecc52e6 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 17:55:13 -0700 Subject: ocfs2/cluster: Create debugfs files for live, quorum and failed region bitmaps This patch prints the bitmaps of live, quorum and failed regions. This information will be useful in debugging cluster issues. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 63 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f890656127f..b06b9e52fba 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -76,6 +76,9 @@ static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; #define O2HB_DB_TYPE_LIVENODES 0 +#define O2HB_DB_TYPE_LIVEREGIONS 1 +#define O2HB_DB_TYPE_QUORUMREGIONS 2 +#define O2HB_DB_TYPE_FAILEDREGIONS 3 struct o2hb_debug_buf { int db_type; int db_size; @@ -84,12 +87,21 @@ struct o2hb_debug_buf { }; static struct o2hb_debug_buf *o2hb_db_livenodes; +static struct o2hb_debug_buf *o2hb_db_liveregions; +static struct o2hb_debug_buf *o2hb_db_quorumregions; +static struct o2hb_debug_buf *o2hb_db_failedregions; #define O2HB_DEBUG_DIR "o2hb" #define O2HB_DEBUG_LIVENODES "livenodes" +#define O2HB_DEBUG_LIVEREGIONS "live_regions" +#define O2HB_DEBUG_QUORUMREGIONS "quorum_regions" +#define O2HB_DEBUG_FAILEDREGIONS "failed_regions" static struct dentry *o2hb_debug_dir; static struct dentry *o2hb_debug_livenodes; +static struct dentry *o2hb_debug_liveregions; +static struct dentry *o2hb_debug_quorumregions; +static struct dentry *o2hb_debug_failedregions; static LIST_HEAD(o2hb_all_regions); @@ -1085,6 +1097,9 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) switch (db->db_type) { case O2HB_DB_TYPE_LIVENODES: + case O2HB_DB_TYPE_LIVEREGIONS: + case O2HB_DB_TYPE_QUORUMREGIONS: + case O2HB_DB_TYPE_FAILEDREGIONS: spin_lock(&o2hb_live_lock); memcpy(map, db->db_data, db->db_size); spin_unlock(&o2hb_live_lock); @@ -1146,6 +1161,12 @@ static const struct file_operations o2hb_debug_fops = { void o2hb_exit(void) { kfree(o2hb_db_livenodes); + kfree(o2hb_db_liveregions); + kfree(o2hb_db_quorumregions); + kfree(o2hb_db_failedregions); + debugfs_remove(o2hb_debug_failedregions); + debugfs_remove(o2hb_debug_quorumregions); + debugfs_remove(o2hb_debug_liveregions); debugfs_remove(o2hb_debug_livenodes); debugfs_remove(o2hb_debug_dir); } @@ -1189,6 +1210,48 @@ static int o2hb_debug_init(void) mlog_errno(ret); goto bail; } + + o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, + o2hb_debug_dir, + &o2hb_db_liveregions, + sizeof(*o2hb_db_liveregions), + O2HB_DB_TYPE_LIVEREGIONS, + sizeof(o2hb_live_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_live_region_bitmap); + if (!o2hb_debug_liveregions) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_quorumregions = + o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, + o2hb_debug_dir, + &o2hb_db_quorumregions, + sizeof(*o2hb_db_quorumregions), + O2HB_DB_TYPE_QUORUMREGIONS, + sizeof(o2hb_quorum_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_quorum_region_bitmap); + if (!o2hb_debug_quorumregions) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_failedregions = + o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, + o2hb_debug_dir, + &o2hb_db_failedregions, + sizeof(*o2hb_db_failedregions), + O2HB_DB_TYPE_FAILEDREGIONS, + sizeof(o2hb_failed_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_failed_region_bitmap); + if (!o2hb_debug_failedregions) { + mlog_errno(ret); + goto bail; + } + ret = 0; bail: if (ret) -- cgit v1.2.3 From 1f28530537f106f83e5cf7ef0193075667b6d520 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 17:55:12 -0700 Subject: ocfs2/cluster: Create debugfs dir/files for each region This patch creates debugfs directory for each o2hb region and creates files to expose the region number and the per region live node bitmap. This information will be useful in debugging cluster issues. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 77 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index b06b9e52fba..f28de4b09c6 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -79,6 +79,8 @@ static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; #define O2HB_DB_TYPE_LIVEREGIONS 1 #define O2HB_DB_TYPE_QUORUMREGIONS 2 #define O2HB_DB_TYPE_FAILEDREGIONS 3 +#define O2HB_DB_TYPE_REGION_LIVENODES 4 +#define O2HB_DB_TYPE_REGION_NUMBER 5 struct o2hb_debug_buf { int db_type; int db_size; @@ -96,6 +98,7 @@ static struct o2hb_debug_buf *o2hb_db_failedregions; #define O2HB_DEBUG_LIVEREGIONS "live_regions" #define O2HB_DEBUG_QUORUMREGIONS "quorum_regions" #define O2HB_DEBUG_FAILEDREGIONS "failed_regions" +#define O2HB_DEBUG_REGION_NUMBER "num" static struct dentry *o2hb_debug_dir; static struct dentry *o2hb_debug_livenodes; @@ -203,6 +206,12 @@ struct o2hb_region { unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned int hr_region_num; + struct dentry *hr_debug_dir; + struct dentry *hr_debug_livenodes; + struct dentry *hr_debug_regnum; + struct o2hb_debug_buf *hr_db_livenodes; + struct o2hb_debug_buf *hr_db_regnum; + /* let the person setting up hb wait for it to return until it * has reached a 'steady' state. This will be fixed when we have * a more complete api that doesn't lead to this sort of fragility. */ @@ -1083,6 +1092,7 @@ static int o2hb_thread(void *data) static int o2hb_debug_open(struct inode *inode, struct file *file) { struct o2hb_debug_buf *db = inode->i_private; + struct o2hb_region *reg; unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; char *buf = NULL; int i = -1; @@ -1105,6 +1115,19 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) spin_unlock(&o2hb_live_lock); break; + case O2HB_DB_TYPE_REGION_LIVENODES: + spin_lock(&o2hb_live_lock); + reg = (struct o2hb_region *)db->db_data; + memcpy(map, reg->hr_live_node_bitmap, db->db_size); + spin_unlock(&o2hb_live_lock); + break; + + case O2HB_DB_TYPE_REGION_NUMBER: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%d\n", + reg->hr_region_num); + goto done; + default: goto done; } @@ -1342,6 +1365,12 @@ static void o2hb_region_release(struct config_item *item) if (reg->hr_slots) kfree(reg->hr_slots); + kfree(reg->hr_db_regnum); + kfree(reg->hr_db_livenodes); + debugfs_remove(reg->hr_debug_livenodes); + debugfs_remove(reg->hr_debug_regnum); + debugfs_remove(reg->hr_debug_dir); + spin_lock(&o2hb_live_lock); list_del(®->hr_all_item); spin_unlock(&o2hb_live_lock); @@ -1856,10 +1885,52 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group : NULL; } +static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) +{ + int ret = -ENOMEM; + + reg->hr_debug_dir = + debugfs_create_dir(config_item_name(®->hr_item), dir); + if (!reg->hr_debug_dir) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_livenodes = + o2hb_debug_create(O2HB_DEBUG_LIVENODES, + reg->hr_debug_dir, + &(reg->hr_db_livenodes), + sizeof(*(reg->hr_db_livenodes)), + O2HB_DB_TYPE_REGION_LIVENODES, + sizeof(reg->hr_live_node_bitmap), + O2NM_MAX_NODES, reg); + if (!reg->hr_debug_livenodes) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_regnum = + o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, + reg->hr_debug_dir, + &(reg->hr_db_regnum), + sizeof(*(reg->hr_db_regnum)), + O2HB_DB_TYPE_REGION_NUMBER, + 0, O2NM_MAX_NODES, reg); + if (!reg->hr_debug_regnum) { + mlog_errno(ret); + goto bail; + } + + ret = 0; +bail: + return ret; +} + static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, const char *name) { struct o2hb_region *reg = NULL; + int ret; reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); if (reg == NULL) @@ -1884,6 +1955,12 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g config_item_init_type_name(®->hr_item, name, &o2hb_region_type); + ret = o2hb_debug_region_init(reg, o2hb_debug_dir); + if (ret) { + config_item_put(®->hr_item); + return ERR_PTR(ret); + } + return ®->hr_item; } -- cgit v1.2.3 From d6aa1c7c9e4b48081c2302e14b0f857017461efd Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 18:50:50 -0700 Subject: ocfs2/cluster: Add mlogs for heartbeat up/down events This patch adds mlogs for o2hb up and down events. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f28de4b09c6..e8676accf90 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -797,6 +797,8 @@ fire_callbacks: /* first on the list generates a callback */ if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes " + "bitmap\n", slot->ds_node_num); set_bit(slot->ds_node_num, o2hb_live_node_bitmap); o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, @@ -845,6 +847,8 @@ fire_callbacks: /* last off the live_slot generates a callback */ list_del_init(&slot->ds_live_item); if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live " + "nodes bitmap\n", slot->ds_node_num); clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); /* node can be null */ -- cgit v1.2.3 From 43695d095dfaf266a8a940d9b07eed7f46076b49 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 17:55:09 -0700 Subject: ocfs2/cluster: Show per region heartbeat elapsed time This patch adds a per region debugfs file that shows the elapsed time since the time the o2hb timer was last armed. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index e8676accf90..29aee2128ed 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -81,6 +81,7 @@ static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; #define O2HB_DB_TYPE_FAILEDREGIONS 3 #define O2HB_DB_TYPE_REGION_LIVENODES 4 #define O2HB_DB_TYPE_REGION_NUMBER 5 +#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6 struct o2hb_debug_buf { int db_type; int db_size; @@ -99,6 +100,7 @@ static struct o2hb_debug_buf *o2hb_db_failedregions; #define O2HB_DEBUG_QUORUMREGIONS "quorum_regions" #define O2HB_DEBUG_FAILEDREGIONS "failed_regions" #define O2HB_DEBUG_REGION_NUMBER "num" +#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms" static struct dentry *o2hb_debug_dir; static struct dentry *o2hb_debug_livenodes; @@ -209,8 +211,10 @@ struct o2hb_region { struct dentry *hr_debug_dir; struct dentry *hr_debug_livenodes; struct dentry *hr_debug_regnum; + struct dentry *hr_debug_elapsed_time; struct o2hb_debug_buf *hr_db_livenodes; struct o2hb_debug_buf *hr_db_regnum; + struct o2hb_debug_buf *hr_db_elapsed_time; /* let the person setting up hb wait for it to return until it * has reached a 'steady' state. This will be fixed when we have @@ -1132,6 +1136,13 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) reg->hr_region_num); goto done; + case O2HB_DB_TYPE_REGION_ELAPSED_TIME: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", + jiffies_to_msecs(jiffies - + reg->hr_last_timeout_start)); + goto done; + default: goto done; } @@ -1925,6 +1936,18 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) goto bail; } + reg->hr_debug_elapsed_time = + o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, + reg->hr_debug_dir, + &(reg->hr_db_elapsed_time), + sizeof(*(reg->hr_db_elapsed_time)), + O2HB_DB_TYPE_REGION_ELAPSED_TIME, + 0, 0, reg); + if (!reg->hr_debug_elapsed_time) { + mlog_errno(ret); + goto bail; + } + ret = 0; bail: return ret; -- cgit v1.2.3 From 4d94aa1b1d437f9513ddc89974d8bd214b8304f6 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Sat, 9 Oct 2010 10:27:04 -0700 Subject: ocfs2/cluster: Bump up dlm protocol to version 1.1 dlm protocol 1.1. activates messages DLM_QUERY_REGION and DLM_QUERY_NODEINFO that are a must for global heartbeat. It also activates o2hb_global_heartbeat_active(). Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/ocfs2/dlm/dlmdomain.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 29aee2128ed..6a1280a013e 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -2429,6 +2429,6 @@ EXPORT_SYMBOL_GPL(o2hb_get_all_regions); int o2hb_global_heartbeat_active(void) { - return 0; + return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL); } EXPORT_SYMBOL(o2hb_global_heartbeat_active); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 78d428f5e10..58a93b95373 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -135,7 +135,7 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); */ static const struct dlm_protocol_version dlm_protocol = { .pv_major = 1, - .pv_minor = 0, + .pv_minor = 1, }; #define DLM_DOMAIN_BACKOFF_MS 200 -- cgit v1.2.3 From d4396eafe402b710a8535137b3bf2abe6c059a15 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Fri, 15 Oct 2010 11:57:21 -0700 Subject: ocfs2/cluster: Release debugfs file elapsed_time_in_ms An earlier commit forgot to remove a debugfs file, elapsed_time_in_ms. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 6a1280a013e..52c7557f3e2 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1384,6 +1384,7 @@ static void o2hb_region_release(struct config_item *item) kfree(reg->hr_db_livenodes); debugfs_remove(reg->hr_debug_livenodes); debugfs_remove(reg->hr_debug_regnum); + debugfs_remove(reg->hr_debug_elapsed_time); debugfs_remove(reg->hr_debug_dir); spin_lock(&o2hb_live_lock); -- cgit v1.2.3