Patchwork [BUG:924,v2] nfs: Introduce trusted-write and trusted-sync options

login
register
Submitter Shehjar Tikoo
Date 2010-05-18 07:07:42
Message ID <1274166462-9210-1-git-send-email-shehjart@gluster.com>
Download mbox | patch
Permalink /patch/3286/
State Accepted
Delegated to: Anand Avati
Headers show

Comments

Shehjar Tikoo - 2010-05-18 07:07:42
From: Shehjar Tikoo <shehjart@gluster.com>

Introduces two new options:

1. nfs3.*.trusted-write: Replies to UNSTABLE writes with a STABLE flag
so that NFS clients do not follow up with a COMMIT. STABLE writes are
still handled in a sync manner, and so are any COMMITs that clients do
send.

2. nfs3.*.trusted-sync: Replies to all WRITEs and COMMITs with STABLE
flags, avoiding the overhead of STABLE writes and of the COMMITs that
follow UNSTABLE writes. This includes the trusted-write functionality.
In addition, STABLE writes are performed in an UNSTABLE manner, i.e.
the reply is sent without syncing the data first.

Both options violate the NFS protocol but allow better write performance
in most configurations. Use with caution.

Signed-off-by: Shehjar Tikoo <shehjart@gluster.com>
---
 xlators/nfs/server/src/nfs.c  |   23 ++++++
 xlators/nfs/server/src/nfs3.c |  164 +++++++++++++++++++++++++++++++++++++++--
 xlators/nfs/server/src/nfs3.h |    2 +
 3 files changed, 183 insertions(+), 6 deletions(-)

Patch

diff --git a/xlators/nfs/server/src/nfs.c b/xlators/nfs/server/src/nfs.c
index 14d82fc..3b69ea1 100644
--- a/xlators/nfs/server/src/nfs.c
+++ b/xlators/nfs/server/src/nfs.c
@@ -583,6 +583,29 @@  struct volume_options options[] = {
           .description = "Type of access desired for this subvolume: "
                          " read-only, read-write(default)"
         },
+        { .key  = {"nfs3.*.trusted-write"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .description = "On an UNSTABLE write from client, return STABLE flag"
+                         " to force client to not send a COMMIT request. In "
+                         "some environments, combined with a replicated "
+                         "GlusterFS setup, this option can improve write "
+                         "performance. This flag allows user to trust Gluster"
+                         " replication logic to sync data to the disks and "
+                         "recover when required. COMMIT requests if received "
+                         "will be handled in a default manner by fsyncing."
+                         " STABLE writes are still handled in a sync manner. "
+                         "Off by default."
+
+        },
+        { .key  = {"nfs3.*.trusted-sync"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .description = "All writes and COMMIT requests are treated as async."
+                         " This implies that no write requests are guaranteed"
+                         " to be on server disks when the write reply is "
+                         "received at the NFS client. Trusted sync includes "
+                         " trusted-write behaviour. Off by default."
+
+        },
         { .key  = {"rpc-auth.auth-unix"},
           .type = GF_OPTION_TYPE_BOOL,
           .description = "Disable or enable the AUTH_UNIX authentication type."
diff --git a/xlators/nfs/server/src/nfs3.c b/xlators/nfs/server/src/nfs3.c
index 883dab0..8232cec 100644
--- a/xlators/nfs/server/src/nfs3.c
+++ b/xlators/nfs/server/src/nfs3.c
@@ -140,6 +140,9 @@ 
         } while (0)                                                     \
 
 
+#define nfs3_export_sync_trusted(nf3stt, xlid) ((nf3stt)->exports[xlid]).trusted_sync
+#define nfs3_export_write_trusted(nf3stt, xlid) ((nf3stt)->exports[xlid]).trusted_write
+
 int
 nfs3_solaris_zerolen_fh (struct nfs3_fh *fh, int fhlen)
 {
@@ -1593,6 +1596,73 @@  nfs3svc_write_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
 }
 
 
+/*
+ * Returns -1 when the reply should be sent to the client right after this
+ * function, without syncing. On that path *writetype is changed only for an
+ * UNSTABLE write with trusted-write set, where it becomes FILE_SYNC so that
+ * the client is told the data is stable.
+ * Returns 0 when the write should continue to be served according to the
+ * requested stable write type; *writetype is then left as it is.
+ */
+int
+nfs3_write_how (int *writetype, int write_trusted, int sync_trusted)
+{
+        int     ret = -1;
+
+        if (*writetype == UNSTABLE) {
+                /* On an UNSTABLE write, only return STABLE when trusted-write
+                 * is set. TW is also set when trusted-sync is set.
+                 */
+                if (write_trusted)
+                        *writetype = FILE_SYNC;
+
+                goto err;
+        } else if ((*writetype == DATA_SYNC) || (*writetype == FILE_SYNC)) {
+
+                /* On a STABLE write, if sync-trusted is on, only then, return
+                 * without syncing.
+                 */
+                if (sync_trusted)
+                        goto err;
+        }
+
+        ret = 0;
+err:
+        return ret;
+}
+
+
+/*
+ * Before going into the write reply logic, here is a matrix that shows the
+ * requirements for a write reply as given by RFC1813.
+ *
+ * Requested Write Type ||      Possible Returns
+ * ==============================================
+ * FILE_SYNC            ||      FILE_SYNC
+ * DATA_SYNC            ||      DATA_SYNC or FILE_SYNC
+ * UNSTABLE             ||      DATA_SYNC or FILE_SYNC or UNSTABLE
+ *
+ * Write types other than UNSTABLE are together called STABLE.
+ * RS - Return Stable
+ * RU - Return Unstable
+ * WS - Write Stable
+ * WU - Write Unstable
+ *
+ *+============================================+
+ *| Vol Opts -> || trusted-write| trusted-sync |
+ *| Write Type  ||              |              |
+ *|-------------||--------------|--------------|
+ *| STABLE      ||      WS      |   WU         |
+ *|             ||      RS      |   RS         |
+ *|-------------||--------------|--------------|
+ *| UNSTABLE    ||      WU      |   WU         |
+ *|             ||      RS      |   RS         |
+ *|-------------||--------------|--------------|
+ *| COMMIT      ||    fsync     | getattr      |
+ *+============================================+
+ *
+ *
+ */
 int32_t
 nfs3svc_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                    int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
@@ -1603,6 +1673,8 @@  nfs3svc_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
         nfs_user_t              nfu = {0, };
         nfs3_call_state_t       *cs = NULL;
         struct nfs3_state       *nfs3 = NULL;
+        int                     write_trusted = 0;
+        int                     sync_trusted = 0;
 
         cs = frame->local;
         nfs3 = rpcsvc_request_program_private (cs->req);
@@ -1611,11 +1683,15 @@  nfs3svc_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                 goto err;
         }
 
-        /* So that we do send a reply if an unstable write was requested. */
-        ret = -1;
         stat = NFS3_OK;
         cs->maxcount = op_ret;
-        if (cs->writetype == UNSTABLE)
+
+        write_trusted = nfs3_export_write_trusted (cs->nfs3state,
+                                                   cs->resolvefh.xlatorid);
+        sync_trusted = nfs3_export_sync_trusted (cs->nfs3state,
+                                                 cs->resolvefh.xlatorid);
+        ret = nfs3_write_how (&cs->writetype, write_trusted, sync_trusted);
+        if (ret == -1)
                 goto err;
 
         nfs_request_user_init (&nfu, cs->req);
@@ -4385,6 +4461,13 @@  nfs3_commit_resume (void *carg)
 
         cs = (nfs3_call_state_t *)carg;
         nfs3_check_fh_resolve_status (cs, stat, nfs3err);
+
+        if (nfs3_export_sync_trusted (cs->nfs3state, cs->resolvefh.xlatorid)) {
+                ret = -1;
+                stat = NFS3_OK;
+                goto nfs3err;
+        }
+
         nfs_request_user_init (&nfu, cs->req);
         ret = nfs_fsync (cs->nfsx, cs->vol, &nfu, cs->fd, 0,
                          nfs3svc_commit_cbk, cs);
@@ -4395,7 +4478,8 @@  nfs3err:
         if (ret < 0) {
                 nfs3_log_common_res (rpcsvc_request_xid (cs->req), "COMMIT",
                                      stat, -ret);
-                nfs3_commit_reply (cs->req, stat, 0, NULL, NULL);
+                nfs3_commit_reply (cs->req, stat, cs->nfs3state->serverstart,
+                                   NULL, NULL);
                 nfs3_call_state_wipe (cs);
                 ret = 0;
         }
@@ -4649,6 +4733,7 @@  nfs3_init_subvolume_options (struct nfs3_export *exp, dict_t *options)
         char            *optstr = NULL;
         char            searchkey[1024];
         char            *name = NULL;
+        gf_boolean_t    boolt = _gf_false;
 
         if ((!exp) || (!options))
                 return -1;
@@ -4693,8 +4778,75 @@  nfs3_init_subvolume_options (struct nfs3_export *exp, dict_t *options)
                 }
         }
 
-        gf_log (GF_NFS3, GF_LOG_TRACE, "%s: %s", exp->subvol->name,
-                (exp->access == GF_NFS3_VOLACCESS_RO)?"read-only":"read-write");
+        exp->trusted_sync = 0;
+        ret = snprintf (searchkey, 1024, "nfs3.%s.trusted-sync", name);
+        if (ret < 0) {
+                gf_log (GF_NFS3, GF_LOG_ERROR, "snprintf failed");
+                ret = -1;
+                goto err;
+        }
+
+        if (dict_get (options, searchkey)) {
+                ret = dict_get_str (options, searchkey, &optstr);
+                if (ret < 0) {
+                        gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to read "
+                                " option: %s", searchkey);
+                        ret = -1;
+                        goto err;
+                }
+
+                ret = gf_string2boolean (optstr, &boolt);
+                if (ret < 0) {
+                        gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to convert str "
+                                "to gf_boolean_t");
+                        ret = -1;
+                        goto err;
+                }
+
+                if (boolt == _gf_true)
+                        exp->trusted_sync = 1;
+        }
+
+        exp->trusted_write = 0;
+        ret = snprintf (searchkey, 1024, "nfs3.%s.trusted-write", name);
+        if (ret < 0) {
+                gf_log (GF_NFS3, GF_LOG_ERROR, "snprintf failed");
+                ret = -1;
+                goto err;
+        }
+
+        if (dict_get (options, searchkey)) {
+                ret = dict_get_str (options, searchkey, &optstr);
+                if (ret < 0) {
+                        gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to read "
+                                " option: %s", searchkey);
+                        ret = -1;
+                        goto err;
+                }
+
+                ret = gf_string2boolean (optstr, &boolt);
+                if (ret < 0) {
+                        gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to convert str "
+                                "to gf_boolean_t");
+                        ret = -1;
+                        goto err;
+                }
+
+                if (boolt == _gf_true)
+                        exp->trusted_write = 1;
+        }
+
+        /* If trusted-sync is on, then we also switch on trusted-write because
+         * tw is included in ts. In write logic, we're then only checking for
+         * tw.
+         */
+        if (exp->trusted_sync)
+                exp->trusted_write = 1;
+
+        gf_log (GF_NFS3, GF_LOG_TRACE, "%s: %s, %s, %s", exp->subvol->name,
+                (exp->access == GF_NFS3_VOLACCESS_RO)?"read-only":"read-write",
+                (exp->trusted_sync == 0)?"no trusted_sync":"trusted_sync",
+                (exp->trusted_write == 0)?"no trusted_write":"trusted_write");
         ret = 0;
 err:
         return ret;
diff --git a/xlators/nfs/server/src/nfs3.h b/xlators/nfs/server/src/nfs3.h
index bb5fbb7..ccdad44 100644
--- a/xlators/nfs/server/src/nfs3.h
+++ b/xlators/nfs/server/src/nfs3.h
@@ -84,6 +84,8 @@  struct nfs3_fd_entry {
 struct nfs3_export {
         xlator_t                *subvol;
         int                     access;
+        int                     trusted_sync;
+        int                     trusted_write;
 };
 
 #define GF_NFS3_DEFAULT_VOLACCESS       (GF_NFS3_VOLACCESS_RW)