int flags;
        int err;
        unsigned long blocknr;
+       ktime_t start_time;
+       u64 commit_time;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
+       start_time = ktime_get();
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        spin_unlock(&journal->j_state_lock);
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
+       commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+
+       /*
+        * weight the commit time higher than the average time so we don't
+        * react too strongly to vast changes in commit time
+        */
+       if (likely(journal->j_average_commit_time))
+               journal->j_average_commit_time = (commit_time*3 +
+                               journal->j_average_commit_time) / 4;
+       else
+               journal->j_average_commit_time = commit_time;
+
        spin_unlock(&journal->j_state_lock);
 
        if (commit_transaction->t_checkpoint_list == NULL &&
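For illustration, here is a minimal userspace sketch of the 3:1 weighting used in the hunk above (the u64 typedef, the update_average_commit_time() helper name and the sample values are assumptions made for this sketch, not part of the patch): the newest commit time counts three times as much as the stored average, so the estimate tracks what the disk is doing now while a single outlier cannot replace the history outright.

#include <stdio.h>

typedef unsigned long long u64;

/* Same 3:1 weighting as the hunk above: the new sample dominates, but
 * a quarter of the previous average survives each update. */
static u64 update_average_commit_time(u64 average, u64 commit_time)
{
        if (average)
                return (commit_time * 3 + average) / 4;
        /* First commit ever measured: seed the average directly. */
        return commit_time;
}

int main(void)
{
        u64 samples[] = { 800000, 900000, 5000000, 850000 };   /* ns */
        u64 avg = 0;
        int i;

        for (i = 0; i < 4; i++) {
                avg = update_average_commit_time(avg, samples[i]);
                printf("commit %llu ns -> average %llu ns\n",
                       samples[i], avg);
        }
        return 0;
}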
 
 #include <linux/timer.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/hrtimer.h>
 
 static void __journal_temp_unlink_buffer(struct journal_head *jh);
 
 {
        transaction->t_journal = journal;
        transaction->t_state = T_RUNNING;
+       transaction->t_start_time = ktime_get();
        transaction->t_tid = journal->j_transaction_sequence++;
        transaction->t_expires = jiffies + journal->j_commit_interval;
        spin_lock_init(&transaction->t_handle_lock);
 {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
-       int old_handle_count, err;
+       int err;
        pid_t pid;
 
        J_ASSERT(journal_current_handle() == handle);
         * on IO anyway.  Speeds up many-threaded, many-dir operations
         * by 30x or more...
         *
+        * We try to optimize the sleep time against what the underlying disk
+        * can do, instead of having a static sleep time.  This is useful for
+        * the case where our storage is so fast that it is better to go ahead
+        * and force a flush and wait for the transaction to be committed than
+        * it is to wait for an arbitrary amount of time for new writers to
+        * join the transaction.  We achieve this by measuring how long it
+        * takes to commit a transaction, comparing that with how long this
+        * transaction has been running, and if run time < commit time then we
+        * sleep for the delta and commit.  This greatly helps super fast disks
+        * that would see slowdowns as more threads started doing fsyncs.
+        *
         * But don't do this if this process was the most recent one to
         * perform a synchronous write.  We do this to detect the case where a
         * single process is doing a stream of sync writes.  No point in waiting
         */
        pid = current->pid;
        if (handle->h_sync && journal->j_last_sync_writer != pid) {
+               u64 commit_time, trans_time;
+
                journal->j_last_sync_writer = pid;
-               do {
-                       old_handle_count = transaction->t_handle_count;
-                       schedule_timeout_uninterruptible(1);
-               } while (old_handle_count != transaction->t_handle_count);
+
+               spin_lock(&journal->j_state_lock);
+               commit_time = journal->j_average_commit_time;
+               spin_unlock(&journal->j_state_lock);
+
+               trans_time = ktime_to_ns(ktime_sub(ktime_get(),
+                                                  transaction->t_start_time));
+
+               commit_time = min_t(u64, commit_time,
+                                   1000*jiffies_to_usecs(1));
+
+               if (trans_time < commit_time) {
+                       ktime_t expires = ktime_add_ns(ktime_get(),
+                                                      commit_time);
+                       set_current_state(TASK_UNINTERRUPTIBLE);
+                       schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+               }
        }
 
        current->journal_info = NULL;
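For illustration, a minimal userspace sketch of the batching decision above (the fsync_sleep_ns() helper is invented for this sketch, and TICK_NS stands in for 1000 * jiffies_to_usecs(1) assuming HZ=250, i.e. one 4 ms tick expressed in nanoseconds): the wait is capped at one tick so this path never sleeps longer than the old schedule_timeout_uninterruptible(1) poll did, and a handle that has already been running longer than an average commit skips the sleep entirely.

#include <stdio.h>

typedef unsigned long long u64;

/* Stand-in for 1000 * jiffies_to_usecs(1) assuming HZ=250: 4 ms in ns. */
#define TICK_NS 4000000ULL

/* How long the code above would sleep before forcing the commit. */
static u64 fsync_sleep_ns(u64 average_commit_ns, u64 trans_runtime_ns)
{
        /* Never wait longer than one scheduler tick. */
        u64 commit_time = average_commit_ns < TICK_NS ?
                          average_commit_ns : TICK_NS;

        /* A handle that has outlived an average commit gains nothing
         * from waiting for more writers to join: commit immediately. */
        if (trans_runtime_ns >= commit_time)
                return 0;

        /* The patch arms an absolute hrtimer at now + commit_time, so
         * the caller waits roughly one more average commit interval for
         * other writers before pushing the commit out. */
        return commit_time;
}

int main(void)
{
        printf("fast disk, young handle:    sleep %llu ns\n",
               fsync_sleep_ns(600000, 100000));
        printf("slow disk, young handle:    sleep %llu ns (capped)\n",
               fsync_sleep_ns(20000000, 100000));
        printf("handle older than a commit: sleep %llu ns\n",
               fsync_sleep_ns(600000, 900000));
        return 0;
}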
 
         */
        unsigned long           t_expires;
 
+       /*
+        * When this transaction started, in nanoseconds [no locking]
+        */
+       ktime_t                 t_start_time;
+
        /*
         * How many handles used this transaction? [t_handle_lock]
         */
        struct buffer_head      **j_wbuf;
        int                     j_wbufsize;
 
+       /*
+        * The pid of the last process to run a synchronous operation
+        * through the journal.
+        */
        pid_t                   j_last_sync_writer;
 
+       /*
+        * the average amount of time in nanoseconds it takes to commit a
+        * transaction to the disk.  [j_state_lock]
+        */
+       u64                     j_average_commit_time;
+
        /*
         * An opaque pointer to fs-private information.  ext3 puts its
         * superblock pointer here