pilppa.org Git - linux-2.6-omap-h63xx.git/blobdiff - fs/xfs/linux-2.6/xfs_sync.c
[XFS] Warn on transaction in flight on read-only remount
[linux-2.6-omap-h63xx.git] / fs / xfs / linux-2.6 / xfs_sync.c
index c765eb2a8dca05bbd90d7fb588096142bd8db66a..a608e72fa4054c3892bfc45b17e0343d21fd31ce 100644 (file)
 #include "xfs_inode_item.h"
 #include "xfs_rw.h"
 
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
 /*
- * xfs_sync flushes any pending I/O to file system vfsp.
- *
- * This routine is called by vfs_sync() to make sure that things make it
- * out to disk eventually, on sync() system calls to flush out everything,
- * and when the file system is unmounted.  For the vfs_sync() case, all
- * we really need to do is sync out the log to make all of our meta-data
- * updates permanent (except for timestamps).  For calls from pflushd(),
- * dirty pages are kept moving by calling pdflush() on the inodes
- * containing them.  We also flush the inodes that we can lock without
- * sleeping and the superblock if we can lock it without sleeping from
- * vfs_sync() so that items at the tail of the log are always moving out.
- *
- * Flags:
- *      SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
- *                    to sleep if we can help it.  All we really need
- *                    to do is ensure that the log is synced at least
- *                    periodically.  We also push the inodes and
- *                    superblock if we can lock them without sleeping
- *                     and they are not pinned.
- *      SYNC_ATTR    - We need to flush the inodes.  If SYNC_BDFLUSH is not
- *                    set, then we really want to lock each inode and flush
- *                    it.
- *      SYNC_WAIT    - All the flushes that take place in this call should
- *                    be synchronous.
- *      SYNC_DELWRI  - This tells us to push dirty pages associated with
- *                    inodes.  SYNC_WAIT and SYNC_BDFLUSH are used to
- *                    determine if they should be flushed sync, async, or
- *                    delwri.
- *      SYNC_CLOSE   - This flag is passed when the system is being
- *                    unmounted.  We should sync and invalidate everything.
- *      SYNC_FSDATA  - This indicates that the caller would like to make
- *                    sure the superblock is safe on disk.  We can ensure
- *                    this by simply making sure the log gets flushed
- *                    if SYNC_BDFLUSH is set, and by actually writing it
- *                    out otherwise.
- *     SYNC_IOWAIT  - The caller wants us to wait for all data I/O to complete
- *                    before we return (including direct I/O). Forms the drain
- *                    side of the write barrier needed to safely quiesce the
- *                    filesystem.
- *
+ * Sync all the inodes in the given AG according to the
+ * direction given by the flags.
  */
-int
-xfs_sync(
+STATIC int
+xfs_sync_inodes_ag(
        xfs_mount_t     *mp,
+       int             ag,
        int             flags)
 {
-       int             error;
+       xfs_perag_t     *pag = &mp->m_perag[ag];
+       int             nr_found;
+       uint32_t        first_index = 0;
+       int             error = 0;      /* not cleared between inodes */
+       int             last_error = 0; /* most recent nonzero error */
+       int             fflag = XFS_B_ASYNC;    /* mode for xfs_flush_pages() */
+
+       if (flags & SYNC_DELWRI)
+               fflag = XFS_B_DELWRI;
+       if (flags & SYNC_WAIT)
+               fflag = 0;              /* synchronous overrides all */
+
+       do {
+               struct inode    *inode;
+               xfs_inode_t     *ip = NULL;
+               int             lock_flags = XFS_ILOCK_SHARED;  /* handed to xfs_iput() */
+
+               /*
+                * use a gang lookup to find the next inode in the tree
+                * as the tree is sparse and a gang lookup walks to find
+                * the number of objects requested.
+                *
+                * Only one entry is requested per call, so each pass of
+                * this loop handles a single inode. pag_ici_lock is held
+                * for read from the lookup until the inode has been
+                * grabbed or skipped.
+                */
+               read_lock(&pag->pag_ici_lock);
+               nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+                               (void**)&ip, first_index, 1);
+
+               if (!nr_found) {
+                       read_unlock(&pag->pag_ici_lock);
+                       break;
+               }
+
+               /*
+                * Update the index for the next lookup. Catch overflows
+                * into the next AG range which can occur if we have inodes
+                * in the last block of the AG and we are currently
+                * pointing to the last inode.
+                *
+                * The wrap is detected by the new index comparing lower
+                * than the index of the inode just found; the walk of
+                * this AG ends there without syncing that inode.
+                */
+               first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+               if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
+                       read_unlock(&pag->pag_ici_lock);
+                       break;
+               }
+
+               /* nothing to sync during shutdown */
+               if (XFS_FORCED_SHUTDOWN(mp)) {
+                       read_unlock(&pag->pag_ici_lock);
+                       return 0;
+               }
+
+               /*
+                * If we can't get a reference on the inode, it must be
+                * in reclaim. Leave it for the reclaim code to flush.
+                */
+               inode = VFS_I(ip);
+               if (!igrab(inode)) {
+                       read_unlock(&pag->pag_ici_lock);
+                       continue;
+               }
+               read_unlock(&pag->pag_ici_lock);
+
+               /* avoid new or bad inodes */
+               if (is_bad_inode(inode) ||
+                   xfs_iflags_test(ip, XFS_INEW)) {
+                       IRELE(ip);
+                       continue;
+               }
 
-       /*
-        * Get the Quota Manager to flush the dquots.
-        *
-        * If XFS quota support is not enabled or this filesystem
-        * instance does not use quotas XFS_QM_DQSYNC will always
-        * return zero.
-        */
-       error = XFS_QM_DQSYNC(mp, flags);
-       if (error) {
                /*
-                * If we got an IO error, we will be shutting down.
-                * So, there's nothing more for us to do here.
+                * If we have to flush data or wait for I/O completion
+                * we need to hold the iolock.
+                *
+                * NOTE(review): the xfs_flush_pages() result stored in
+                * 'error' is overwritten if xfs_iflush() runs further
+                * down in the same iteration.
                 */
-               ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
-               if (XFS_FORCED_SHUTDOWN(mp))
+               if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) {
+                       xfs_ilock(ip, XFS_IOLOCK_SHARED);
+                       lock_flags |= XFS_IOLOCK_SHARED;
+                       error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
+                       if (flags & SYNC_IOWAIT)
+                               xfs_ioend_wait(ip);
+               }
+               xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+               if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
+                       if (flags & SYNC_WAIT) {
+                               xfs_iflock(ip);
+                               if (!xfs_inode_clean(ip))       /* re-check under the flush lock */
+                                       error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
+                               else
+                                       xfs_ifunlock(ip);
+                       } else if (xfs_iflock_nowait(ip)) {
+                               if (!xfs_inode_clean(ip))       /* re-check under the flush lock */
+                                       error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
+                               else
+                                       xfs_ifunlock(ip);
+                       }
+               }
+               xfs_iput(ip, lock_flags);
+
+               if (error)
+                       last_error = error;
+               /*
+                * bail out if the filesystem is corrupted.
+                */
+               if (error == EFSCORRUPTED)
                        return XFS_ERROR(error);
-       }
 
-       if (flags & SYNC_IOWAIT)
-               xfs_filestream_flush(mp);
+       } while (nr_found);     /* always nonzero here; exits are the break/return paths above */
 
-       return xfs_syncsub(mp, flags, NULL);
+       return last_error;
 }
 
-/*
- * xfs sync routine for internal use
- *
- * This routine supports all of the flags defined for the generic vfs_sync
- * interface as explained above under xfs_sync.
- *
- */
 int
 xfs_sync_inodes(
        xfs_mount_t     *mp,
-       int             flags,
-       int             *bypassed)
+       int             flags)
 {
-       xfs_inode_t     *ip = NULL;
-       struct inode    *vp = NULL;
        int             error;
        int             last_error;
-       uint64_t        fflag;
-       uint            lock_flags;
-       uint            base_lock_flags;
-       boolean_t       mount_locked;
-       boolean_t       vnode_refed;
-       int             preempt;
-       xfs_iptr_t      *ipointer;
-#ifdef DEBUG
-       boolean_t       ipointer_in = B_FALSE;
-
-#define IPOINTER_SET   ipointer_in = B_TRUE
-#define IPOINTER_CLR   ipointer_in = B_FALSE
-#else
-#define IPOINTER_SET
-#define IPOINTER_CLR
-#endif
-
-
-/* Insert a marker record into the inode list after inode ip. The list
- * must be locked when this is called. After the call the list will no
- * longer be locked.
- */
-#define IPOINTER_INSERT(ip, mp)        { \
-               ASSERT(ipointer_in == B_FALSE); \
-               ipointer->ip_mnext = ip->i_mnext; \
-               ipointer->ip_mprev = ip; \
-               ip->i_mnext = (xfs_inode_t *)ipointer; \
-               ipointer->ip_mnext->i_mprev = (xfs_inode_t *)ipointer; \
-               preempt = 0; \
-               XFS_MOUNT_IUNLOCK(mp); \
-               mount_locked = B_FALSE; \
-               IPOINTER_SET; \
-       }
-
-/* Remove the marker from the inode list. If the marker was the only item
- * in the list then there are no remaining inodes and we should zero out
- * the whole list. If we are the current head of the list then move the head
- * past us.
- */
-#define IPOINTER_REMOVE(ip, mp)        { \
-               ASSERT(ipointer_in == B_TRUE); \
-               if (ipointer->ip_mnext != (xfs_inode_t *)ipointer) { \
-                       ip = ipointer->ip_mnext; \
-                       ip->i_mprev = ipointer->ip_mprev; \
-                       ipointer->ip_mprev->i_mnext = ip; \
-                       if (mp->m_inodes == (xfs_inode_t *)ipointer) { \
-                               mp->m_inodes = ip; \
-                       } \
-               } else { \
-                       ASSERT(mp->m_inodes == (xfs_inode_t *)ipointer); \
-                       mp->m_inodes = NULL; \
-                       ip = NULL; \
-               } \
-               IPOINTER_CLR; \
-       }
-
-#define XFS_PREEMPT_MASK       0x7f
+       int             i;
+       int             lflags = XFS_LOG_FORCE; /* flags for the xfs_log_force() below */
 
-       ASSERT(!(flags & SYNC_BDFLUSH));
-
-       if (bypassed)
-               *bypassed = 0;
        if (mp->m_flags & XFS_MOUNT_RDONLY)
                return 0;
        error = 0;
        last_error = 0;
-       preempt = 0;
 
-       /* Allocate a reference marker */
-       ipointer = (xfs_iptr_t *)kmem_zalloc(sizeof(xfs_iptr_t), KM_SLEEP);
+       if (flags & SYNC_WAIT)
+               lflags |= XFS_LOG_SYNC; /* SYNC_WAIT callers also get XFS_LOG_SYNC */
 
-       fflag = XFS_B_ASYNC;            /* default is don't wait */
+       for (i = 0; i < mp->m_sb.sb_agcount; i++) {     /* one pass per allocation group */
+               if (!mp->m_perag[i].pag_ici_init)       /* skip AGs whose pag_ici_init is clear */
+                       continue;
+               error = xfs_sync_inodes_ag(mp, i, flags);
+               if (error)
+                       last_error = error;     /* remember the latest failure, keep going */
+               if (error == EFSCORRUPTED)
+                       break;  /* corruption stops the walk over the remaining AGs */
+       }
        if (flags & SYNC_DELWRI)
-               fflag = XFS_B_DELWRI;
-       if (flags & SYNC_WAIT)
-               fflag = 0;              /* synchronous overrides all */
+               xfs_log_force(mp, 0, lflags);   /* log is only forced for SYNC_DELWRI callers */
 
-       base_lock_flags = XFS_ILOCK_SHARED;
-       if (flags & (SYNC_DELWRI | SYNC_CLOSE)) {
-               /*
-                * We need the I/O lock if we're going to call any of
-                * the flush/inval routines.
-                */
-               base_lock_flags |= XFS_IOLOCK_SHARED;
+       return XFS_ERROR(last_error);   /* last nonzero per-AG error, or 0 */
+}
+
+STATIC int
+xfs_commit_dummy_trans(
+       struct xfs_mount        *mp,
+       uint                    log_flags)
+{
+       struct xfs_inode        *ip = mp->m_rootip;     /* the dummy change is logged against the root inode */
+       struct xfs_trans        *tp;
+       int                     error;
+
+       /*
+        * Put a dummy transaction in the log to tell recovery
+        * that all others are OK.
+        *
+        * A failed log reservation cancels the transaction and is the
+        * only error this function returns to its caller.
+        */
+       tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
+       error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+       if (error) {
+               xfs_trans_cancel(tp, 0);
+               return error;
        }
 
-       XFS_MOUNT_ILOCK(mp);
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
 
-       ip = mp->m_inodes;
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+       xfs_trans_ihold(tp, ip);
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+       /* XXX(hch): ignoring the error here.. */
+       error = xfs_trans_commit(tp, 0);
 
-       mount_locked = B_TRUE;
-       vnode_refed  = B_FALSE;
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);        /* unlocked here explicitly after the commit */
 
-       IPOINTER_CLR;
+       xfs_log_force(mp, 0, log_flags);        /* caller chooses the force flags */
+       return 0;       /* the xfs_trans_commit() result above is not propagated */
+}
 
-       do {
-               ASSERT(ipointer_in == B_FALSE);
-               ASSERT(vnode_refed == B_FALSE);
+int
+xfs_sync_fsdata(
+       struct xfs_mount        *mp,
+       int                     flags)
+{
+       struct xfs_buf          *bp;
+       struct xfs_buf_log_item *bip;
+       int                     error = 0;
 
-               lock_flags = base_lock_flags;
+       /*
+        * If this is xfssyncd() then only sync the superblock if we can
+        * lock it without sleeping and it is not pinned.
+        *
+        * In that mode the function returns 0 without writing anything
+        * when the buffer cannot be obtained, has no log item, is not
+        * dirty, or is pinned.
+        */
+       if (flags & SYNC_BDFLUSH) {
+               ASSERT(!(flags & SYNC_WAIT));
 
-               /*
-                * There were no inodes in the list, just break out
-                * of the loop.
-                */
-               if (ip == NULL) {
-                       break;
-               }
+               bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
+               if (!bp)
+                       goto out;       /* no buffer obtained, nothing to release */
+
+               bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
+               if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
+                       goto out_brelse;
+       } else {
+               bp = xfs_getsb(mp, 0);
 
                /*
-                * We found another sync thread marker - skip it
+                * If the buffer is pinned then push on the log so we won't
+                * get stuck waiting in the write for someone, maybe
+                * ourselves, to flush the log.
+                *
+                * Even though we just pushed the log above, we did not have
+                * the superblock buffer locked at that point so it can
+                * become pinned in between there and here.
+                *
+                * NOTE(review): no log force is visible earlier in this
+                * function; "above" presumably refers to a caller - confirm.
                 */
-               if (ip->i_mount == NULL) {
-                       ip = ip->i_mnext;
-                       continue;
-               }
+               if (XFS_BUF_ISPINNED(bp))
+                       xfs_log_force(mp, 0, XFS_LOG_FORCE);
+       }
 
-               vp = VFS_I(ip);
 
-               /*
-                * If the vnode is gone then this is being torn down,
-                * call reclaim if it is flushed, else let regular flush
-                * code deal with it later in the loop.
-                */
+       if (flags & SYNC_WAIT)
+               XFS_BUF_UNASYNC(bp);    /* SYNC_WAIT: clear the async flag before writing */
+       else
+               XFS_BUF_ASYNC(bp);
 
-               if (vp == NULL) {
-                       /* Skip ones already in reclaim */
-                       if (ip->i_flags & XFS_IRECLAIM) {
-                               ip = ip->i_mnext;
-                               continue;
-                       }
-                       if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
-                               ip = ip->i_mnext;
-                       } else if ((xfs_ipincount(ip) == 0) &&
-                                   xfs_iflock_nowait(ip)) {
-                               IPOINTER_INSERT(ip, mp);
-
-                               xfs_finish_reclaim(ip, 1,
-                                               XFS_IFLUSH_DELWRI_ELSE_ASYNC);
-
-                               XFS_MOUNT_ILOCK(mp);
-                               mount_locked = B_TRUE;
-                               IPOINTER_REMOVE(ip, mp);
-                       } else {
-                               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                               ip = ip->i_mnext;
-                       }
-                       continue;
-               }
+       return xfs_bwrite(mp, bp);      /* the write result is the return value on this path */
 
-               if (VN_BAD(vp)) {
-                       ip = ip->i_mnext;
-                       continue;
-               }
+ out_brelse:
+       xfs_buf_relse(bp);
+ out:
+       return error;   /* still 0: error is never assigned after its initialisation */
+}
 
-               if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
-                       XFS_MOUNT_IUNLOCK(mp);
-                       kmem_free(ipointer);
-                       return 0;
-               }
+/*
+ * When remounting a filesystem read-only or freezing the filesystem, we have
+ * two phases to execute. This first phase is syncing the data before we
+ * quiesce the filesystem, and the second is flushing all the inodes out after
+ * we've waited for all the transactions created by the first phase to
+ * complete. The second phase ensures that the inodes are written to their
+ * location on disk rather than just existing in transactions in the log. This
+ * means after a quiesce there is no log replay required to write the inodes to
+ * disk (this is the main difference between a sync and a quiesce).
+ */
+/*
+ * First stage of freeze - no writers will make progress now we are here,
+ * so we flush delwri and delalloc buffers here, then wait for all I/O to
+ * complete.  Data is frozen at that point. Metadata is not frozen,
+ * transactions can still occur here so don't bother flushing the buftarg
+ * because it'll just get dirty again.
+ *
+ * Returns the result of xfs_sync_fsdata(); the results of the inode and
+ * quota sync passes are not examined.
+ */
+int
+xfs_quiesce_data(
+       struct xfs_mount        *mp)
+{
+       int error;
 
-               /*
-                * Try to lock without sleeping.  We're out of order with
-                * the inode list lock here, so if we fail we need to drop
-                * the mount lock and try again.  If we're called from
-                * bdflush() here, then don't bother.
-                *
-                * The inode lock here actually coordinates with the
-                * almost spurious inode lock in xfs_ireclaim() to prevent
-                * the vnode we handle here without a reference from
-                * being freed while we reference it.  If we lock the inode
-                * while it's on the mount list here, then the spurious inode
-                * lock in xfs_ireclaim() after the inode is pulled from
-                * the mount list will sleep until we release it here.
-                * This keeps the vnode from being freed while we reference
-                * it.
-                */
-               if (xfs_ilock_nowait(ip, lock_flags) == 0) {
-                       if (vp == NULL) {
-                               ip = ip->i_mnext;
-                               continue;
-                       }
+       /* push non-blocking */
+       xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_BDFLUSH);
+       XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
+       xfs_filestream_flush(mp);
 
-                       vp = vn_grab(vp);
-                       if (vp == NULL) {
-                               ip = ip->i_mnext;
-                               continue;
-                       }
+       /* push and block */
+       xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_WAIT|SYNC_IOWAIT);
+       XFS_QM_DQSYNC(mp, SYNC_WAIT);
 
-                       IPOINTER_INSERT(ip, mp);
-                       xfs_ilock(ip, lock_flags);
+       /* write superblock and hoover up shutdown errors */
+       error = xfs_sync_fsdata(mp, 0);
 
-                       ASSERT(vp == VFS_I(ip));
-                       ASSERT(ip->i_mount == mp);
+       /* flush data-only devices */
+       if (mp->m_rtdev_targp)  /* only when a realtime device target is set */
+               XFS_bflush(mp->m_rtdev_targp);
 
-                       vnode_refed = B_TRUE;
+       return error;
+}
+
+STATIC void
+xfs_quiesce_fs(
+       struct xfs_mount        *mp)
+{
+       int     count = 0, pincount;
+
+       xfs_flush_buftarg(mp->m_ddev_targp, 0);
+       xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+
+       /*
+        * This loop must run at least twice.  The first instance of the loop
+        * will flush most meta data but that will generate more meta data
+        * (typically directory updates).  Which then must be flushed and
+        * logged before we can write the unmount record.
+        *
+        * 'count' only advances on passes where xfs_flush_buftarg() returns
+        * zero, so the loop ends after two such passes; passes that return
+        * nonzero repeat immediately without the delay.
+        */
+       do {
+               xfs_sync_inodes(mp, SYNC_ATTR|SYNC_WAIT);
+               pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
+               if (!pincount) {
+                       delay(50);
+                       count++;
                }
+       } while (count < 2);
+}
 
-               /* From here on in the loop we may have a marker record
-                * in the inode list.
-                */
+/*
+ * Second stage of a quiesce. The data is already synced, now we have to take
+ * care of the metadata. New transactions are already blocked, so we need to
+ * wait for any remaining transactions to drain out before proceeding.
+ */
+void
+xfs_quiesce_attr(
+       struct xfs_mount        *mp)
+{
+       int     error = 0;
 
-               /*
-                * If we have to flush data or wait for I/O completion
-                * we need to drop the ilock that we currently hold.
-                * If we need to drop the lock, insert a marker if we
-                * have not already done so.
-                */
-               if ((flags & (SYNC_CLOSE|SYNC_IOWAIT)) ||
-                   ((flags & SYNC_DELWRI) && VN_DIRTY(vp))) {
-                       if (mount_locked) {
-                               IPOINTER_INSERT(ip, mp);
-                       }
-                       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+       /* wait for all modifications to complete */
+       while (atomic_read(&mp->m_active_trans) > 0)    /* polls m_active_trans until it is not positive */
+               delay(100);
 
-                       if (flags & SYNC_CLOSE) {
-                               /* Shutdown case. Flush and invalidate. */
-                               if (XFS_FORCED_SHUTDOWN(mp))
-                                       xfs_tosspages(ip, 0, -1,
-                                                            FI_REMAPF);
-                               else
-                                       error = xfs_flushinval_pages(ip,
-                                                       0, -1, FI_REMAPF);
-                       } else if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) {
-                               error = xfs_flush_pages(ip, 0,
-                                                       -1, fflag, FI_NONE);
-                       }
+       /* flush inodes and push all remaining buffers out to disk */
+       xfs_quiesce_fs(mp);
 
-                       /*
-                        * When freezing, we need to wait ensure all I/O (including direct
-                        * I/O) is complete to ensure no further data modification can take
-                        * place after this point
-                        */
-                       if (flags & SYNC_IOWAIT)
-                               vn_iowait(ip);
+       /*
+        * Just warn here till VFS can correctly support
+        * read-only remount without racing.
+        *
+        * The counter is re-read after xfs_quiesce_fs(); a nonzero value
+        * only triggers the warning, the function carries on regardless.
+        */
+       WARN_ON(atomic_read(&mp->m_active_trans) != 0);
+
+       /*
+        * Push the superblock and write an unmount record.
+        *
+        * A failure from xfs_log_sbcount() is only reported; the unmount
+        * record and superblock are still written afterwards.
+        * NOTE(review): the message below names "xfs_attr_quiesce" while
+        * this function is xfs_quiesce_attr - confirm which is intended.
+        */
+       error = xfs_log_sbcount(mp, 1);
+       if (error)
+               xfs_fs_cmn_err(CE_WARN, mp,
+                               "xfs_attr_quiesce: failed to log sb changes. "
+                               "Frozen image may not be consistent.");
+       xfs_log_unmount_write(mp);
+       xfs_unmountfs_writesb(mp);
+}
 
-                       xfs_ilock(ip, XFS_ILOCK_SHARED);
-               }
+/*
+ * Enqueue a work item to be picked up by the vfs xfssyncd thread.
+ * Doing this has two advantages:
+ * - It saves on stack space, which is tight in certain situations
+ * - It can be used (with care) as a mechanism to avoid deadlocks.
+ * Flushing while allocating in a full filesystem requires both.
+ *
+ * mp:     mount whose sync list and sync task are used
+ * data:   opaque argument stored in the item and handed to the syncer
+ * syncer: callback stored in the item, invoked as syncer(mp, data)
+ *
+ * The item is allocated here with KM_SLEEP, appended to mp->m_sync_list
+ * under m_sync_lock, and mp->m_sync_task is then woken. xfssyncd() frees
+ * the item after running it.
+ */
+STATIC void
+xfs_syncd_queue_work(
+       struct xfs_mount *mp,
+       void            *data,
+       void            (*syncer)(struct xfs_mount *, void *))
+{
+       struct bhv_vfs_sync_work *work;
+
+       work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
+       INIT_LIST_HEAD(&work->w_list);
+       work->w_syncer = syncer;
+       work->w_data = data;
+       work->w_mount = mp;
+       spin_lock(&mp->m_sync_lock);
+       list_add_tail(&work->w_list, &mp->m_sync_list);
+       spin_unlock(&mp->m_sync_lock);
+       wake_up_process(mp->m_sync_task);
+}
 
-               if ((flags & SYNC_ATTR) &&
-                   (ip->i_update_core ||
-                    (ip->i_itemp && ip->i_itemp->ili_format.ilf_fields))) {
-                       if (mount_locked)
-                               IPOINTER_INSERT(ip, mp);
+/*
+ * Flush delayed allocate data, attempting to free up reserved space
+ * from existing allocations.  At this point a new allocation attempt
+ * has failed with ENOSPC and we are in the process of scratching our
+ * heads, looking about for more room...
+ *
+ * Work callback queued by xfs_flush_inode(). 'arg' is the struct inode
+ * that was grabbed there; its mapping is flushed with filemap_flush()
+ * and the reference is then dropped with iput(). 'mp' is not used.
+ */
+STATIC void
+xfs_flush_inode_work(
+       struct xfs_mount *mp,
+       void            *arg)
+{
+       struct inode    *inode = arg;
+       filemap_flush(inode->i_mapping);
+       iput(inode);
+}
 
-                       if (flags & SYNC_WAIT) {
-                               xfs_iflock(ip);
-                               error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
-
-                       /*
-                        * If we can't acquire the flush lock, then the inode
-                        * is already being flushed so don't bother waiting.
-                        *
-                        * If we can lock it then do a delwri flush so we can
-                        * combine multiple inode flushes in each disk write.
-                        */
-                       } else if (xfs_iflock_nowait(ip)) {
-                               error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
-                       } else if (bypassed) {
-                               (*bypassed)++;
-                       }
-               }
+void
+xfs_flush_inode(
+       xfs_inode_t     *ip)
+{
+       struct inode    *inode = VFS_I(ip);
 
-               if (lock_flags != 0) {
-                       xfs_iunlock(ip, lock_flags);
-               }
+       igrab(inode);   /* result unchecked; xfs_flush_inode_work() does the iput() */
+       xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
+       delay(msecs_to_jiffies(500));   /* pause the caller after queueing the flush */
+}
 
-               if (vnode_refed) {
-                       /*
-                        * If we had to take a reference on the vnode
-                        * above, then wait until after we've unlocked
-                        * the inode to release the reference.  This is
-                        * because we can be already holding the inode
-                        * lock when IRELE() calls xfs_inactive().
-                        *
-                        * Make sure to drop the mount lock before calling
-                        * IRELE() so that we don't trip over ourselves if
-                        * we have to go for the mount lock again in the
-                        * inactive code.
-                        */
-                       if (mount_locked) {
-                               IPOINTER_INSERT(ip, mp);
-                       }
+/*
+ * This is the "bigger hammer" version of xfs_flush_inode_work...
+ * (IOW, "If at first you don't succeed, use a Bigger Hammer").
+ *
+ * Work callback queued by xfs_flush_device(). It calls sync_blockdev()
+ * on the superblock's block device (mp->m_super->s_bdev) rather than
+ * flushing a single inode. 'arg' is the struct inode grabbed by
+ * xfs_flush_device(); it is used only to drop that reference with iput().
+ */
+STATIC void
+xfs_flush_device_work(
+       struct xfs_mount *mp,
+       void            *arg)
+{
+       struct inode    *inode = arg;
+       sync_blockdev(mp->m_super->s_bdev);
+       iput(inode);
+}
 
-                       IRELE(ip);
+void
+xfs_flush_device(
+       xfs_inode_t     *ip)
+{
+       struct inode    *inode = VFS_I(ip);
 
-                       vnode_refed = B_FALSE;
-               }
+       igrab(inode);   /* result unchecked; xfs_flush_device_work() does the iput() */
+       xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
+       delay(msecs_to_jiffies(500));   /* pause the caller after queueing the flush */
+       xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
+}
 
-               if (error) {
-                       last_error = error;
-               }
+/*
+ * Every sync period we need to unpin all items, reclaim inodes, sync
+ * quota and write out the superblock. We might need to cover the log
+ * to indicate it is idle.
+ *
+ * All of that work is skipped when the mount has XFS_MOUNT_RDONLY set.
+ * In every case m_sync_seq is incremented and waiters on
+ * m_wait_single_sync_task are woken. The second argument is not used.
+ */
+STATIC void
+xfs_sync_worker(
+       struct xfs_mount *mp,
+       void            *unused)
+{
+       int             error;  /* assigned below but never examined */
+
+       if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+               xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+               xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+               /* dgc: errors ignored here */
+               error = XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
+               error = xfs_sync_fsdata(mp, SYNC_BDFLUSH);
+               if (xfs_log_need_covered(mp))
+                       error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
+       }
+       mp->m_sync_seq++;
+       wake_up(&mp->m_wait_single_sync_task);
+}
 
+STATIC int
+xfssyncd(
+       void                    *arg)
+{
+       struct xfs_mount        *mp = arg;
+       long                    timeleft;
+       bhv_vfs_sync_work_t     *work, *n;
+       LIST_HEAD               (tmp);  /* private list the queued work is moved onto */
+
+       set_freezable();
+       timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);  /* centiseconds expressed in jiffies */
+       for (;;) {
+               timeleft = schedule_timeout_interruptible(timeleft);
+               /* swsusp */
+               try_to_freeze();
+               if (kthread_should_stop() && list_empty(&mp->m_sync_list))      /* only stop once no work is queued */
+                       break;
+
+               spin_lock(&mp->m_sync_lock);
                /*
-                * bail out if the filesystem is corrupted.
+                * We can get woken by laptop mode, to do a sync -
+                * that's the (only!) case where the list would be
+                * empty with time remaining.
+                *
+                * Either way the mount's built-in m_sync_work item is
+                * queued here; the timeout is only re-armed when it
+                * actually expired.
                 */
-               if (error == EFSCORRUPTED)  {
-                       if (!mount_locked) {
-                               XFS_MOUNT_ILOCK(mp);
-                               IPOINTER_REMOVE(ip, mp);
-                       }
-                       XFS_MOUNT_IUNLOCK(mp);
-                       ASSERT(ipointer_in == B_FALSE);
-                       kmem_free(ipointer);
-                       return XFS_ERROR(error);
+               if (!timeleft || list_empty(&mp->m_sync_list)) {
+                       if (!timeleft)
+                               timeleft = xfs_syncd_centisecs *
+                                                       msecs_to_jiffies(10);
+                       INIT_LIST_HEAD(&mp->m_sync_work.w_list);
+                       list_add_tail(&mp->m_sync_work.w_list,
+                                       &mp->m_sync_list);
                }
-
-               /* Let other threads have a chance at the mount lock
-                * if we have looped many times without dropping the
-                * lock.
-                */
-               if ((++preempt & XFS_PREEMPT_MASK) == 0) {
-                       if (mount_locked) {
-                               IPOINTER_INSERT(ip, mp);
-                       }
+               list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
+                       list_move(&work->w_list, &tmp); /* detach everything while m_sync_lock is held */
+               spin_unlock(&mp->m_sync_lock);
+
+               list_for_each_entry_safe(work, n, &tmp, w_list) {       /* callbacks run with m_sync_lock dropped */
+                       (*work->w_syncer)(mp, work->w_data);
+                       list_del(&work->w_list);
+                       if (work == &mp->m_sync_work)   /* embedded in the mount, must not be freed */
+                               continue;
+                       kmem_free(work);        /* item allocated by xfs_syncd_queue_work() */
                }
+       }
 
-               if (mount_locked == B_FALSE) {
-                       XFS_MOUNT_ILOCK(mp);
-                       mount_locked = B_TRUE;
-                       IPOINTER_REMOVE(ip, mp);
-                       continue;
-               }
+       return 0;
+}
+
+/*
+ * Start the sync daemon for a mount.
+ *
+ * Points the mount's embedded work item (m_sync_work) at xfs_sync_worker
+ * and at the mount itself, then spawns a kernel thread named "xfssyncd"
+ * running xfssyncd() with the mount as its argument.  The thread handle
+ * (or the ERR_PTR on failure) is stored in mp->m_sync_task.
+ *
+ * Returns 0 on success, otherwise the negated PTR_ERR() of the failed
+ * kthread_run() result.
+ */
+int
+xfs_syncd_init(
+       struct xfs_mount        *mp)
+{
+       mp->m_sync_work.w_mount = mp;
+       mp->m_sync_work.w_syncer = xfs_sync_worker;
+
+       mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
+       if (!IS_ERR(mp->m_sync_task))
+               return 0;
+
+       return -PTR_ERR(mp->m_sync_task);
+}
+
+/*
+ * Stop the sync daemon for a mount by calling kthread_stop() on the
+ * task stored in mp->m_sync_task.  The xfssyncd() loop only exits once
+ * kthread_should_stop() is true and the mount's sync work list is empty.
+ */
+void
+xfs_syncd_stop(
+       struct xfs_mount        *mp)
+{
+       kthread_stop(mp->m_sync_task);
+}
 
-               ASSERT(ipointer_in == B_FALSE);
-               ip = ip->i_mnext;
+/*
+ * Reclaim a single inode: claim it, flush it if needed, and free it.
+ *
+ * @ip:        inode to reclaim
+ * @locked:    non-zero if the caller already holds the inode's
+ *             XFS_ILOCK_EXCL and flush lock; they are released here on
+ *             every path.  If zero, both locks are taken here.
+ * @sync_mode: flush mode passed straight through to xfs_iflush().
+ *
+ * Returns 1 without reclaiming if the inode already has XFS_IRECLAIM set
+ * or is not marked XFS_IRECLAIMABLE; returns 0 after the inode has been
+ * handed to xfs_ireclaim().
+ */
+int
+xfs_reclaim_inode(
+       xfs_inode_t     *ip,
+       int             locked,
+       int             sync_mode)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       xfs_perag_t     *pag = xfs_get_perag(mp, ip->i_ino);
 
-       } while (ip != mp->m_inodes);
+       /* The hash lock here protects a thread in xfs_iget_core from
+        * racing with us on linking the inode back with a vnode.
+        * Once we have the XFS_IRECLAIM flag set it will not touch
+        * us.
+        */
+       write_lock(&pag->pag_ici_lock);
+       spin_lock(&ip->i_flags_lock);
+       if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
+           !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
+               spin_unlock(&ip->i_flags_lock);
+               write_unlock(&pag->pag_ici_lock);
+               /*
+                * Balance the xfs_get_perag() above on the bail-out
+                * path as well.  Use the mount pointer captured at
+                * entry rather than dereferencing ip again now that
+                * the locks have been dropped.
+                */
+               xfs_put_perag(mp, pag);
+               if (locked) {
+                       xfs_ifunlock(ip);
+                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               }
+               return 1;
+       }
+       __xfs_iflags_set(ip, XFS_IRECLAIM);
+       spin_unlock(&ip->i_flags_lock);
+       write_unlock(&pag->pag_ici_lock);
+       xfs_put_perag(mp, pag);
 
-       XFS_MOUNT_IUNLOCK(mp);
+       /*
+        * If the inode is still dirty, then flush it out.  If the inode
+        * is not in the AIL, then it will be OK to flush it delwri as
+        * long as xfs_iflush() does not keep any references to the inode.
+        * We leave that decision up to xfs_iflush() since it has the
+        * knowledge of whether it's OK to simply do a delwri flush of
+        * the inode or whether we need to wait until the inode is
+        * pulled from the AIL.
+        * We get the flush lock regardless, though, just to make sure
+        * we don't free it while it is being flushed.
+        */
+       if (!locked) {
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               xfs_iflock(ip);
+       }
 
-       ASSERT(ipointer_in == B_FALSE);
+       /*
+        * In the case of a forced shutdown we rely on xfs_iflush() to
+        * wait for the inode to be unpinned before returning an error.
+        */
+       if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
+               /* synchronize with xfs_iflush_done */
+               xfs_iflock(ip);
+               xfs_ifunlock(ip);
+       }
 
-       kmem_free(ipointer);
-       return XFS_ERROR(last_error);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       xfs_ireclaim(ip);
+       return 0;
 }
 
 /*
- * xfs sync routine for internal use
- *
- * This routine supports all of the flags defined for the generic vfs_sync
- * interface as explained above under xfs_sync.
- *
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
  */
-int
-xfs_syncsub(
+void
+xfs_inode_set_reclaim_tag(
+       xfs_inode_t     *ip)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       xfs_perag_t     *pag = xfs_get_perag(mp, ip->i_ino);
+
+       /*
+        * Mark the inode as reclaimable: tag its slot in the per-AG inode
+        * cache radix tree with XFS_ICI_RECLAIM_TAG and set
+        * XFS_IRECLAIMABLE in the inode flags, both under i_flags_lock.
+        *
+        * Setting a tag modifies the radix tree, and the radix tree API
+        * requires modifications to exclude other modifications as well
+        * as lookups.  i_flags_lock is per-inode and so cannot serialise
+        * tag updates on different inodes in the same AG, which means
+        * pag_ici_lock has to be held in write mode here.
+        */
+       write_lock(&pag->pag_ici_lock);
+       spin_lock(&ip->i_flags_lock);
+       radix_tree_tag_set(&pag->pag_ici_root,
+                       XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+       __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+       spin_unlock(&ip->i_flags_lock);
+       write_unlock(&pag->pag_ici_lock);
+       xfs_put_perag(mp, pag);
+}
+
+/*
+ * Clear XFS_ICI_RECLAIM_TAG on this inode's slot in the per-AG inode
+ * cache radix tree.  No locks are taken here and the inode's flags are
+ * not touched; xfs_inode_clear_reclaim_tag() calls this with
+ * pag_ici_lock and the inode's i_flags_lock already held.
+ */
+void
+__xfs_inode_clear_reclaim_tag(
        xfs_mount_t     *mp,
-       int             flags,
-       int             *bypassed)
+       xfs_perag_t     *pag,
+       xfs_inode_t     *ip)
 {
-       int             error = 0;
-       int             last_error = 0;
-       uint            log_flags = XFS_LOG_FORCE;
-       xfs_buf_t       *bp;
-       xfs_buf_log_item_t      *bip;
+       radix_tree_tag_clear(&pag->pag_ici_root,
+                       XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+}
 
-       /*
-        * Sync out the log.  This ensures that the log is periodically
-        * flushed even if there is not enough activity to fill it up.
-        */
-       if (flags & SYNC_WAIT)
-               log_flags |= XFS_LOG_SYNC;
+/*
+ * Clear the reclaim tag for an inode in its per-AG inode cache radix
+ * tree, taking the required locks around
+ * __xfs_inode_clear_reclaim_tag().
+ *
+ * Clearing a tag modifies the radix tree, and the radix tree API
+ * requires modifications to exclude other modifications as well as
+ * lookups.  The per-inode i_flags_lock cannot provide that across
+ * different inodes in the same AG, so pag_ici_lock is taken in write
+ * mode.
+ */
+void
+xfs_inode_clear_reclaim_tag(
+       xfs_inode_t     *ip)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       xfs_perag_t     *pag = xfs_get_perag(mp, ip->i_ino);
+
+       write_lock(&pag->pag_ici_lock);
+       spin_lock(&ip->i_flags_lock);
+       __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+       spin_unlock(&ip->i_flags_lock);
+       write_unlock(&pag->pag_ici_lock);
+       xfs_put_perag(mp, pag);
+}
 
-       xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
 
-       if (flags & (SYNC_ATTR|SYNC_DELWRI)) {
-               if (flags & SYNC_BDFLUSH)
-                       xfs_finish_reclaim_all(mp, 1);
-               else
-                       error = xfs_sync_inodes(mp, flags, bypassed);
-       }
+/*
+ * Reclaim every inode tagged XFS_ICI_RECLAIM_TAG in one allocation
+ * group's inode cache radix tree.
+ *
+ * @mp:      mount being processed
+ * @ag:      index into mp->m_perag of the AG to walk
+ * @noblock: if non-zero, only trylock the inode (ILOCK_EXCL and flush
+ *           lock) and pass over inodes that are pinned or whose locks
+ *           cannot be taken immediately
+ * @mode:    passed through to xfs_reclaim_inode() as its sync mode
+ *
+ * Tagged inodes are looked up one at a time under the read side of
+ * pag_ici_lock and handed to xfs_reclaim_inode().  Each non-zero return
+ * from xfs_reclaim_inode() is counted in 'skipped'; if any were counted
+ * once the walk ends, the function waits via delay(1) and restarts the
+ * walk from index 0.  Inodes passed over by the 'continue' statements
+ * are not counted.
+ */
+STATIC void
+xfs_reclaim_inodes_ag(
+       xfs_mount_t     *mp,
+       int             ag,
+       int             noblock,
+       int             mode)
+{
+       xfs_inode_t     *ip = NULL;
+       xfs_perag_t     *pag = &mp->m_perag[ag];
+       int             nr_found;
+       uint32_t        first_index;
+       int             skipped;
+
+restart:
+       first_index = 0;
+       skipped = 0;
+       do {
+               /*
+                * use a gang lookup to find the next inode in the tree
+                * as the tree is sparse and a gang lookup walks to find
+                * the number of objects requested.
+                */
+               read_lock(&pag->pag_ici_lock);
+               nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
+                                       (void**)&ip, first_index, 1,
+                                       XFS_ICI_RECLAIM_TAG);
 
-       /*
-        * Flushing out dirty data above probably generated more
-        * log activity, so if this isn't vfs_sync() then flush
-        * the log again.
-        */
-       if (flags & SYNC_DELWRI) {
-               xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
-       }
+               if (!nr_found) {
+                       read_unlock(&pag->pag_ici_lock);
+                       break;
+               }
 
-       if (flags & SYNC_FSDATA) {
                /*
-                * If this is vfs_sync() then only sync the superblock
-                * if we can lock it without sleeping and it is not pinned.
+                * Update the index for the next lookup. Catch overflows
+                * into the next AG range which can occur if we have inodes
+                * in the last block of the AG and we are currently
+                * pointing to the last inode.
                 */
-               if (flags & SYNC_BDFLUSH) {
-                       bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
-                       if (bp != NULL) {
-                               bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
-                               if ((bip != NULL) &&
-                                   xfs_buf_item_dirty(bip)) {
-                                       if (!(XFS_BUF_ISPINNED(bp))) {
-                                               XFS_BUF_ASYNC(bp);
-                                               error = xfs_bwrite(mp, bp);
-                                       } else {
-                                               xfs_buf_relse(bp);
-                                       }
-                               } else {
-                                       xfs_buf_relse(bp);
-                               }
-                       }
-               } else {
-                       bp = xfs_getsb(mp, 0);
-                       /*
-                        * If the buffer is pinned then push on the log so
-                        * we won't get stuck waiting in the write for
-                        * someone, maybe ourselves, to flush the log.
-                        * Even though we just pushed the log above, we
-                        * did not have the superblock buffer locked at
-                        * that point so it can become pinned in between
-                        * there and here.
-                        */
-                       if (XFS_BUF_ISPINNED(bp))
-                               xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
-                       if (flags & SYNC_WAIT)
-                               XFS_BUF_UNASYNC(bp);
-                       else
-                               XFS_BUF_ASYNC(bp);
-                       error = xfs_bwrite(mp, bp);
+               first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+               if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
+                       read_unlock(&pag->pag_ici_lock);
+                       break;
                }
-               if (error) {
-                       last_error = error;
+
+               /*
+                * Note: each 'continue' below jumps to the while (nr_found)
+                * test, which is non-zero at this point, so the walk simply
+                * moves on to the next lookup from the updated first_index.
+                */
+               /* ignore if already under reclaim */
+               if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
+                       read_unlock(&pag->pag_ici_lock);
+                       continue;
                }
-       }
 
-       /*
-        * Now check to see if the log needs a "dummy" transaction.
-        */
-       if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
-               xfs_trans_t *tp;
-               xfs_inode_t *ip;
+               if (noblock) {
+                       if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+                               read_unlock(&pag->pag_ici_lock);
+                               continue;
+                       }
+                       if (xfs_ipincount(ip) ||
+                           !xfs_iflock_nowait(ip)) {
+                               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                               read_unlock(&pag->pag_ici_lock);
+                               continue;
+                       }
+               }
+               read_unlock(&pag->pag_ici_lock);
 
                /*
-                * Put a dummy transaction in the log to tell
-                * recovery that all others are OK.
+                * hmmm - this is an inode already in reclaim. Do
+                * we even bother catching it here?
                 */
-               tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
-               if ((error = xfs_trans_reserve(tp, 0,
-                               XFS_ICHANGE_LOG_RES(mp),
-                               0, 0, 0)))  {
-                       xfs_trans_cancel(tp, 0);
-                       return error;
-               }
-
-               ip = mp->m_rootip;
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               /*
+                * 'noblock' doubles as the 'locked' argument: in noblock
+                * mode both inode locks were taken by the trylocks above.
+                */
+               if (xfs_reclaim_inode(ip, noblock, mode))
+                       skipped++;
+       } while (nr_found);
 
-               xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-               xfs_trans_ihold(tp, ip);
-               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-               error = xfs_trans_commit(tp, 0);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
+       /* something could not be reclaimed this pass: back off and retry */
+       if (skipped) {
+               delay(1);
+               goto restart;
        }
+       return;
 
-       /*
-        * When shutting down, we need to insure that the AIL is pushed
-        * to disk or the filesystem can appear corrupt from the PROM.
-        */
-       if ((flags & (SYNC_CLOSE|SYNC_WAIT)) == (SYNC_CLOSE|SYNC_WAIT)) {
-               XFS_bflush(mp->m_ddev_targp);
-               if (mp->m_rtdev_targp) {
-                       XFS_bflush(mp->m_rtdev_targp);
-               }
-       }
+}
 
-       return XFS_ERROR(last_error);
+/*
+ * Run inode reclaim over every allocation group of the mount.
+ *
+ * AGs whose per-AG structure has not had its inode cache initialised
+ * (pag_ici_init clear) are passed over.  'noblock' and 'mode' are handed
+ * unchanged to xfs_reclaim_inodes_ag().  Always returns 0.
+ */
+int
+xfs_reclaim_inodes(
+       xfs_mount_t     *mp,
+       int              noblock,
+       int             mode)
+{
+       int             agno;
+
+       for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+               if (mp->m_perag[agno].pag_ici_init)
+                       xfs_reclaim_inodes_ag(mp, agno, noblock, mode);
+       }
+
+       return 0;
 }
+
+