fs/xfs/xfs_vnodeops.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18
  19 #include "xfs.h"
  20 #include "xfs_fs.h"
  21 #include "xfs_types.h"
  22 #include "xfs_bit.h"
  23 #include "xfs_log.h"
  24 #include "xfs_inum.h"
  25 #include "xfs_trans.h"
  26 #include "xfs_sb.h"
  27 #include "xfs_ag.h"
  28 #include "xfs_dir2.h"
  29 #include "xfs_dmapi.h"
  30 #include "xfs_mount.h"
  31 #include "xfs_da_btree.h"
  32 #include "xfs_bmap_btree.h"
  33 #include "xfs_alloc_btree.h"
  34 #include "xfs_ialloc_btree.h"
  35 #include "xfs_dir2_sf.h"
  36 #include "xfs_attr_sf.h"
  37 #include "xfs_dinode.h"
  38 #include "xfs_inode.h"
  39 #include "xfs_inode_item.h"
  40 #include "xfs_itable.h"
  41 #include "xfs_btree.h"
  42 #include "xfs_ialloc.h"
  43 #include "xfs_alloc.h"
  44 #include "xfs_bmap.h"
  45 #include "xfs_attr.h"
  46 #include "xfs_rw.h"
  47 #include "xfs_error.h"
  48 #include "xfs_quota.h"
  49 #include "xfs_utils.h"
  50 #include "xfs_rtalloc.h"
  51 #include "xfs_refcache.h"
  52 #include "xfs_trans_space.h"
  53 #include "xfs_log_priv.h"
  54 #include "xfs_filestream.h"
  55
  56 STATIC int
  57 xfs_open(
  58         bhv_desc_t      *bdp,
  59         cred_t          *credp)
  60 {
  61         int             mode;
  62         bhv_vnode_t     *vp = BHV_TO_VNODE(bdp);
  63         xfs_inode_t     *ip = XFS_BHVTOI(bdp);
  64
  65         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
  66                 return XFS_ERROR(EIO);
  67
  68         /*
  69          * If it's a directory with any blocks, read-ahead block 0
  70          * as we're almost certain to have the next operation be a read there.
  71          */
  72         if (VN_ISDIR(vp) && ip->i_d.di_nextents > 0) {
  73                 mode = xfs_ilock_map_shared(ip);
  74                 if (ip->i_d.di_nextents > 0)
  75                         (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
  76                 xfs_iunlock(ip, mode);
  77         }
  78         return 0;
  79 }
  80
  81 /*
  82  * xfs_getattr
  83  */
  84 STATIC int
  85 xfs_getattr(
  86         bhv_desc_t      *bdp,
  87         bhv_vattr_t     *vap,
  88         int             flags,
  89         cred_t          *credp)
  90 {
  91         xfs_inode_t     *ip;
  92         xfs_mount_t     *mp;
  93         bhv_vnode_t     *vp;
  94
  95         vp  = BHV_TO_VNODE(bdp);
  96         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
  97
  98         ip = XFS_BHVTOI(bdp);
  99         mp = ip->i_mount;
 100
 101         if (XFS_FORCED_SHUTDOWN(mp))
 102                 return XFS_ERROR(EIO);
 103
 104         if (!(flags & ATTR_LAZY))
 105                 xfs_ilock(ip, XFS_ILOCK_SHARED);
 106
 107         vap->va_size = XFS_ISIZE(ip);
 108         if (vap->va_mask == XFS_AT_SIZE)
 109                 goto all_done;
 110
 111         vap->va_nblocks =
 112                 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
 113         vap->va_nodeid = ip->i_ino;
 114 #if XFS_BIG_INUMS
 115         vap->va_nodeid += mp->m_inoadd;
 116 #endif
 117         vap->va_nlink = ip->i_d.di_nlink;
 118
 119         /*
 120          * Quick exit for non-stat callers
 121          */
 122         if ((vap->va_mask &
 123             ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
 124               XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
 125                 goto all_done;
 126
 127         /*
 128          * Copy from in-core inode.
 129          */
 130         vap->va_mode = ip->i_d.di_mode;
 131         vap->va_uid = ip->i_d.di_uid;
 132         vap->va_gid = ip->i_d.di_gid;
 133         vap->va_projid = ip->i_d.di_projid;
 134
 135         /*
 136          * Check vnode type block/char vs. everything else.
 137          */
 138         switch (ip->i_d.di_mode & S_IFMT) {
 139         case S_IFBLK:
 140         case S_IFCHR:
 141                 vap->va_rdev = ip->i_df.if_u2.if_rdev;
 142                 vap->va_blocksize = BLKDEV_IOSIZE;
 143                 break;
 144         default:
 145                 vap->va_rdev = 0;
 146
 147                 if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
 148                         vap->va_blocksize = xfs_preferred_iosize(mp);
 149                 } else {
 150
 151                         /*
 152                          * If the file blocks are being allocated from a
 153                          * realtime partition, then return the inode's
 154                          * realtime extent size or the realtime volume's
 155                          * extent size.
 156                          */
 157                         vap->va_blocksize =
 158                                 xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
 159                 }
 160                 break;
 161         }
 162
 163         vn_atime_to_timespec(vp, &vap->va_atime);
 164         vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
 165         vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
 166         vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
 167         vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
 168
 169         /*
 170          * Exit for stat callers.  See if any of the rest of the fields
 171          * to be filled in are needed.
 172          */
 173         if ((vap->va_mask &
 174              (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
 175               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
 176                 goto all_done;
 177
 178         /*
 179          * Convert di_flags to xflags.
 180          */
 181         vap->va_xflags = xfs_ip2xflags(ip);
 182
 183         /*
 184          * Exit for inode revalidate.  See if any of the rest of
 185          * the fields to be filled in are needed.
 186          */
 187         if ((vap->va_mask &
 188              (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
 189               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
 190                 goto all_done;
 191
 192         vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
 193         vap->va_nextents =
 194                 (ip->i_df.if_flags & XFS_IFEXTENTS) ?
 195                         ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
 196                         ip->i_d.di_nextents;
 197         if (ip->i_afp)
 198                 vap->va_anextents =
 199                         (ip->i_afp->if_flags & XFS_IFEXTENTS) ?
 200                                 ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
 201                                  ip->i_d.di_anextents;
 202         else
 203                 vap->va_anextents = 0;
 204         vap->va_gen = ip->i_d.di_gen;
 205
 206  all_done:
 207         if (!(flags & ATTR_LAZY))
 208                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
 209         return 0;
 210 }
 211
 212
 213 /*
 214  * xfs_setattr
 215  */
 216 int
 217 xfs_setattr(
 218         bhv_desc_t              *bdp,
 219         bhv_vattr_t             *vap,
 220         int                     flags,
 221         cred_t                  *credp)
 222 {
 223         xfs_inode_t             *ip;
 224         xfs_trans_t             *tp;
 225         xfs_mount_t             *mp;
 226         int                     mask;
 227         int                     code;
 228         uint                    lock_flags;
 229         uint                    commit_flags=0;
 230         uid_t                   uid=0, iuid=0;
 231         gid_t                   gid=0, igid=0;
 232         int                     timeflags = 0;
 233         bhv_vnode_t             *vp;
 234         xfs_prid_t              projid=0, iprojid=0;
 235         int                     mandlock_before, mandlock_after;
 236         struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
 237         int                     file_owner;
 238         int                     need_iolock = 1;
 239
 240         vp = BHV_TO_VNODE(bdp);
 241         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 242
 243         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
 244                 return XFS_ERROR(EROFS);
 245
 246         /*
 247          * Cannot set certain attributes.
 248          */
 249         mask = vap->va_mask;
 250         if (mask & XFS_AT_NOSET) {
 251                 return XFS_ERROR(EINVAL);
 252         }
 253
 254         ip = XFS_BHVTOI(bdp);
 255         mp = ip->i_mount;
 256
 257         if (XFS_FORCED_SHUTDOWN(mp))
 258                 return XFS_ERROR(EIO);
 259
 260         /*
 261          * Timestamps do not need to be logged and hence do not
 262          * need to be done within a transaction.
 263          */
 264         if (mask & XFS_AT_UPDTIMES) {
 265                 ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
 266                 timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
 267                             ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
 268                             ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
 269                 xfs_ichgtime(ip, timeflags);
 270                 return 0;
 271         }
 272
 273         olddquot1 = olddquot2 = NULL;
 274         udqp = gdqp = NULL;
 275
 276         /*
 277          * If disk quotas is on, we make sure that the dquots do exist on disk,
 278          * before we start any other transactions. Trying to do this later
 279          * is messy. We don't care to take a readlock to look at the ids
 280          * in inode here, because we can't hold it across the trans_reserve.
 281          * If the IDs do change before we take the ilock, we're covered
 282          * because the i_*dquot fields will get updated anyway.
 283          */
 284         if (XFS_IS_QUOTA_ON(mp) &&
 285             (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
 286                 uint    qflags = 0;
 287
 288                 if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
 289                         uid = vap->va_uid;
 290                         qflags |= XFS_QMOPT_UQUOTA;
 291                 } else {
 292                         uid = ip->i_d.di_uid;
 293                 }
 294                 if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
 295                         gid = vap->va_gid;
 296                         qflags |= XFS_QMOPT_GQUOTA;
 297                 }  else {
 298                         gid = ip->i_d.di_gid;
 299                 }
 300                 if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
 301                         projid = vap->va_projid;
 302                         qflags |= XFS_QMOPT_PQUOTA;
 303                 }  else {
 304                         projid = ip->i_d.di_projid;
 305                 }
 306                 /*
 307                  * We take a reference when we initialize udqp and gdqp,
 308                  * so it is important that we never blindly double trip on
 309                  * the same variable. See xfs_create() for an example.
 310                  */
 311                 ASSERT(udqp == NULL);
 312                 ASSERT(gdqp == NULL);
 313                 code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
 314                                          &udqp, &gdqp);
 315                 if (code)
 316                         return code;
 317         }
 318
 319         /*
 320          * For the other attributes, we acquire the inode lock and
 321          * first do an error checking pass.
 322          */
 323         tp = NULL;
 324         lock_flags = XFS_ILOCK_EXCL;
 325         if (flags & ATTR_NOLOCK)
 326                 need_iolock = 0;
 327         if (!(mask & XFS_AT_SIZE)) {
 328                 if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
 329                     (mp->m_flags & XFS_MOUNT_WSYNC)) {
 330                         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
 331                         commit_flags = 0;
 332                         if ((code = xfs_trans_reserve(tp, 0,
 333                                                      XFS_ICHANGE_LOG_RES(mp), 0,
 334                                                      0, 0))) {
 335                                 lock_flags = 0;
 336                                 goto error_return;
 337                         }
 338                 }
 339         } else {
 340                 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
 341                     !(flags & ATTR_DMI)) {
 342                         int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
 343                         code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
 344                                 vap->va_size, 0, dmflags, NULL);
 345                         if (code) {
 346                                 lock_flags = 0;
 347                                 goto error_return;
 348                         }
 349                 }
 350                 if (need_iolock)
 351                         lock_flags |= XFS_IOLOCK_EXCL;
 352         }
 353
 354         xfs_ilock(ip, lock_flags);
 355
 356         /* boolean: are we the file owner? */
 357         file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
 358
 359         /*
 360          * Change various properties of a file.
 361          * Only the owner or users with CAP_FOWNER
 362          * capability may do these things.
 363          */
 364         if (mask &
 365             (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
 366              XFS_AT_GID|XFS_AT_PROJID)) {
 367                 /*
 368                  * CAP_FOWNER overrides the following restrictions:
 369                  *
 370                  * The user ID of the calling process must be equal
 371                  * to the file owner ID, except in cases where the
 372                  * CAP_FSETID capability is applicable.
 373                  */
 374                 if (!file_owner && !capable(CAP_FOWNER)) {
 375                         code = XFS_ERROR(EPERM);
 376                         goto error_return;
 377                 }
 378
 379                 /*
 380                  * CAP_FSETID overrides the following restrictions:
 381                  *
 382                  * The effective user ID of the calling process shall match
 383                  * the file owner when setting the set-user-ID and
 384                  * set-group-ID bits on that file.
 385                  *
 386                  * The effective group ID or one of the supplementary group
 387                  * IDs of the calling process shall match the group owner of
 388                  * the file when setting the set-group-ID bit on that file
 389                  */
 390                 if (mask & XFS_AT_MODE) {
 391                         mode_t m = 0;
 392
 393                         if ((vap->va_mode & S_ISUID) && !file_owner)
 394                                 m |= S_ISUID;
 395                         if ((vap->va_mode & S_ISGID) &&
 396                             !in_group_p((gid_t)ip->i_d.di_gid))
 397                                 m |= S_ISGID;
 398 #if 0
 399                         /* Linux allows this, Irix doesn't. */
 400                         if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp))
 401                                 m |= S_ISVTX;
 402 #endif
 403                         if (m && !capable(CAP_FSETID))
 404                                 vap->va_mode &= ~m;
 405                 }
 406         }
 407
 408         /*
 409          * Change file ownership.  Must be the owner or privileged.
 410          * If the system was configured with the "restricted_chown"
 411          * option, the owner is not permitted to give away the file,
 412          * and can change the group id only to a group of which he
 413          * or she is a member.
 414          */
 415         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
 416                 /*
 417                  * These IDs could have changed since we last looked at them.
 418                  * But, we're assured that if the ownership did change
 419                  * while we didn't have the inode locked, inode's dquot(s)
 420                  * would have changed also.
 421                  */
 422                 iuid = ip->i_d.di_uid;
 423                 iprojid = ip->i_d.di_projid;
 424                 igid = ip->i_d.di_gid;
 425                 gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
 426                 uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
 427                 projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
 428                          iprojid;
 429
 430                 /*
 431                  * CAP_CHOWN overrides the following restrictions:
 432                  *
 433                  * If _POSIX_CHOWN_RESTRICTED is defined, this capability
 434                  * shall override the restriction that a process cannot
 435                  * change the user ID of a file it owns and the restriction
 436                  * that the group ID supplied to the chown() function
 437                  * shall be equal to either the group ID or one of the
 438                  * supplementary group IDs of the calling process.
 439                  */
 440                 if (restricted_chown &&
 441                     (iuid != uid || (igid != gid &&
 442                                      !in_group_p((gid_t)gid))) &&
 443                     !capable(CAP_CHOWN)) {
 444                         code = XFS_ERROR(EPERM);
 445                         goto error_return;
 446                 }
 447                 /*
 448                  * Do a quota reservation only if uid/projid/gid is actually
 449                  * going to change.
 450                  */
 451                 if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
 452                     (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
 453                     (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
 454                         ASSERT(tp);
 455                         code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
 456                                                 capable(CAP_FOWNER) ?
 457                                                 XFS_QMOPT_FORCE_RES : 0);
 458                         if (code)       /* out of quota */
 459                                 goto error_return;
 460                 }
 461         }
 462
 463         /*
 464          * Truncate file.  Must have write permission and not be a directory.
 465          */
 466         if (mask & XFS_AT_SIZE) {
 467                 /* Short circuit the truncate case for zero length files */
 468                 if ((vap->va_size == 0) &&
 469                    (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) {
 470                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 471                         lock_flags &= ~XFS_ILOCK_EXCL;
 472                         if (mask & XFS_AT_CTIME)
 473                                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 474                         code = 0;
 475                         goto error_return;
 476                 }
 477
 478                 if (VN_ISDIR(vp)) {
 479                         code = XFS_ERROR(EISDIR);
 480                         goto error_return;
 481                 } else if (!VN_ISREG(vp)) {
 482                         code = XFS_ERROR(EINVAL);
 483                         goto error_return;
 484                 }
 485                 /*
 486                  * Make sure that the dquots are attached to the inode.
 487                  */
 488                 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
 489                         goto error_return;
 490         }
 491
 492         /*
 493          * Change file access or modified times.
 494          */
 495         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 496                 if (!file_owner) {
 497                         if ((flags & ATTR_UTIME) &&
 498                             !capable(CAP_FOWNER)) {
 499                                 code = XFS_ERROR(EPERM);
 500                                 goto error_return;
 501                         }
 502                 }
 503         }
 504
 505         /*
 506          * Change extent size or realtime flag.
 507          */
 508         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 509                 /*
 510                  * Can't change extent size if any extents are allocated.
 511                  */
 512                 if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
 513                     ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
 514                      vap->va_extsize) ) {
 515                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
 516                         goto error_return;
 517                 }
 518
 519                 /*
 520                  * Can't change realtime flag if any extents are allocated.
 521                  */
 522                 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
 523                     (mask & XFS_AT_XFLAGS) &&
 524                     (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
 525                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 526                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
 527                         goto error_return;
 528                 }
 529                 /*
 530                  * Extent size must be a multiple of the appropriate block
 531                  * size, if set at all.
 532                  */
 533                 if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
 534                         xfs_extlen_t    size;
 535
 536                         if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
 537                             ((mask & XFS_AT_XFLAGS) &&
 538                             (vap->va_xflags & XFS_XFLAG_REALTIME))) {
 539                                 size = mp->m_sb.sb_rextsize <<
 540                                        mp->m_sb.sb_blocklog;
 541                         } else {
 542                                 size = mp->m_sb.sb_blocksize;
 543                         }
 544                         if (vap->va_extsize % size) {
 545                                 code = XFS_ERROR(EINVAL);
 546                                 goto error_return;
 547                         }
 548                 }
 549                 /*
 550                  * If realtime flag is set then must have realtime data.
 551                  */
 552                 if ((mask & XFS_AT_XFLAGS) &&
 553                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 554                         if ((mp->m_sb.sb_rblocks == 0) ||
 555                             (mp->m_sb.sb_rextsize == 0) ||
 556                             (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
 557                                 code = XFS_ERROR(EINVAL);
 558                                 goto error_return;
 559                         }
 560                 }
 561
 562                 /*
 563                  * Can't modify an immutable/append-only file unless
 564                  * we have appropriate permission.
 565                  */
 566                 if ((mask & XFS_AT_XFLAGS) &&
 567                     (ip->i_d.di_flags &
 568                                 (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
 569                      (vap->va_xflags &
 570                                 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
 571                     !capable(CAP_LINUX_IMMUTABLE)) {
 572                         code = XFS_ERROR(EPERM);
 573                         goto error_return;
 574                 }
 575         }
 576
 577         /*
 578          * Now we can make the changes.  Before we join the inode
 579          * to the transaction, if XFS_AT_SIZE is set then take care of
 580          * the part of the truncation that must be done without the
 581          * inode lock.  This needs to be done before joining the inode
 582          * to the transaction, because the inode cannot be unlocked
 583          * once it is a part of the transaction.
 584          */
 585         if (mask & XFS_AT_SIZE) {
 586                 code = 0;
 587                 if ((vap->va_size > ip->i_size) &&
 588                     (flags & ATTR_NOSIZETOK) == 0) {
 589                         code = xfs_igrow_start(ip, vap->va_size, credp);
 590                 }
 591                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 592
 593                 /*
 594                  * We are going to log the inode size change in this
 595                  * transaction so any previous writes that are beyond the on
 596                  * disk EOF and the new EOF that have not been written out need
 597                  * to be written here. If we do not write the data out, we
 598                  * expose ourselves to the null files problem.
 599                  *
 600                  * Only flush from the on disk size to the smaller of the in
 601                  * memory file size or the new size as that's the range we
 602                  * really care about here and prevents waiting for other data
 603                  * not within the range we care about here.
 604                  */
 605                 if (!code &&
 606                     (ip->i_size != ip->i_d.di_size) &&
 607                     (vap->va_size > ip->i_d.di_size)) {
 608                         code = bhv_vop_flush_pages(XFS_ITOV(ip),
 609                                         ip->i_d.di_size, vap->va_size,
 610                                         XFS_B_ASYNC, FI_NONE);
 611                 }
 612
 613                 /* wait for all I/O to complete */
 614                 vn_iowait(vp);
 615
 616                 if (!code)
 617                         code = xfs_itruncate_data(ip, vap->va_size);
 618                 if (code) {
 619                         ASSERT(tp == NULL);
 620                         lock_flags &= ~XFS_ILOCK_EXCL;
 621                         ASSERT(lock_flags == XFS_IOLOCK_EXCL);
 622                         goto error_return;
 623                 }
 624                 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 625                 if ((code = xfs_trans_reserve(tp, 0,
 626                                              XFS_ITRUNCATE_LOG_RES(mp), 0,
 627                                              XFS_TRANS_PERM_LOG_RES,
 628                                              XFS_ITRUNCATE_LOG_COUNT))) {
 629                         xfs_trans_cancel(tp, 0);
 630                         if (need_iolock)
 631                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 632                         return code;
 633                 }
 634                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
 635                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 636         }
 637
 638         if (tp) {
 639                 xfs_trans_ijoin(tp, ip, lock_flags);
 640                 xfs_trans_ihold(tp, ip);
 641         }
 642
 643         /* determine whether mandatory locking mode changes */
 644         mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);
 645
 646         /*
 647          * Truncate file.  Must have write permission and not be a directory.
 648          */
 649         if (mask & XFS_AT_SIZE) {
 650                 if (vap->va_size > ip->i_size) {
 651                         xfs_igrow_finish(tp, ip, vap->va_size,
 652                             !(flags & ATTR_DMI));
 653                 } else if ((vap->va_size <= ip->i_size) ||
 654                            ((vap->va_size == 0) && ip->i_d.di_nextents)) {
 655                         /*
 656                          * signal a sync transaction unless
 657                          * we're truncating an already unlinked
 658                          * file on a wsync filesystem
 659                          */
 660                         code = xfs_itruncate_finish(&tp, ip,
 661                                             (xfs_fsize_t)vap->va_size,
 662                                             XFS_DATA_FORK,
 663                                             ((ip->i_d.di_nlink != 0 ||
 664                                               !(mp->m_flags & XFS_MOUNT_WSYNC))
 665                                              ? 1 : 0));
 666                         if (code)
 667                                 goto abort_return;
 668                         /*
 669                          * Truncated "down", so we're removing references
 670                          * to old data here - if we now delay flushing for
 671                          * a long time, we expose ourselves unduly to the
 672                          * notorious NULL files problem.  So, we mark this
 673                          * vnode and flush it when the file is closed, and
 674                          * do not wait the usual (long) time for writeout.
 675                          */
 676                         VTRUNCATE(vp);
 677                 }
 678                 /*
 679                  * Have to do this even if the file's size doesn't change.
 680                  */
 681                 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
 682         }
 683
 684         /*
 685          * Change file access modes.
 686          */
 687         if (mask & XFS_AT_MODE) {
 688                 ip->i_d.di_mode &= S_IFMT;
 689                 ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
 690
 691                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 692                 timeflags |= XFS_ICHGTIME_CHG;
 693         }
 694
 695         /*
 696          * Change file ownership.  Must be the owner or privileged.
 697          * If the system was configured with the "restricted_chown"
 698          * option, the owner is not permitted to give away the file,
 699          * and can change the group id only to a group of which he
 700          * or she is a member.
 701          */
 702         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
 703                 /*
 704                  * CAP_FSETID overrides the following restrictions:
 705                  *
 706                  * The set-user-ID and set-group-ID bits of a file will be
 707                  * cleared upon successful return from chown()
 708                  */
 709                 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
 710                     !capable(CAP_FSETID)) {
 711                         ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
 712                 }
 713
 714                 /*
 715                  * Change the ownerships and register quota modifications
 716                  * in the transaction.
 717                  */
 718                 if (iuid != uid) {
 719                         if (XFS_IS_UQUOTA_ON(mp)) {
 720                                 ASSERT(mask & XFS_AT_UID);
 721                                 ASSERT(udqp);
 722                                 olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 723                                                         &ip->i_udquot, udqp);
 724                         }
 725                         ip->i_d.di_uid = uid;
 726                 }
 727                 if (igid != gid) {
 728                         if (XFS_IS_GQUOTA_ON(mp)) {
 729                                 ASSERT(!XFS_IS_PQUOTA_ON(mp));
 730                                 ASSERT(mask & XFS_AT_GID);
 731                                 ASSERT(gdqp);
 732                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 733                                                         &ip->i_gdquot, gdqp);
 734                         }
 735                         ip->i_d.di_gid = gid;
 736                 }
 737                 if (iprojid != projid) {
 738                         if (XFS_IS_PQUOTA_ON(mp)) {
 739                                 ASSERT(!XFS_IS_GQUOTA_ON(mp));
 740                                 ASSERT(mask & XFS_AT_PROJID);
 741                                 ASSERT(gdqp);
 742                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 743                                                         &ip->i_gdquot, gdqp);
 744                         }
 745                         ip->i_d.di_projid = projid;
 746                         /*
 747                          * We may have to rev the inode as well as
 748                          * the superblock version number since projids didn't
 749                          * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
 750                          */
 751                         if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
 752                                 xfs_bump_ino_vers2(tp, ip);
 753                 }
 754
 755                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 756                 timeflags |= XFS_ICHGTIME_CHG;
 757         }
 758
 759
 760         /*
 761          * Change file access or modified times.
 762          */
 763         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 764                 if (mask & XFS_AT_ATIME) {
 765                         ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
 766                         ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
 767                         ip->i_update_core = 1;
 768                         timeflags &= ~XFS_ICHGTIME_ACC;
 769                 }
 770                 if (mask & XFS_AT_MTIME) {
 771                         ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
 772                         ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
 773                         timeflags &= ~XFS_ICHGTIME_MOD;
 774                         timeflags |= XFS_ICHGTIME_CHG;
 775                 }
 776                 if (tp && (flags & ATTR_UTIME))
 777                         xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 778         }
 779
 780         /*
 781          * Change XFS-added attributes.
 782          */
 783         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 784                 if (mask & XFS_AT_EXTSIZE) {
 785                         /*
 786                          * Converting bytes to fs blocks.
 787                          */
 788                         ip->i_d.di_extsize = vap->va_extsize >>
 789                                 mp->m_sb.sb_blocklog;
 790                 }
 791                 if (mask & XFS_AT_XFLAGS) {
 792                         uint    di_flags;
 793
 794                         /* can't set PREALLOC this way, just preserve it */
 795                         di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
 796                         if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
 797                                 di_flags |= XFS_DIFLAG_IMMUTABLE;
 798                         if (vap->va_xflags & XFS_XFLAG_APPEND)
 799                                 di_flags |= XFS_DIFLAG_APPEND;
 800                         if (vap->va_xflags & XFS_XFLAG_SYNC)
 801                                 di_flags |= XFS_DIFLAG_SYNC;
 802                         if (vap->va_xflags & XFS_XFLAG_NOATIME)
 803                                 di_flags |= XFS_DIFLAG_NOATIME;
 804                         if (vap->va_xflags & XFS_XFLAG_NODUMP)
 805                                 di_flags |= XFS_DIFLAG_NODUMP;
 806                         if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
 807                                 di_flags |= XFS_DIFLAG_PROJINHERIT;
 808                         if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
 809                                 di_flags |= XFS_DIFLAG_NODEFRAG;
 810                         if (vap->va_xflags & XFS_XFLAG_FILESTREAM)
 811                                 di_flags |= XFS_DIFLAG_FILESTREAM;
 812                         if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
 813                                 if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
 814                                         di_flags |= XFS_DIFLAG_RTINHERIT;
 815                                 if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
 816                                         di_flags |= XFS_DIFLAG_NOSYMLINKS;
 817                                 if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
 818                                         di_flags |= XFS_DIFLAG_EXTSZINHERIT;
 819                         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
 820                                 if (vap->va_xflags & XFS_XFLAG_REALTIME) {
 821                                         di_flags |= XFS_DIFLAG_REALTIME;
 822                                         ip->i_iocore.io_flags |= XFS_IOCORE_RT;
 823                                 } else {
 824                                         ip->i_iocore.io_flags &= ~XFS_IOCORE_RT;
 825                                 }
 826                                 if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
 827                                         di_flags |= XFS_DIFLAG_EXTSIZE;
 828                         }
 829                         ip->i_d.di_flags = di_flags;
 830                 }
 831                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 832                 timeflags |= XFS_ICHGTIME_CHG;
 833         }
 834
 835         /*
 836          * Change file inode change time only if XFS_AT_CTIME set
 837          * AND we have been called by a DMI function.
 838          */
 839
 840         if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
 841                 ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
 842                 ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
 843                 ip->i_update_core = 1;
 844                 timeflags &= ~XFS_ICHGTIME_CHG;
 845         }
 846
 847         /*
 848          * Send out timestamp changes that need to be set to the
 849          * current time.  Not done when called by a DMI function.
 850          */
 851         if (timeflags && !(flags & ATTR_DMI))
 852                 xfs_ichgtime(ip, timeflags);
 853
 854         XFS_STATS_INC(xs_ig_attrchg);
 855
 856         /*
 857          * If this is a synchronous mount, make sure that the
 858          * transaction goes to disk before returning to the user.
 859          * This is slightly sub-optimal in that truncates require
 860          * two sync transactions instead of one for wsync filesystems.
 861          * One for the truncate and one for the timestamps since we
 862          * don't want to change the timestamps unless we're sure the
 863          * truncate worked.  Truncates are less than 1% of the laddis
 864          * mix so this probably isn't worth the trouble to optimize.
 865          */
 866         code = 0;
 867         if (tp) {
 868                 if (mp->m_flags & XFS_MOUNT_WSYNC)
 869                         xfs_trans_set_sync(tp);
 870
 871                 code = xfs_trans_commit(tp, commit_flags);
 872         }
 873
 874         /*
 875          * If the (regular) file's mandatory locking mode changed, then
 876          * notify the vnode.  We do this under the inode lock to prevent
 877          * racing calls to vop_vnode_change.
 878          */
 879         mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
 880         if (mandlock_before != mandlock_after) {
 881                 bhv_vop_vnode_change(vp, VCHANGE_FLAGS_ENF_LOCKING,
 882                                  mandlock_after);
 883         }
 884
 885         xfs_iunlock(ip, lock_flags);
 886
 887         /*
 888          * Release any dquot(s) the inode had kept before chown.
 889          */
 890         XFS_QM_DQRELE(mp, olddquot1);
 891         XFS_QM_DQRELE(mp, olddquot2);
 892         XFS_QM_DQRELE(mp, udqp);
 893         XFS_QM_DQRELE(mp, gdqp);
 894
 895         if (code) {
 896                 return code;
 897         }
 898
 899         if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
 900             !(flags & ATTR_DMI)) {
 901                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
 902                                         NULL, DM_RIGHT_NULL, NULL, NULL,
 903                                         0, 0, AT_DELAY_FLAG(flags));
 904         }
 905         return 0;
 906
 907  abort_return:
 908         commit_flags |= XFS_TRANS_ABORT;
 909         /* FALLTHROUGH */
 910  error_return:
 911         XFS_QM_DQRELE(mp, udqp);
 912         XFS_QM_DQRELE(mp, gdqp);
 913         if (tp) {
 914                 xfs_trans_cancel(tp, commit_flags);
 915         }
 916         if (lock_flags != 0) {
 917                 xfs_iunlock(ip, lock_flags);
 918         }
 919         return code;
 920 }
 921
 922
 923 /*
 924  * xfs_access
 925  * Null conversion from vnode mode bits to inode mode bits, as in efs.
 926  */
 927 STATIC int
 928 xfs_access(
 929         bhv_desc_t      *bdp,
 930         int             mode,
 931         cred_t          *credp)
 932 {
 933         xfs_inode_t     *ip;
 934         int             error;
 935
 936         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
 937                                                (inst_t *)__return_address);
 938
 939         ip = XFS_BHVTOI(bdp);
 940         xfs_ilock(ip, XFS_ILOCK_SHARED);
 941         error = xfs_iaccess(ip, mode, credp);
 942         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 943         return error;
 944 }
 945
 946
 947 /*
 948  * The maximum pathlen is 1024 bytes. Since the minimum file system
 949  * blocksize is 512 bytes, we can get a max of 2 extents back from
 950  * bmapi.
 951  */
 952 #define SYMLINK_MAPS 2
 953
 954 /*
 955  * xfs_readlink
 956  *
 957  */
 958 STATIC int
 959 xfs_readlink(
 960         bhv_desc_t      *bdp,
 961         uio_t           *uiop,
 962         int             ioflags,
 963         cred_t          *credp)
 964 {
 965         xfs_inode_t     *ip;
 966         int             count;
 967         xfs_off_t       offset;
 968         int             pathlen;
 969         bhv_vnode_t     *vp;
 970         int             error = 0;
 971         xfs_mount_t     *mp;
 972         int             nmaps;
 973         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
 974         xfs_daddr_t     d;
 975         int             byte_cnt;
 976         int             n;
 977         xfs_buf_t       *bp;
 978
 979         vp = BHV_TO_VNODE(bdp);
 980         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 981
 982         ip = XFS_BHVTOI(bdp);
 983         mp = ip->i_mount;
 984
 985         if (XFS_FORCED_SHUTDOWN(mp))
 986                 return XFS_ERROR(EIO);
 987
 988         xfs_ilock(ip, XFS_ILOCK_SHARED);
 989
 990         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
 991
 992         offset = uiop->uio_offset;
 993         count = uiop->uio_resid;
 994
 995         if (offset < 0) {
 996                 error = XFS_ERROR(EINVAL);
 997                 goto error_return;
 998         }
 999         if (count <= 0) {
1000                 error = 0;
1001                 goto error_return;
1002         }
1003
1004         /*
1005          * See if the symlink is stored inline.
1006          */
1007         pathlen = (int)ip->i_d.di_size;
1008
1009         if (ip->i_df.if_flags & XFS_IFINLINE) {
1010                 error = xfs_uio_read(ip->i_df.if_u1.if_data, pathlen, uiop);
1011         }
1012         else {
1013                 /*
1014                  * Symlink not inline.  Call bmap to get it in.
1015                  */
1016                 nmaps = SYMLINK_MAPS;
1017
1018                 error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
1019                                   0, NULL, 0, mval, &nmaps, NULL, NULL);
1020
1021                 if (error) {
1022                         goto error_return;
1023                 }
1024
1025                 for (n = 0; n < nmaps; n++) {
1026                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
1027                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
1028                         bp = xfs_buf_read(mp->m_ddev_targp, d,
1029                                       BTOBB(byte_cnt), 0);
1030                         error = XFS_BUF_GETERROR(bp);
1031                         if (error) {
1032                                 xfs_ioerror_alert("xfs_readlink",
1033                                           ip->i_mount, bp, XFS_BUF_ADDR(bp));
1034                                 xfs_buf_relse(bp);
1035                                 goto error_return;
1036                         }
1037                         if (pathlen < byte_cnt)
1038                                 byte_cnt = pathlen;
1039                         pathlen -= byte_cnt;
1040
1041                         error = xfs_uio_read(XFS_BUF_PTR(bp), byte_cnt, uiop);
1042                         xfs_buf_relse (bp);
1043                 }
1044
1045         }
1046
1047 error_return:
1048         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1049         return error;
1050 }
1051
1052
1053 /*
1054  * xfs_fsync
1055  *
1056  * This is called to sync the inode and its data out to disk.
1057  * We need to hold the I/O lock while flushing the data, and
1058  * the inode lock while flushing the inode.  The inode lock CANNOT
1059  * be held while flushing the data, so acquire after we're done
1060  * with that.
1061  */
1062 STATIC int
1063 xfs_fsync(
1064         bhv_desc_t      *bdp,
1065         int             flag,
1066         cred_t          *credp,
1067         xfs_off_t       start,
1068         xfs_off_t       stop)
1069 {
1070         xfs_inode_t     *ip;
1071         xfs_trans_t     *tp;
1072         int             error;
1073         int             log_flushed = 0, changed = 1;
1074
1075         vn_trace_entry(BHV_TO_VNODE(bdp),
1076                         __FUNCTION__, (inst_t *)__return_address);
1077
1078         ip = XFS_BHVTOI(bdp);
1079
1080         ASSERT(start >= 0 && stop >= -1);
1081
1082         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1083                 return XFS_ERROR(EIO);
1084
1085         if (flag & FSYNC_DATA)
1086                 filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
1087
1088         /*
1089          * We always need to make sure that the required inode state
1090          * is safe on disk.  The vnode might be clean but because
1091          * of committed transactions that haven't hit the disk yet.
1092          * Likewise, there could be unflushed non-transactional
1093          * changes to the inode core that have to go to disk.
1094          *
1095          * The following code depends on one assumption:  that
1096          * any transaction that changes an inode logs the core
1097          * because it has to change some field in the inode core
1098          * (typically nextents or nblocks).  That assumption
1099          * implies that any transactions against an inode will
1100          * catch any non-transactional updates.  If inode-altering
1101          * transactions exist that violate this assumption, the
1102          * code breaks.  Right now, it figures that if the involved
1103          * update_* field is clear and the inode is unpinned, the
1104          * inode is clean.  Either it's been flushed or it's been
1105          * committed and the commit has hit the disk unpinning the inode.
1106          * (Note that xfs_inode_item_format() called at commit clears
1107          * the update_* fields.)
1108          */
1109         xfs_ilock(ip, XFS_ILOCK_SHARED);
1110
1111         /* If we are flushing data then we care about update_size
1112          * being set, otherwise we care about update_core
1113          */
1114         if ((flag & FSYNC_DATA) ?
1115                         (ip->i_update_size == 0) :
1116                         (ip->i_update_core == 0)) {
1117                 /*
1118                  * Timestamps/size haven't changed since last inode
1119                  * flush or inode transaction commit.  That means
1120                  * either nothing got written or a transaction
1121                  * committed which caught the updates.  If the
1122                  * latter happened and the transaction hasn't
1123                  * hit the disk yet, the inode will be still
1124                  * be pinned.  If it is, force the log.
1125                  */
1126
1127                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1128
1129                 if (xfs_ipincount(ip)) {
1130                         _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
1131                                       XFS_LOG_FORCE |
1132                                       ((flag & FSYNC_WAIT)
1133                                        ? XFS_LOG_SYNC : 0),
1134                                       &log_flushed);
1135                 } else {
1136                         /*
1137                          * If the inode is not pinned and nothing
1138                          * has changed we don't need to flush the
1139                          * cache.
1140                          */
1141                         changed = 0;
1142                 }
1143                 error = 0;
1144         } else  {
1145                 /*
1146                  * Kick off a transaction to log the inode
1147                  * core to get the updates.  Make it
1148                  * sync if FSYNC_WAIT is passed in (which
1149                  * is done by everybody but specfs).  The
1150                  * sync transaction will also force the log.
1151                  */
1152                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1153                 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
1154                 if ((error = xfs_trans_reserve(tp, 0,
1155                                 XFS_FSYNC_TS_LOG_RES(ip->i_mount),
1156                                 0, 0, 0)))  {
1157                         xfs_trans_cancel(tp, 0);
1158                         return error;
1159                 }
1160                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1161
1162                 /*
1163                  * Note - it's possible that we might have pushed
1164                  * ourselves out of the way during trans_reserve
1165                  * which would flush the inode.  But there's no
1166                  * guarantee that the inode buffer has actually
1167                  * gone out yet (it's delwri).  Plus the buffer
1168                  * could be pinned anyway if it's part of an
1169                  * inode in another recent transaction.  So we
1170                  * play it safe and fire off the transaction anyway.
1171                  */
1172                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1173                 xfs_trans_ihold(tp, ip);
1174                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1175                 if (flag & FSYNC_WAIT)
1176                         xfs_trans_set_sync(tp);
1177                 error = _xfs_trans_commit(tp, 0, &log_flushed);
1178
1179                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1180         }
1181
1182         if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
1183                 /*
1184                  * If the log write didn't issue an ordered tag we need
1185                  * to flush the disk cache for the data device now.
1186                  */
1187                 if (!log_flushed)
1188                         xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
1189
1190                 /*
1191                  * If this inode is on the RT dev we need to flush that
1192                  * cache as well.
1193                  */
1194                 if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
1195                         xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
1196         }
1197
1198         return error;
1199 }
1200
1201 /*
1202  * This is called by xfs_inactive to free any blocks beyond eof
1203  * when the link count isn't zero and by xfs_dm_punch_hole() when
1204  * punching a hole to EOF.
1205  */
1206 int
1207 xfs_free_eofblocks(
1208         xfs_mount_t     *mp,
1209         xfs_inode_t     *ip,
1210         int             flags)
1211 {
1212         xfs_trans_t     *tp;
1213         int             error;
1214         xfs_fileoff_t   end_fsb;
1215         xfs_fileoff_t   last_fsb;
1216         xfs_filblks_t   map_len;
1217         int             nimaps;
1218         xfs_bmbt_irec_t imap;
1219         int             use_iolock = (flags & XFS_FREE_EOF_LOCK);
1220
1221         /*
1222          * Figure out if there are any blocks beyond the end
1223          * of the file.  If not, then there is nothing to do.
1224          */
1225         end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
1226         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1227         map_len = last_fsb - end_fsb;
1228         if (map_len <= 0)
1229                 return 0;
1230
1231         nimaps = 1;
1232         xfs_ilock(ip, XFS_ILOCK_SHARED);
1233         error = XFS_BMAPI(mp, NULL, &ip->i_iocore, end_fsb, map_len, 0,
1234                           NULL, 0, &imap, &nimaps, NULL, NULL);
1235         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1236
1237         if (!error && (nimaps != 0) &&
1238             (imap.br_startblock != HOLESTARTBLOCK ||
1239              ip->i_delayed_blks)) {
1240                 /*
1241                  * Attach the dquots to the inode up front.
1242                  */
1243                 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1244                         return error;
1245
1246                 /*
1247                  * There are blocks after the end of file.
1248                  * Free them up now by truncating the file to
1249                  * its current size.
1250                  */
1251                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1252
1253                 /*
1254                  * Do the xfs_itruncate_start() call before
1255                  * reserving any log space because
1256                  * itruncate_start will call into the buffer
1257                  * cache and we can't
1258                  * do that within a transaction.
1259                  */
1260                 if (use_iolock)
1261                         xfs_ilock(ip, XFS_IOLOCK_EXCL);
1262                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1263                                     ip->i_size);
1264                 if (error) {
1265                         xfs_trans_cancel(tp, 0);
1266                         if (use_iolock)
1267                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1268                         return error;
1269                 }
1270
1271                 error = xfs_trans_reserve(tp, 0,
1272                                           XFS_ITRUNCATE_LOG_RES(mp),
1273                                           0, XFS_TRANS_PERM_LOG_RES,
1274                                           XFS_ITRUNCATE_LOG_COUNT);
1275                 if (error) {
1276                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1277                         xfs_trans_cancel(tp, 0);
1278                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1279                         return error;
1280                 }
1281
1282                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1283                 xfs_trans_ijoin(tp, ip,
1284                                 XFS_IOLOCK_EXCL |
1285                                 XFS_ILOCK_EXCL);
1286                 xfs_trans_ihold(tp, ip);
1287
1288                 error = xfs_itruncate_finish(&tp, ip,
1289                                              ip->i_size,
1290                                              XFS_DATA_FORK,
1291                                              0);
1292                 /*
1293                  * If we get an error at this point we
1294                  * simply don't bother truncating the file.
1295                  */
1296                 if (error) {
1297                         xfs_trans_cancel(tp,
1298                                          (XFS_TRANS_RELEASE_LOG_RES |
1299                                           XFS_TRANS_ABORT));
1300                 } else {
1301                         error = xfs_trans_commit(tp,
1302                                                 XFS_TRANS_RELEASE_LOG_RES);
1303                 }
1304                 xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
1305                                             : XFS_ILOCK_EXCL));
1306         }
1307         return error;
1308 }
1309
1310 /*
1311  * Free a symlink that has blocks associated with it.
1312  */
1313 STATIC int
1314 xfs_inactive_symlink_rmt(
1315         xfs_inode_t     *ip,
1316         xfs_trans_t     **tpp)
1317 {
1318         xfs_buf_t       *bp;
1319         int             committed;
1320         int             done;
1321         int             error;
1322         xfs_fsblock_t   first_block;
1323         xfs_bmap_free_t free_list;
1324         int             i;
1325         xfs_mount_t     *mp;
1326         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
1327         int             nmaps;
1328         xfs_trans_t     *ntp;
1329         int             size;
1330         xfs_trans_t     *tp;
1331
1332         tp = *tpp;
1333         mp = ip->i_mount;
1334         ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
1335         /*
1336          * We're freeing a symlink that has some
1337          * blocks allocated to it.  Free the
1338          * blocks here.  We know that we've got
1339          * either 1 or 2 extents and that we can
1340          * free them all in one bunmapi call.
1341          */
1342         ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
1343         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1344                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1345                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1346                 xfs_trans_cancel(tp, 0);
1347                 *tpp = NULL;
1348                 return error;
1349         }
1350         /*
1351          * Lock the inode, fix the size, and join it to the transaction.
1352          * Hold it so in the normal path, we still have it locked for
1353          * the second transaction.  In the error paths we need it
1354          * held so the cancel won't rele it, see below.
1355          */
1356         xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1357         size = (int)ip->i_d.di_size;
1358         ip->i_d.di_size = 0;
1359         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1360         xfs_trans_ihold(tp, ip);
1361         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1362         /*
1363          * Find the block(s) so we can inval and unmap them.
1364          */
1365         done = 0;
1366         XFS_BMAP_INIT(&free_list, &first_block);
1367         nmaps = ARRAY_SIZE(mval);
1368         if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
1369                         XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
1370                         &free_list, NULL)))
1371                 goto error0;
1372         /*
1373          * Invalidate the block(s).
1374          */
1375         for (i = 0; i < nmaps; i++) {
1376                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
1377                         XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
1378                         XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
1379                 xfs_trans_binval(tp, bp);
1380         }
1381         /*
1382          * Unmap the dead block(s) to the free_list.
1383          */
1384         if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
1385                         &first_block, &free_list, NULL, &done)))
1386                 goto error1;
1387         ASSERT(done);
1388         /*
1389          * Commit the first transaction.  This logs the EFI and the inode.
1390          */
1391         if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
1392                 goto error1;
1393         /*
1394          * The transaction must have been committed, since there were
1395          * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
1396          * The new tp has the extent freeing and EFDs.
1397          */
1398         ASSERT(committed);
1399         /*
1400          * The first xact was committed, so add the inode to the new one.
1401          * Mark it dirty so it will be logged and moved forward in the log as
1402          * part of every commit.
1403          */
1404         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1405         xfs_trans_ihold(tp, ip);
1406         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1407         /*
1408          * Get a new, empty transaction to return to our caller.
1409          */
1410         ntp = xfs_trans_dup(tp);
1411         /*
1412          * Commit the transaction containing extent freeing and EFDs.
1413          * If we get an error on the commit here or on the reserve below,
1414          * we need to unlock the inode since the new transaction doesn't
1415          * have the inode attached.
1416          */
1417         error = xfs_trans_commit(tp, 0);
1418         tp = ntp;
1419         if (error) {
1420                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1421                 goto error0;
1422         }
1423         /*
1424          * Remove the memory for extent descriptions (just bookkeeping).
1425          */
1426         if (ip->i_df.if_bytes)
1427                 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
1428         ASSERT(ip->i_df.if_bytes == 0);
1429         /*
1430          * Put an itruncate log reservation in the new transaction
1431          * for our caller.
1432          */
1433         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1434                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1435                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1436                 goto error0;
1437         }
1438         /*
1439          * Return with the inode locked but not joined to the transaction.
1440          */
1441         *tpp = tp;
1442         return 0;
1443
1444  error1:
1445         xfs_bmap_cancel(&free_list);
1446  error0:
1447         /*
1448          * Have to come here with the inode locked and either
1449          * (held and in the transaction) or (not in the transaction).
1450          * If the inode isn't held then cancel would iput it, but
1451          * that's wrong since this is inactive and the vnode ref
1452          * count is 0 already.
1453          * Cancel won't do anything to the inode if held, but it still
1454          * needs to be locked until the cancel is done, if it was
1455          * joined to the transaction.
1456          */
1457         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1458         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1459         *tpp = NULL;
1460         return error;
1461
1462 }
1463
     /*
      * xfs_inactive_symlink_local
      *
      * Inactivation helper for a symlink whose target is small enough to
      * live in the inode itself (di_size <= XFS_IFORK_DSIZE(ip), asserted).
      * Reserves itruncate log space on the caller's transaction, takes the
      * inode's ILOCK and IOLOCK exclusively and shrinks the in-core data
      * fork to zero bytes.
      *
      * Returns 0 with both inode locks held and *tpp unchanged; the inode
      * is not joined to the transaction here.  If the reservation fails the
      * transaction is cancelled, *tpp is set to NULL, no inode lock has
      * been taken, and the error from xfs_trans_reserve() is returned.
      */
1464 STATIC int
1465 xfs_inactive_symlink_local(
1466         xfs_inode_t     *ip,
1467         xfs_trans_t     **tpp)
1468 {
1469         int             error;
1470
1471         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1472         /*
1473          * We're freeing a symlink which fit into
1474          * the inode.  Just free the memory used
1475          * to hold the old symlink.
1476          */
1477         error = xfs_trans_reserve(*tpp, 0,
1478                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1479                                   0, XFS_TRANS_PERM_LOG_RES,
1480                                   XFS_ITRUNCATE_LOG_COUNT);
1481
1482         if (error) {
1483                 xfs_trans_cancel(*tpp, 0);
1484                 *tpp = NULL;
1485                 return error;
1486         }
             /*
              * The locks are only taken once the reservation has succeeded,
              * so the error path above has nothing to unlock.
              */
1487         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1488
1489         /*
1490          * Zero length symlinks _can_ exist.
1491          */
1492         if (ip->i_df.if_bytes > 0) {
1493                 xfs_idata_realloc(ip,
1494                                   -(ip->i_df.if_bytes),
1495                                   XFS_DATA_FORK);
1496                 ASSERT(ip->i_df.if_bytes == 0);
1497         }
1498         return 0;
1499 }
1500
     /*
      * xfs_inactive_attrs
      *
      * Tear down the attribute fork of an inode that is being inactivated.
      * The caller's transaction (*tpp) is committed and the ILOCK dropped
      * before xfs_attr_inactive() is called; afterwards a new
      * XFS_TRANS_INACTIVE transaction is allocated and reserved, the ILOCK
      * is retaken, the inode is joined to and held in the new transaction,
      * and the in-core attribute fork is destroyed.
      *
      * Entered with the IOLOCK held for update (asserted); the ILOCK is
      * released below, so the caller is expected to hold it as well.
      *
      * Returns 0 with *tpp pointing at the new transaction.  On any
      * failure -- including a failed commit of the caller's transaction --
      * *tpp is set to NULL, both inode locks have been released, no
      * transaction is left for the caller to cancel, and the error is
      * returned.
      */
1501 STATIC int
1502 xfs_inactive_attrs(
1503         xfs_inode_t     *ip,
1504         xfs_trans_t     **tpp)
1505 {
1506         xfs_trans_t     *tp;
1507         int             error;
1508         xfs_mount_t     *mp;
1509
1510         ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
1511         tp = *tpp;
1512         mp = ip->i_mount;
1513         ASSERT(ip->i_d.di_forkoff != 0);
1514         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1515         xfs_iunlock(ip, XFS_ILOCK_EXCL);
             if (error) {
                     /*
                      * The commit failed.  The transaction is gone either
                      * way, so do not go on to modify the attribute fork;
                      * unwind exactly like the other failure paths.
                      */
                     *tpp = NULL;
                     xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                     return error;
             }
1516
1517         error = xfs_attr_inactive(ip);
1518         if (error) {
1519                 *tpp = NULL;
1520                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1521                 return error; /* goto out */
1522         }
1523
1524         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1525         error = xfs_trans_reserve(tp, 0,
1526                                   XFS_IFREE_LOG_RES(mp),
1527                                   0, XFS_TRANS_PERM_LOG_RES,
1528                                   XFS_INACTIVE_LOG_COUNT);
1529         if (error) {
1530                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1531                 xfs_trans_cancel(tp, 0);
1532                 *tpp = NULL;
1533                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1534                 return error;
1535         }
1536
1537         xfs_ilock(ip, XFS_ILOCK_EXCL);
1538         xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1539         xfs_trans_ihold(tp, ip);
1540         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1541
1542         ASSERT(ip->i_d.di_anextents == 0);
1543
1544         *tpp = tp;
1545         return 0;
1546 }
1547
     /*
      * xfs_release
      *
      * Release-time cleanup for a regular file.  Returns 0 without doing
      * anything for non-regular vnodes, inodes whose di_mode is 0, and
      * read-only mounts.  Otherwise, unless the filesystem has been shut
      * down, an unlinked filestream inode is dropped from the filestreams
      * cache and a previously truncated, dirty file with delayed
      * allocation blocks gets an asynchronous flush started.  Finally, for
      * a still-linked file, blocks beyond EOF are trimmed with
      * xfs_free_eofblocks() and the Linux inode's i_blocks is refreshed.
      *
      * Returns 0, or the error from xfs_free_eofblocks().
      */
1548 STATIC int
1549 xfs_release(
1550         bhv_desc_t      *bdp)
1551 {
1552         xfs_inode_t     *ip;
1553         bhv_vnode_t     *vp;
1554         xfs_mount_t     *mp;
1555         int             error;
1556
1557         vp = BHV_TO_VNODE(bdp);
1558         ip = XFS_BHVTOI(bdp);
1559         mp = ip->i_mount;
1560
1561         if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0))
1562                 return 0;
1563
1564         /* If this is a read-only mount, don't do this (would generate I/O) */
1565         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1566                 return 0;
1567
1568         if (!XFS_FORCED_SHUTDOWN(mp)) {
1569                 /*
1570                  * If we are using filestreams, and we have an unlinked
1571                  * file that we are processing the last close on, then nothing
1572                  * will be able to reopen and write to this file. Purge this
1573                  * inode from the filestreams cache so that it doesn't delay
1574                  * teardown of the inode.
1575                  */
1576                 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
1577                         xfs_filestream_deassociate(ip);
1578
1579                 /*
1580                  * If we previously truncated this file and removed old data
1581                  * in the process, we want to initiate "early" writeout on
1582                  * the last close.  This is an attempt to combat the notorious
1583                  * NULL files problem which is particularly noticeable from a
1584                  * truncate down, buffered (re-)write (delalloc), followed by
1585                  * a crash.  What we are effectively doing here is
1586                  * significantly reducing the time window where we'd otherwise
1587                  * be exposed to that problem.
1588                  */
1589                 if (VUNTRUNCATE(vp) && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
1590                         bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE);
1591         }
1592
1593 #ifdef HAVE_REFCACHE
1594         /* If we are in the NFS reference cache then don't do this now */
1595         if (ip->i_refcache)
1596                 return 0;
1597 #endif
1598
             /*
              * Trim blocks past EOF only for a linked regular file that has
              * a non-zero size, cached pages or delayed allocation blocks,
              * has XFS_IFEXTENTS set, and is neither preallocated nor
              * append-only.
              */
1599         if (ip->i_d.di_nlink != 0) {
1600                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1601                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1602                        ip->i_delayed_blks > 0)) &&
1603                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1604                     (!(ip->i_d.di_flags &
1605                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1606                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1607                         if (error)
1608                                 return error;
1609                         /* Update linux inode block count after free above */
1610                         vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1611                                 ip->i_d.di_nblocks + ip->i_delayed_blks);
1612                 }
1613         }
1614
1615         return 0;
1616 }
1617
1618 /*
1619  * xfs_inactive
1620  *
1621  * This is called when the vnode reference count for the vnode
1622  * goes to zero.  If the file has been unlinked, then it must
1623  * now be truncated.  Also, we clear all of the read-ahead state
1624  * kept for the inode here since the file is now closed.
      *
      * For an unlinked inode (di_nlink == 0) the file data is truncated or
      * the symlink storage released, any attribute fork is torn down, and
      * the inode is freed with xfs_ifree().  For a still-linked regular
      * file only the blocks past EOF are trimmed via xfs_free_eofblocks().
      * Nothing is modified on a read-only mount.
      *
      * Every exit returns VN_INACTIVE_CACHE; failures are not reported to
      * the caller.  credp is not used.
1625  */
1626 STATIC int
1627 xfs_inactive(
1628         bhv_desc_t      *bdp,
1629         cred_t          *credp)
1630 {
1631         xfs_inode_t     *ip;
1632         bhv_vnode_t     *vp;
1633         xfs_bmap_free_t free_list;
1634         xfs_fsblock_t   first_block;
1635         int             committed;
1636         xfs_trans_t     *tp;
1637         xfs_mount_t     *mp;
1638         int             error;
1639         int             truncate;
1640
1641         vp = BHV_TO_VNODE(bdp);
1642         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
1643
1644         ip = XFS_BHVTOI(bdp);
1645
1646         /*
1647          * If the inode is already free, then there can be nothing
1648          * to clean up here.
1649          */
1650         if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
1651                 ASSERT(ip->i_df.if_real_bytes == 0);
1652                 ASSERT(ip->i_df.if_broot_bytes == 0);
1653                 return VN_INACTIVE_CACHE;
1654         }
1655
1656         /*
1657          * Only do a truncate if it's a regular file with
1658          * some actual space in it.  It's OK to look at the
1659          * inode's fields without the lock because we're the
1660          * only one with a reference to the inode.
1661          */
1662         truncate = ((ip->i_d.di_nlink == 0) &&
1663             ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
1664              (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
1665             ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
1666
1667         mp = ip->i_mount;
1668
             /* The result of the DMAPI destroy event is deliberately ignored. */
1669         if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY)) {
1670                 (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
1671         }
1672
1673         error = 0;
1674
1675         /* If this is a read-only mount, don't do this (would generate I/O) */
1676         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1677                 goto out;
1678
             /*
              * Still linked: at most trim the blocks past EOF.  Unlike
              * xfs_release(), a file with delayed allocation blocks is
              * trimmed here even if it is preallocated or append-only.
              */
1679         if (ip->i_d.di_nlink != 0) {
1680                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1681                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1682                        ip->i_delayed_blks > 0)) &&
1683                       (ip->i_df.if_flags & XFS_IFEXTENTS) &&
1684                      (!(ip->i_d.di_flags &
1685                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
1686                       (ip->i_delayed_blks != 0)))) {
1687                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1688                         if (error)
1689                                 return VN_INACTIVE_CACHE;
1690                         /* Update linux inode block count after free above */
1691                         vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1692                                 ip->i_d.di_nblocks + ip->i_delayed_blks);
1693                 }
1694                 goto out;
1695         }
1696
1697         ASSERT(ip->i_d.di_nlink == 0);
1698
1699         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1700                 return VN_INACTIVE_CACHE;
1701
             /*
              * Each of the three branches below finishes with a reserved
              * transaction in tp and the inode locked (ILOCK and IOLOCK
              * exclusive), joined to that transaction and held.
              */
1702         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1703         if (truncate) {
1704                 /*
1705                  * Do the xfs_itruncate_start() call before
1706                  * reserving any log space because itruncate_start
1707                  * will call into the buffer cache and we can't
1708                  * do that within a transaction.
1709                  */
1710                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1711
1712                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
1713                 if (error) {
1714                         xfs_trans_cancel(tp, 0);
1715                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1716                         return VN_INACTIVE_CACHE;
1717                 }
1718
1719                 error = xfs_trans_reserve(tp, 0,
1720                                           XFS_ITRUNCATE_LOG_RES(mp),
1721                                           0, XFS_TRANS_PERM_LOG_RES,
1722                                           XFS_ITRUNCATE_LOG_COUNT);
1723                 if (error) {
1724                         /* Don't call itruncate_cleanup */
1725                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1726                         xfs_trans_cancel(tp, 0);
1727                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1728                         return VN_INACTIVE_CACHE;
1729                 }
1730
1731                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1732                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1733                 xfs_trans_ihold(tp, ip);
1734
1735                 /*
1736                  * normally, we have to run xfs_itruncate_finish sync.
1737                  * But if filesystem is wsync and we're in the inactive
1738                  * path, then we know that nlink == 0, and that the
1739                  * xaction that made nlink == 0 is permanently committed
1740                  * since xfs_remove runs as a synchronous transaction.
1741                  */
1742                 error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
1743                                 (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
1744
1745                 if (error) {
1746                         xfs_trans_cancel(tp,
1747                                 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1748                         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1749                         return VN_INACTIVE_CACHE;
1750                 }
1751         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
1752
1753                 /*
1754                  * If we get an error while cleaning up a
1755                  * symlink we bail out.
1756                  */
1757                 error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
1758                         xfs_inactive_symlink_rmt(ip, &tp) :
1759                         xfs_inactive_symlink_local(ip, &tp);
1760
1761                 if (error) {
1762                         ASSERT(tp == NULL);
1763                         return VN_INACTIVE_CACHE;
1764                 }
1765
                     /*
                      * The helper returned with the inode locked but not
                      * joined to the transaction, so join it here.
                      */
1766                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1767                 xfs_trans_ihold(tp, ip);
1768         } else {
1769                 error = xfs_trans_reserve(tp, 0,
1770                                           XFS_IFREE_LOG_RES(mp),
1771                                           0, XFS_TRANS_PERM_LOG_RES,
1772                                           XFS_INACTIVE_LOG_COUNT);
1773                 if (error) {
1774                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1775                         xfs_trans_cancel(tp, 0);
1776                         return VN_INACTIVE_CACHE;
1777                 }
1778
1779                 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1780                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1781                 xfs_trans_ihold(tp, ip);
1782         }
1783
1784         /*
1785          * If there are attributes associated with the file
1786          * then blow them away now.  The code calls a routine
1787          * that recursively deconstructs the attribute fork.
1788          * We need to just commit the current transaction
1789          * because we can't use it for xfs_attr_inactive().
1790          */
1791         if (ip->i_d.di_anextents > 0) {
1792                 error = xfs_inactive_attrs(ip, &tp);
1793                 /*
1794                  * If we got an error, the transaction is already
1795                  * cancelled, and the inode is unlocked. Just get out.
1796                  */
1797                  if (error)
1798                          return VN_INACTIVE_CACHE;
1799         } else if (ip->i_afp) {
1800                 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1801         }
1802
1803         /*
1804          * Free the inode.
1805          */
1806         XFS_BMAP_INIT(&free_list, &first_block);
1807         error = xfs_ifree(tp, ip, &free_list);
1808         if (error) {
1809                 /*
1810                  * If we fail to free the inode, shut down.  The cancel
1811                  * might do that, we need to make sure.  Otherwise the
1812                  * inode might be lost for a long time or forever.
1813                  */
1814                 if (!XFS_FORCED_SHUTDOWN(mp)) {
1815                         cmn_err(CE_NOTE,
1816                 "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
1817                                 error, mp->m_fsname);
1818                         xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1819                 }
1820                 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1821         } else {
1822                 /*
1823                  * Credit the quota account(s). The inode is gone.
1824                  */
1825                 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1826
1827                 /*
1828                  * Just ignore errors at this point.  There is
1829                  * nothing we can do except to try to keep going.
1830                  */
1831                 (void) xfs_bmap_finish(&tp,  &free_list, &committed);
1832                 (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1833         }
1834         /*
1835          * Release the dquots held by inode, if any.
1836          */
1837         XFS_QM_DQDETACH(mp, ip);
1838
             /* The inode was held in the transaction, so unlock it here. */
1839         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1840
1841  out:
1842         return VN_INACTIVE_CACHE;
1843 }
1844
1845
1846 /*
1847  * xfs_lookup
      *
      * Look up the name described by dentry in the directory dir_bdp.  The
      * directory is locked with xfs_ilock_map_shared() for the duration of
      * the lookup.  On success *vpp is set to the vnode of the inode that
      * xfs_dir_lookup_int() returned; on failure *vpp is left untouched.
      *
      * Returns 0 on success, EIO if the filesystem has been shut down, or
      * the error from xfs_dir_lookup_int().  flags, rdir and credp are not
      * used.
1848  */
1849 STATIC int
1850 xfs_lookup(
1851         bhv_desc_t              *dir_bdp,
1852         bhv_vname_t             *dentry,
1853         bhv_vnode_t             **vpp,
1854         int                     flags,
1855         bhv_vnode_t             *rdir,
1856         cred_t                  *credp)
1857 {
1858         xfs_inode_t             *dp, *ip;
1859         xfs_ino_t               e_inum;
1860         int                     error;
1861         uint                    lock_mode;
1862         bhv_vnode_t             *dir_vp;
1863
1864         dir_vp = BHV_TO_VNODE(dir_bdp);
1865         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1866
1867         dp = XFS_BHVTOI(dir_bdp);
1868
1869         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1870                 return XFS_ERROR(EIO);
1871
             /* lock_mode records what was taken so the unlock below matches. */
1872         lock_mode = xfs_ilock_map_shared(dp);
1873         error = xfs_dir_lookup_int(dir_bdp, lock_mode, dentry, &e_inum, &ip);
1874         if (!error) {
1875                 *vpp = XFS_ITOV(ip);
1876                 ITRACE(ip);
1877         }
1878         xfs_iunlock_map_shared(dp, lock_mode);
1879         return error;
1880 }
1881
1882
1883 /*
1884  * xfs_create (create a new file).
      *
      * Allocate a new inode and enter it under the name in dentry in the
      * directory dir_bdp, all within one XFS_TRANS_CREATE transaction.
      *
      *   dir_bdp - behavior descriptor of the parent directory
      *   dentry  - name (and name length) of the new entry
      *   vap     - attributes for the new inode; va_mode, va_mask, va_rdev
      *             and va_projid are read here
      *   vpp     - out: vnode of the new inode; must be NULL on entry
      *   credp   - credentials used for quota lookup and inode allocation
      *
      * Returns 0 with *vpp set and a reference held on the new inode for
      * the caller, or an error with *vpp untouched.  If DMAPI events are
      * enabled on the directory, DM_EVENT_CREATE is sent before any work is
      * done and DM_EVENT_POSTCREATE is sent from std_return.
1885  */
1886 STATIC int
1887 xfs_create(
1888         bhv_desc_t              *dir_bdp,
1889         bhv_vname_t             *dentry,
1890         bhv_vattr_t             *vap,
1891         bhv_vnode_t             **vpp,
1892         cred_t                  *credp)
1893 {
1894         char                    *name = VNAME(dentry);
1895         bhv_vnode_t             *dir_vp;
1896         xfs_inode_t             *dp, *ip;
1897         bhv_vnode_t             *vp = NULL;
1898         xfs_trans_t             *tp;
1899         xfs_mount_t             *mp;
1900         xfs_dev_t               rdev;
1901         int                     error;
1902         xfs_bmap_free_t         free_list;
1903         xfs_fsblock_t           first_block;
1904         boolean_t               dp_joined_to_trans;
1905         int                     dm_event_sent = 0;
1906         uint                    cancel_flags;
1907         int                     committed;
1908         xfs_prid_t              prid;
1909         struct xfs_dquot        *udqp, *gdqp;
1910         uint                    resblks;
1911         int                     dm_di_mode;
1912         int                     namelen;
1913
1914         ASSERT(!*vpp);
1915         dir_vp = BHV_TO_VNODE(dir_bdp);
1916         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1917
1918         dp = XFS_BHVTOI(dir_bdp);
1919         mp = dp->i_mount;
1920
1921         dm_di_mode = vap->va_mode;
1922         namelen = VNAMELEN(dentry);
1923
1924         if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
1925                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1926                                 dir_vp, DM_RIGHT_NULL, NULL,
1927                                 DM_RIGHT_NULL, name, NULL,
1928                                 dm_di_mode, 0, 0);
1929
1930                 if (error)
1931                         return error;
1932                 dm_event_sent = 1;
1933         }
1934
1935         if (XFS_FORCED_SHUTDOWN(mp))
1936                 return XFS_ERROR(EIO);
1937
1938         /* Return through std_return after this point. */
1939
             /*
              * Project id for the new inode: inherited from the directory
              * if it has PROJINHERIT set, else taken from vap if supplied,
              * else the default.
              */
1940         udqp = gdqp = NULL;
1941         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1942                 prid = dp->i_d.di_projid;
1943         else if (vap->va_mask & XFS_AT_PROJID)
1944                 prid = (xfs_prid_t)vap->va_projid;
1945         else
1946                 prid = (xfs_prid_t)dfltprid;
1947
1948         /*
1949          * Make sure that we have allocated dquot(s) on disk.
1950          */
1951         error = XFS_QM_DQVOPALLOC(mp, dp,
1952                         current_fsuid(credp), current_fsgid(credp), prid,
1953                         XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
1954         if (error)
1955                 goto std_return;
1956
1957         ip = NULL;
1958         dp_joined_to_trans = B_FALSE;
1959
1960         tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1961         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1962         resblks = XFS_CREATE_SPACE_RES(mp, namelen);
1963         /*
1964          * Initially assume that the file does not exist and
1965          * reserve the resources for that case.  If that is not
1966          * the case we'll drop the one we have and get a more
1967          * appropriate transaction later.
1968          */
1969         error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
1970                         XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
             /*
              * No space for the block reservation: retry with none.  With
              * resblks == 0 the name is checked with xfs_dir_canenter()
              * below before the inode is allocated.
              */
1971         if (error == ENOSPC) {
1972                 resblks = 0;
1973                 error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
1974                                 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1975         }
1976         if (error) {
                     /*
                      * dp has not been locked yet; clearing it stops
                      * error_return from unlocking it.
                      */
1977                 cancel_flags = 0;
1978                 dp = NULL;
1979                 goto error_return;
1980         }
1981
1982         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1983
1984         XFS_BMAP_INIT(&free_list, &first_block);
1985
1986         ASSERT(ip == NULL);
1987
1988         /*
1989          * Reserve disk quota and the inode.
1990          */
1991         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
1992         if (error)
1993                 goto error_return;
1994
1995         if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen)))
1996                 goto error_return;
1997         rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
1998         error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 1,
1999                         rdev, credp, prid, resblks > 0,
2000                         &ip, &committed);
2001         if (error) {
                     /* ENOSPC is cancelled without XFS_TRANS_ABORT. */
2002                 if (error == ENOSPC)
2003                         goto error_return;
2004                 goto abort_return;
2005         }
2006         ITRACE(ip);
2007
2008         /*
2009          * At this point, we've gotten a newly allocated inode.
2010          * It is locked (and joined to the transaction).
2011          */
2012
2013         ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));
2014
2015         /*
2016          * Now we join the directory inode to the transaction.
2017          * We do not do it earlier because xfs_dir_ialloc
2018          * might commit the previous transaction (and release
2019          * all the locks).
2020          */
2021
2022         VN_HOLD(dir_vp);
2023         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2024         dp_joined_to_trans = B_TRUE;
2025
2026         error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino,
2027                                         &first_block, &free_list, resblks ?
2028                                         resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2029         if (error) {
2030                 ASSERT(error != ENOSPC);
2031                 goto abort_return;
2032         }
2033         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2034         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2035
2036         /*
2037          * If this is a synchronous mount, make sure that the
2038          * create transaction goes to disk before returning to
2039          * the user.
2040          */
2041         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2042                 xfs_trans_set_sync(tp);
2043         }
2044
2045         dp->i_gen++;
2046
2047         /*
2048          * Attach the dquot(s) to the inodes and modify them incore.
2049          * These ids of the inode couldn't have changed since the new
2050          * inode has been locked ever since it was created.
2051          */
2052         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
2053
2054         /*
2055          * xfs_trans_commit normally decrements the vnode ref count
2056          * when it unlocks the inode. Since we want to return the
2057          * vnode to the caller, we bump the vnode ref count now.
2058          */
2059         IHOLD(ip);
2060         vp = XFS_ITOV(ip);
2061
2062         error = xfs_bmap_finish(&tp, &free_list, &committed);
2063         if (error) {
2064                 xfs_bmap_cancel(&free_list);
2065                 goto abort_rele;
2066         }
2067
2068         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2069         if (error) {
                     /*
                      * Drop the reference taken by IHOLD() above.  tp is
                      * cleared so error_return does not cancel it.
                      */
2070                 IRELE(ip);
2071                 tp = NULL;
2072                 goto error_return;
2073         }
2074
2075         XFS_QM_DQRELE(mp, udqp);
2076         XFS_QM_DQRELE(mp, gdqp);
2077
2078         /*
2079          * Propagate the fact that the vnode changed after the
2080          * xfs_inode locks have been released.
2081          */
2082         bhv_vop_vnode_change(vp, VCHANGE_FLAGS_TRUNCATED, 3);
2083
2084         *vpp = vp;
2085
2086         /* Fallthrough to std_return with error = 0  */
2087
             /*
              * Common exit.  POSTCREATE is sent on success (*vpp set) and on
              * a failure that follows a delivered DM_EVENT_CREATE.  dp may
              * have been cleared above, so the directory inode is looked up
              * again from dir_bdp.
              */
2088 std_return:
2089         if ((*vpp || (error != 0 && dm_event_sent != 0)) &&
2090             DM_EVENT_ENABLED(XFS_BHVTOI(dir_bdp), DM_EVENT_POSTCREATE)) {
2091                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2092                         dir_vp, DM_RIGHT_NULL,
2093                         *vpp ? vp:NULL,
2094                         DM_RIGHT_NULL, name, NULL,
2095                         dm_di_mode, error, 0);
2096         }
2097         return error;
2098
             /* Same as error_return, but cancels with XFS_TRANS_ABORT. */
2099  abort_return:
2100         cancel_flags |= XFS_TRANS_ABORT;
2101         /* FALLTHROUGH */
2102
             /*
              * Cancel the transaction if there still is one, unlock dp if it
              * was locked here but never joined to the transaction, and
              * release the dquots.
              */
2103  error_return:
2104         if (tp != NULL)
2105                 xfs_trans_cancel(tp, cancel_flags);
2106
2107         if (!dp_joined_to_trans && (dp != NULL))
2108                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2109         XFS_QM_DQRELE(mp, udqp);
2110         XFS_QM_DQRELE(mp, gdqp);
2111
2112         goto std_return;
2113
2114  abort_rele:
2115         /*
2116          * Wait until after the current transaction is aborted to
2117          * release the inode.  This prevents recursive transactions
2118          * and deadlocks from xfs_inactive.
2119          */
2120         cancel_flags |= XFS_TRANS_ABORT;
2121         xfs_trans_cancel(tp, cancel_flags);
2122         IRELE(ip);
2123
2124         XFS_QM_DQRELE(mp, udqp);
2125         XFS_QM_DQRELE(mp, gdqp);
2126
2127         goto std_return;
2128 }
2129
2130 #ifdef DEBUG
2131 /*
2132  * Some counters to see if (and how often) we are hitting some deadlock
2133  * prevention code paths.
      * All three are updated by xfs_lock_dir_and_entry().
2134  */
2135
2136 int xfs_rm_locks;               /* calls to xfs_lock_dir_and_entry() */
2137 int xfs_rm_lock_delays;         /* delay(1) back-offs (every 5th retry) */
2138 int xfs_rm_attempts;            /* failed xfs_ilock_nowait() attempts */
2139 #endif
2140
2141 /*
2142  * The following routine will lock the inodes associated with the
2143  * directory and the named entry in the directory. The locks are
2144  * acquired in increasing inode number.
2145  *
2146  * If the entry is "..", then only the directory is locked. The
2147  * vnode ref count will still include that from the .. entry in
2148  * this case.
2149  *
2150  * There is a deadlock we need to worry about. If the locked directory is
2151  * in the AIL, it might be blocking up the log. The next inode we lock
2152  * could be already locked by another thread waiting for log space (e.g.
2153  * a permanent log reservation with a long running transaction (see
2154  * xfs_itruncate_finish)). To solve this, we must check if the directory
2155  * is in the ail and use lock_nowait. If we can't lock, we need to
2156  * drop the inode lock on the directory and try again. xfs_iunlock will
2157  * potentially push the tail if we were holding up the log.
      *
      * Only the ILOCK of each inode is taken, in exclusive mode.  Always
      * returns 0.
2158  */
2159 STATIC int
2160 xfs_lock_dir_and_entry(
2161         xfs_inode_t     *dp,
2162         xfs_inode_t     *ip)    /* inode of entry 'name' */
2163 {
2164         int             attempts;
2165         xfs_ino_t       e_inum;
2166         xfs_inode_t     *ips[2];
2167         xfs_log_item_t  *lp;
2168
2169 #ifdef DEBUG
2170         xfs_rm_locks++;
2171 #endif
2172         attempts = 0;
2173
2174 again:
2175         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2176
2177         e_inum = ip->i_ino;
2178
2179         ITRACE(ip);
2180
2181         /*
2182          * We want to lock in increasing inum. Since we've already
2183          * acquired the lock on the directory, we may need to release
2184          * it if the inum of the entry turns out to be less.
2185          */
2186         if (e_inum > dp->i_ino) {
2187                 /*
2188                  * We are already in the right order, so just
2189                  * lock on the inode of the entry.
2190                  * We need to use nowait if dp is in the AIL.
2191                  */
2192
2193                 lp = (xfs_log_item_t *)dp->i_itemp;
2194                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2195                         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2196                                 attempts++;
2197 #ifdef DEBUG
2198                                 xfs_rm_attempts++;
2199 #endif
2200
2201                                 /*
2202                                  * Unlock dp and try again.
2203                                  * xfs_iunlock will try to push the tail
2204                                  * if the inode is in the AIL.
2205                                  */
2206
2207                                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2208
2209                                 if ((attempts % 5) == 0) {
2210                                         delay(1); /* Don't just spin the CPU */
2211 #ifdef DEBUG
2212                                         xfs_rm_lock_delays++;
2213 #endif
2214                                 }
2215                                 goto again;
2216                         }
2217                 } else {
2218                         xfs_ilock(ip, XFS_ILOCK_EXCL);
2219                 }
2220         } else if (e_inum < dp->i_ino) {
                     /*
                      * Wrong order: drop the directory lock and let
                      * xfs_lock_inodes() take both, entry first.
                      */
2221                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2222
2223                 ips[0] = ip;
2224                 ips[1] = dp;
2225                 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2226         }
2227         /* else  e_inum == dp->i_ino */
2228         /*     This can happen if we're asked to lock /x/..
2229          *     the entry is "..", which is also the parent directory.
2230          */
2231
2232         return 0;
2233 }
2234
2235 #ifdef DEBUG
     /*
      * DEBUG-only counters.  NOTE(review): presumably lock/retry statistics
      * maintained by xfs_lock_inodes() below -- confirm against its body.
      */
2236 int xfs_locked_n;
2237 int xfs_small_retries;
2238 int xfs_middle_retries;
2239 int xfs_lots_retries;
2240 int xfs_lock_delays;
2241 #endif
2242
2243 /*
2244  * Give each lock taken by xfs_lock_inodes() its own subclass in lock_mode.
2245  */
2246 static inline int
2247 xfs_lock_inumorder(int lock_mode, int subclass)
2248 {
2249         const int       ordered = subclass + XFS_LOCK_INUMORDER;
2250
2251         if ((lock_mode & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != 0)
2252                 lock_mode |= ordered << XFS_IOLOCK_SHIFT;
2253         if ((lock_mode & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != 0)
2254                 lock_mode |= ordered << XFS_ILOCK_SHIFT;
2255         return lock_mode;
2256 }
2257
2258 /*
2259  * The following routine will lock n inodes in exclusive mode.
2260  * We assume the caller calls us with the inodes in i_ino order.
2261  *
2262  * We need to detect deadlock where an inode that we lock
2263  * is in the AIL and we start waiting for another inode that is locked
2264  * by a thread in a long running transaction (such as truncate). This can
2265  * result in deadlock since the long running trans might need to wait
2266  * for the inode we just locked in order to push the tail and free space
2267  * in the log.
2268  */
2269 void
2270 xfs_lock_inodes(
2271         xfs_inode_t     **ips,
2272         int             inodes,
2273         int             first_locked,
2274         uint            lock_mode)
2275 {
2276         int             attempts = 0, i, j, try_lock;
2277         xfs_log_item_t  *lp;
2278
2279         ASSERT(ips && (inodes >= 2)); /* we need at least two */
2280
2281         if (first_locked) {
2282                 try_lock = 1;
2283                 i = 1;
2284         } else {
2285                 try_lock = 0;
2286                 i = 0;
2287         }
2288
2289 again:
2290         for (; i < inodes; i++) {
2291                 ASSERT(ips[i]);
2292
2293                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
2294                         continue;
2295
2296                 /*
2297                  * If try_lock is not set yet, make sure all locked inodes
2298                  * are not in the AIL.
2299                  * If any are, set try_lock to be used later.
2300                  */
2301
2302                 if (!try_lock) {
2303                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
2304                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
2305                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2306                                         try_lock++;
2307                                 }
2308                         }
2309                 }
2310
2311                 /*
2312                  * If any of the previous locks we have locked is in the AIL,
2313                  * we must TRY to get the second and subsequent locks. If
2314                  * we can't get any, we must release all we have
2315                  * and try again.
2316                  */
2317
2318                 if (try_lock) {
2319                         /* try_lock must be 0 if i is 0. */
2320                         /*
2321                          * try_lock means we have an inode locked
2322                          * that is in the AIL.
2323                          */
2324                         ASSERT(i != 0);
2325                         if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
2326                                 attempts++;
2327
2328                                 /*
2329                                  * Unlock all previous guys and try again.
2330                                  * xfs_iunlock will try to push the tail
2331                                  * if the inode is in the AIL.
2332                                  */
2333
2334                                 for(j = i - 1; j >= 0; j--) {
2335
2336                                         /*
2337                                          * Check to see if we've already
2338                                          * unlocked this one.
2339                                          * Not the first one going back,
2340                                          * and the inode ptr is the same.
2341                                          */
2342                                         if ((j != (i - 1)) && ips[j] ==
2343                                                                 ips[j+1])
2344                                                 continue;
2345
2346                                         xfs_iunlock(ips[j], lock_mode);
2347                                 }
2348
2349                                 if ((attempts % 5) == 0) {
2350                                         delay(1); /* Don't just spin the CPU */
2351 #ifdef DEBUG
2352                                         xfs_lock_delays++;
2353 #endif
2354                                 }
2355                                 i = 0;
2356                                 try_lock = 0;
2357                                 goto again;
2358                         }
2359                 } else {
2360                         xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
2361                 }
2362         }
2363
2364 #ifdef DEBUG
2365         if (attempts) {
2366                 if (attempts < 5) xfs_small_retries++;
2367                 else if (attempts < 100) xfs_middle_retries++;
2368                 else xfs_lots_retries++;
2369         } else {
2370                 xfs_locked_n++;
2371         }
2372 #endif
2373 }
2374
/*
 * REMOVE_DEBUG_TRACE(x) records x (callers pass __LINE__) in
 * remove_which_error_return on DEBUG builds, so the failing exit of a
 * remove-style operation can be identified afterwards.  On non-DEBUG
 * builds it does nothing.
 *
 * Both variants expand to a single do { } while (0) statement so that
 * "REMOVE_DEBUG_TRACE(x);" is safe as the body of an unbraced if/else.
 */
#ifdef  DEBUG
int remove_which_error_return = 0;
#define REMOVE_DEBUG_TRACE(x)   do { remove_which_error_return = (x); } while (0)
#else /* ! DEBUG */
#define REMOVE_DEBUG_TRACE(x)   do { } while (0)
#endif  /* ! DEBUG */
2381
2382
2383 /*
2384  * xfs_remove
2385  *
2386  */
2387 STATIC int
2388 xfs_remove(
2389         bhv_desc_t              *dir_bdp,
2390         bhv_vname_t             *dentry,
2391         cred_t                  *credp)
2392 {
2393         bhv_vnode_t             *dir_vp;
2394         char                    *name = VNAME(dentry);
2395         xfs_inode_t             *dp, *ip;
2396         xfs_trans_t             *tp = NULL;
2397         xfs_mount_t             *mp;
2398         int                     error = 0;
2399         xfs_bmap_free_t         free_list;
2400         xfs_fsblock_t           first_block;
2401         int                     cancel_flags;
2402         int                     committed;
2403         int                     dm_di_mode = 0;
2404         int                     link_zero;
2405         uint                    resblks;
2406         int                     namelen;
2407
2408         dir_vp = BHV_TO_VNODE(dir_bdp);
2409         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2410
2411         dp = XFS_BHVTOI(dir_bdp);
2412         mp = dp->i_mount;
2413
2414         if (XFS_FORCED_SHUTDOWN(mp))
2415                 return XFS_ERROR(EIO);
2416
2417         namelen = VNAMELEN(dentry);
2418
2419         if (!xfs_get_dir_entry(dentry, &ip)) {
2420                 dm_di_mode = ip->i_d.di_mode;
2421                 IRELE(ip);
2422         }
2423
2424         if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
2425                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
2426                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2427                                         name, NULL, dm_di_mode, 0, 0);
2428                 if (error)
2429                         return error;
2430         }
2431
2432         /* From this point on, return through std_return */
2433         ip = NULL;
2434
2435         /*
2436          * We need to get a reference to ip before we get our log
2437          * reservation. The reason for this is that we cannot call
2438          * xfs_iget for an inode for which we do not have a reference
2439          * once we've acquired a log reservation. This is because the
2440          * inode we are trying to get might be in xfs_inactive going
2441          * for a log reservation. Since we'll have to wait for the
2442          * inactive code to complete before returning from xfs_iget,
2443          * we need to make sure that we don't have log space reserved
2444          * when we call xfs_iget.  Instead we get an unlocked reference
2445          * to the inode before getting our log reservation.
2446          */
2447         error = xfs_get_dir_entry(dentry, &ip);
2448         if (error) {
2449                 REMOVE_DEBUG_TRACE(__LINE__);
2450                 goto std_return;
2451         }
2452
2453         dm_di_mode = ip->i_d.di_mode;
2454
2455         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2456
2457         ITRACE(ip);
2458
2459         error = XFS_QM_DQATTACH(mp, dp, 0);
2460         if (!error && dp != ip)
2461                 error = XFS_QM_DQATTACH(mp, ip, 0);
2462         if (error) {
2463                 REMOVE_DEBUG_TRACE(__LINE__);
2464                 IRELE(ip);
2465                 goto std_return;
2466         }
2467
2468         tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2469         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2470         /*
2471          * We try to get the real space reservation first,
2472          * allowing for directory btree deletion(s) implying
2473          * possible bmap insert(s).  If we can't get the space
2474          * reservation then we use 0 instead, and avoid the bmap
2475          * btree insert(s) in the directory code by, if the bmap
2476          * insert tries to happen, instead trimming the LAST
2477          * block from the directory.
2478          */
2479         resblks = XFS_REMOVE_SPACE_RES(mp);
2480         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2481                         XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2482         if (error == ENOSPC) {
2483                 resblks = 0;
2484                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2485                                 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2486         }
2487         if (error) {
2488                 ASSERT(error != ENOSPC);
2489                 REMOVE_DEBUG_TRACE(__LINE__);
2490                 xfs_trans_cancel(tp, 0);
2491                 IRELE(ip);
2492                 return error;
2493         }
2494
2495         error = xfs_lock_dir_and_entry(dp, ip);
2496         if (error) {
2497                 REMOVE_DEBUG_TRACE(__LINE__);
2498                 xfs_trans_cancel(tp, cancel_flags);
2499                 IRELE(ip);
2500                 goto std_return;
2501         }
2502
2503         /*
2504          * At this point, we've gotten both the directory and the entry
2505          * inodes locked.
2506          */
2507         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2508         if (dp != ip) {
2509                 /*
2510                  * Increment vnode ref count only in this case since
2511                  * there's an extra vnode reference in the case where
2512                  * dp == ip.
2513                  */
2514                 IHOLD(dp);
2515                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2516         }
2517
2518         /*
2519          * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2520          */
2521         XFS_BMAP_INIT(&free_list, &first_block);
2522         error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino,
2523                                         &first_block, &free_list, 0);
2524         if (error) {
2525                 ASSERT(error != ENOENT);
2526                 REMOVE_DEBUG_TRACE(__LINE__);
2527                 goto error1;
2528         }
2529         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2530
2531         dp->i_gen++;
2532         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2533
2534         error = xfs_droplink(tp, ip);
2535         if (error) {
2536                 REMOVE_DEBUG_TRACE(__LINE__);
2537                 goto error1;
2538         }
2539
2540         /* Determine if this is the last link while
2541          * we are in the transaction.
2542          */
2543         link_zero = (ip)->i_d.di_nlink==0;
2544
2545         /*
2546          * Take an extra ref on the inode so that it doesn't
2547          * go to xfs_inactive() from within the commit.
2548          */
2549         IHOLD(ip);
2550
2551         /*
2552          * If this is a synchronous mount, make sure that the
2553          * remove transaction goes to disk before returning to
2554          * the user.
2555          */
2556         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2557                 xfs_trans_set_sync(tp);
2558         }
2559
2560         error = xfs_bmap_finish(&tp, &free_list, &committed);
2561         if (error) {
2562                 REMOVE_DEBUG_TRACE(__LINE__);
2563                 goto error_rele;
2564         }
2565
2566         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2567         if (error) {
2568                 IRELE(ip);
2569                 goto std_return;
2570         }
2571
2572         /*
2573          * Before we drop our extra reference to the inode, purge it
2574          * from the refcache if it is there.  By waiting until afterwards
2575          * to do the IRELE, we ensure that we won't go inactive in the
2576          * xfs_refcache_purge_ip routine (although that would be OK).
2577          */
2578         xfs_refcache_purge_ip(ip);
2579
2580         /*
2581          * If we are using filestreams, kill the stream association.
2582          * If the file is still open it may get a new one but that
2583          * will get killed on last close in xfs_close() so we don't
2584          * have to worry about that.
2585          */
2586         if (link_zero && xfs_inode_is_filestream(ip))
2587                 xfs_filestream_deassociate(ip);
2588
2589         vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2590
2591         /*
2592          * Let interposed file systems know about removed links.
2593          */
2594         bhv_vop_link_removed(XFS_ITOV(ip), dir_vp, link_zero);
2595
2596         IRELE(ip);
2597
2598 /*      Fall through to std_return with error = 0 */
2599  std_return:
2600         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
2601                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2602                                 dir_vp, DM_RIGHT_NULL,
2603                                 NULL, DM_RIGHT_NULL,
2604                                 name, NULL, dm_di_mode, error, 0);
2605         }
2606         return error;
2607
2608  error1:
2609         xfs_bmap_cancel(&free_list);
2610         cancel_flags |= XFS_TRANS_ABORT;
2611         xfs_trans_cancel(tp, cancel_flags);
2612         goto std_return;
2613
2614  error_rele:
2615         /*
2616          * In this case make sure to not release the inode until after
2617          * the current transaction is aborted.  Releasing it beforehand
2618          * can cause us to go to xfs_inactive and start a recursive
2619          * transaction which can easily deadlock with the current one.
2620          */
2621         xfs_bmap_cancel(&free_list);
2622         cancel_flags |= XFS_TRANS_ABORT;
2623         xfs_trans_cancel(tp, cancel_flags);
2624
2625         /*
2626          * Before we drop our extra reference to the inode, purge it
2627          * from the refcache if it is there.  By waiting until afterwards
2628          * to do the IRELE, we ensure that we won't go inactive in the
2629          * xfs_refcache_purge_ip routine (although that would be OK).
2630          */
2631         xfs_refcache_purge_ip(ip);
2632
2633         IRELE(ip);
2634
2635         goto std_return;
2636 }
2637
2638
2639 /*
2640  * xfs_link
2641  *
2642  */
2643 STATIC int
2644 xfs_link(
2645         bhv_desc_t              *target_dir_bdp,
2646         bhv_vnode_t             *src_vp,
2647         bhv_vname_t             *dentry,
2648         cred_t                  *credp)
2649 {
2650         xfs_inode_t             *tdp, *sip;
2651         xfs_trans_t             *tp;
2652         xfs_mount_t             *mp;
2653         xfs_inode_t             *ips[2];
2654         int                     error;
2655         xfs_bmap_free_t         free_list;
2656         xfs_fsblock_t           first_block;
2657         int                     cancel_flags;
2658         int                     committed;
2659         bhv_vnode_t             *target_dir_vp;
2660         int                     resblks;
2661         char                    *target_name = VNAME(dentry);
2662         int                     target_namelen;
2663
2664         target_dir_vp = BHV_TO_VNODE(target_dir_bdp);
2665         vn_trace_entry(target_dir_vp, __FUNCTION__, (inst_t *)__return_address);
2666         vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);
2667
2668         target_namelen = VNAMELEN(dentry);
2669         ASSERT(!VN_ISDIR(src_vp));
2670
2671         sip = xfs_vtoi(src_vp);
2672         tdp = XFS_BHVTOI(target_dir_bdp);
2673         mp = tdp->i_mount;
2674         if (XFS_FORCED_SHUTDOWN(mp))
2675                 return XFS_ERROR(EIO);
2676
2677         if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
2678                 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2679                                         target_dir_vp, DM_RIGHT_NULL,
2680                                         src_vp, DM_RIGHT_NULL,
2681                                         target_name, NULL, 0, 0, 0);
2682                 if (error)
2683                         return error;
2684         }
2685
2686         /* Return through std_return after this point. */
2687
2688         error = XFS_QM_DQATTACH(mp, sip, 0);
2689         if (!error && sip != tdp)
2690                 error = XFS_QM_DQATTACH(mp, tdp, 0);
2691         if (error)
2692                 goto std_return;
2693
2694         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2695         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2696         resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
2697         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2698                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2699         if (error == ENOSPC) {
2700                 resblks = 0;
2701                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2702                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2703         }
2704         if (error) {
2705                 cancel_flags = 0;
2706                 goto error_return;
2707         }
2708
2709         if (sip->i_ino < tdp->i_ino) {
2710                 ips[0] = sip;
2711                 ips[1] = tdp;
2712         } else {
2713                 ips[0] = tdp;
2714                 ips[1] = sip;
2715         }
2716
2717         xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2718
2719         /*
2720          * Increment vnode ref counts since xfs_trans_commit &
2721          * xfs_trans_cancel will both unlock the inodes and
2722          * decrement the associated ref counts.
2723          */
2724         VN_HOLD(src_vp);
2725         VN_HOLD(target_dir_vp);
2726         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2727         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2728
2729         /*
2730          * If the source has too many links, we can't make any more to it.
2731          */
2732         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2733                 error = XFS_ERROR(EMLINK);
2734                 goto error_return;
2735         }
2736
2737         /*
2738          * If we are using project inheritance, we only allow hard link
2739          * creation in our tree when the project IDs are the same; else
2740          * the tree quota mechanism could be circumvented.
2741          */
2742         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2743                      (tdp->i_d.di_projid != sip->i_d.di_projid))) {
2744                 error = XFS_ERROR(EXDEV);
2745                 goto error_return;
2746         }
2747
2748         if (resblks == 0 &&
2749             (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen)))
2750                 goto error_return;
2751
2752         XFS_BMAP_INIT(&free_list, &first_block);
2753
2754         error = xfs_dir_createname(tp, tdp, target_name, target_namelen,
2755                                    sip->i_ino, &first_block, &free_list,
2756                                    resblks);
2757         if (error)
2758                 goto abort_return;
2759         xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2760         tdp->i_gen++;
2761         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2762
2763         error = xfs_bumplink(tp, sip);
2764         if (error)
2765                 goto abort_return;
2766
2767         /*
2768          * If this is a synchronous mount, make sure that the
2769          * link transaction goes to disk before returning to
2770          * the user.
2771          */
2772         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2773                 xfs_trans_set_sync(tp);
2774         }
2775
2776         error = xfs_bmap_finish (&tp, &free_list, &committed);
2777         if (error) {
2778                 xfs_bmap_cancel(&free_list);
2779                 goto abort_return;
2780         }
2781
2782         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2783         if (error)
2784                 goto std_return;
2785
2786         /* Fall through to std_return with error = 0. */
2787 std_return:
2788         if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
2789                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2790                                 target_dir_vp, DM_RIGHT_NULL,
2791                                 src_vp, DM_RIGHT_NULL,
2792                                 target_name, NULL, 0, error, 0);
2793         }
2794         return error;
2795
2796  abort_return:
2797         cancel_flags |= XFS_TRANS_ABORT;
2798         /* FALLTHROUGH */
2799
2800  error_return:
2801         xfs_trans_cancel(tp, cancel_flags);
2802         goto std_return;
2803 }
2804
2805
2806 /*
2807  * xfs_mkdir
2808  *
2809  */
2810 STATIC int
2811 xfs_mkdir(
2812         bhv_desc_t              *dir_bdp,
2813         bhv_vname_t             *dentry,
2814         bhv_vattr_t             *vap,
2815         bhv_vnode_t             **vpp,
2816         cred_t                  *credp)
2817 {
2818         char                    *dir_name = VNAME(dentry);
2819         xfs_inode_t             *dp;
2820         xfs_inode_t             *cdp;   /* inode of created dir */
2821         bhv_vnode_t             *cvp;   /* vnode of created dir */
2822         xfs_trans_t             *tp;
2823         xfs_mount_t             *mp;
2824         int                     cancel_flags;
2825         int                     error;
2826         int                     committed;
2827         xfs_bmap_free_t         free_list;
2828         xfs_fsblock_t           first_block;
2829         bhv_vnode_t             *dir_vp;
2830         boolean_t               dp_joined_to_trans;
2831         boolean_t               created = B_FALSE;
2832         int                     dm_event_sent = 0;
2833         xfs_prid_t              prid;
2834         struct xfs_dquot        *udqp, *gdqp;
2835         uint                    resblks;
2836         int                     dm_di_mode;
2837         int                     dir_namelen;
2838
2839         dir_vp = BHV_TO_VNODE(dir_bdp);
2840         dp = XFS_BHVTOI(dir_bdp);
2841         mp = dp->i_mount;
2842
2843         if (XFS_FORCED_SHUTDOWN(mp))
2844                 return XFS_ERROR(EIO);
2845
2846         dir_namelen = VNAMELEN(dentry);
2847
2848         tp = NULL;
2849         dp_joined_to_trans = B_FALSE;
2850         dm_di_mode = vap->va_mode;
2851
2852         if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
2853                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2854                                         dir_vp, DM_RIGHT_NULL, NULL,
2855                                         DM_RIGHT_NULL, dir_name, NULL,
2856                                         dm_di_mode, 0, 0);
2857                 if (error)
2858                         return error;
2859                 dm_event_sent = 1;
2860         }
2861
2862         /* Return through std_return after this point. */
2863
2864         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2865
2866         mp = dp->i_mount;
2867         udqp = gdqp = NULL;
2868         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2869                 prid = dp->i_d.di_projid;
2870         else if (vap->va_mask & XFS_AT_PROJID)
2871                 prid = (xfs_prid_t)vap->va_projid;
2872         else
2873                 prid = (xfs_prid_t)dfltprid;
2874
2875         /*
2876          * Make sure that we have allocated dquot(s) on disk.
2877          */
2878         error = XFS_QM_DQVOPALLOC(mp, dp,
2879                         current_fsuid(credp), current_fsgid(credp), prid,
2880                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2881         if (error)
2882                 goto std_return;
2883
2884         tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2885         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2886         resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
2887         error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2888                                   XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2889         if (error == ENOSPC) {
2890                 resblks = 0;
2891                 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2892                                           XFS_TRANS_PERM_LOG_RES,
2893                                           XFS_MKDIR_LOG_COUNT);
2894         }
2895         if (error) {
2896                 cancel_flags = 0;
2897                 dp = NULL;
2898                 goto error_return;
2899         }
2900
2901         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2902
2903         /*
2904          * Check for directory link count overflow.
2905          */
2906         if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2907                 error = XFS_ERROR(EMLINK);
2908                 goto error_return;
2909         }
2910
2911         /*
2912          * Reserve disk quota and the inode.
2913          */
2914         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2915         if (error)
2916                 goto error_return;
2917
2918         if (resblks == 0 &&
2919             (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen)))
2920                 goto error_return;
2921         /*
2922          * create the directory inode.
2923          */
2924         error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 2,
2925                         0, credp, prid, resblks > 0,
2926                 &cdp, NULL);
2927         if (error) {
2928                 if (error == ENOSPC)
2929                         goto error_return;
2930                 goto abort_return;
2931         }
2932         ITRACE(cdp);
2933
2934         /*
2935          * Now we add the directory inode to the transaction.
2936          * We waited until now since xfs_dir_ialloc might start
2937          * a new transaction.  Had we joined the transaction
2938          * earlier, the locks might have gotten released.
2939          */
2940         VN_HOLD(dir_vp);
2941         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2942         dp_joined_to_trans = B_TRUE;
2943
2944         XFS_BMAP_INIT(&free_list, &first_block);
2945
2946         error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino,
2947                                    &first_block, &free_list, resblks ?
2948                                    resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2949         if (error) {
2950                 ASSERT(error != ENOSPC);
2951                 goto error1;
2952         }
2953         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2954
2955         /*
2956          * Bump the in memory version number of the parent directory
2957          * so that other processes accessing it will recognize that
2958          * the directory has changed.
2959          */
2960         dp->i_gen++;
2961
2962         error = xfs_dir_init(tp, cdp, dp);
2963         if (error)
2964                 goto error2;
2965
2966         cdp->i_gen = 1;
2967         error = xfs_bumplink(tp, dp);
2968         if (error)
2969                 goto error2;
2970
2971         cvp = XFS_ITOV(cdp);
2972
2973         created = B_TRUE;
2974
2975         *vpp = cvp;
2976         IHOLD(cdp);
2977
2978         /*
2979          * Attach the dquots to the new inode and modify the icount incore.
2980          */
2981         XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2982
2983         /*
2984          * If this is a synchronous mount, make sure that the
2985          * mkdir transaction goes to disk before returning to
2986          * the user.
2987          */
2988         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2989                 xfs_trans_set_sync(tp);
2990         }
2991
2992         error = xfs_bmap_finish(&tp, &free_list, &committed);
2993         if (error) {
2994                 IRELE(cdp);
2995                 goto error2;
2996         }
2997
2998         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2999         XFS_QM_DQRELE(mp, udqp);
3000         XFS_QM_DQRELE(mp, gdqp);
3001         if (error) {
3002                 IRELE(cdp);
3003         }
3004
3005         /* Fall through to std_return with error = 0 or errno from
3006          * xfs_trans_commit. */
3007
3008 std_return:
3009         if ((created || (error != 0 && dm_event_sent != 0)) &&
3010             DM_EVENT_ENABLED(XFS_BHVTOI(dir_bdp), DM_EVENT_POSTCREATE)) {
3011                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
3012                                         dir_vp, DM_RIGHT_NULL,
3013                                         created ? XFS_ITOV(cdp):NULL,
3014                                         DM_RIGHT_NULL,
3015                                         dir_name, NULL,
3016                                         dm_di_mode, error, 0);
3017         }
3018         return error;
3019
3020  error2:
3021  error1:
3022         xfs_bmap_cancel(&free_list);
3023  abort_return:
3024         cancel_flags |= XFS_TRANS_ABORT;
3025  error_return:
3026         xfs_trans_cancel(tp, cancel_flags);
3027         XFS_QM_DQRELE(mp, udqp);
3028         XFS_QM_DQRELE(mp, gdqp);
3029
3030         if (!dp_joined_to_trans && (dp != NULL)) {
3031                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3032         }
3033
3034         goto std_return;
3035 }
3036
3037
3038 /*
3039  * xfs_rmdir
3040  *
3041  */
3042 STATIC int
3043 xfs_rmdir(
3044         bhv_desc_t              *dir_bdp,
3045         bhv_vname_t             *dentry,
3046         cred_t                  *credp)
3047 {
3048         char                    *name = VNAME(dentry);
3049         xfs_inode_t             *dp;
3050         xfs_inode_t             *cdp;   /* child directory */
3051         xfs_trans_t             *tp;
3052         xfs_mount_t             *mp;
3053         int                     error;
3054         xfs_bmap_free_t         free_list;
3055         xfs_fsblock_t           first_block;
3056         int                     cancel_flags;
3057         int                     committed;
3058         bhv_vnode_t             *dir_vp;
3059         int                     dm_di_mode = S_IFDIR;
3060         int                     last_cdp_link;
3061         int                     namelen;
3062         uint                    resblks;
3063
3064         dir_vp = BHV_TO_VNODE(dir_bdp);
3065         dp = XFS_BHVTOI(dir_bdp);
3066         mp = dp->i_mount;
3067
3068         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3069
3070         if (XFS_FORCED_SHUTDOWN(XFS_BHVTOI(dir_bdp)->i_mount))
3071                 return XFS_ERROR(EIO);
3072         namelen = VNAMELEN(dentry);
3073
3074         if (!xfs_get_dir_entry(dentry, &cdp)) {
3075                 dm_di_mode = cdp->i_d.di_mode;
3076                 IRELE(cdp);
3077         }
3078
3079         if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
3080                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
3081                                         dir_vp, DM_RIGHT_NULL,
3082                                         NULL, DM_RIGHT_NULL,
3083                                         name, NULL, dm_di_mode, 0, 0);
3084                 if (error)
3085                         return XFS_ERROR(error);
3086         }
3087
3088         /* Return through std_return after this point. */
3089
3090         cdp = NULL;
3091
3092         /*
3093          * We need to get a reference to cdp before we get our log
3094          * reservation.  The reason for this is that we cannot call
3095          * xfs_iget for an inode for which we do not have a reference
3096          * once we've acquired a log reservation.  This is because the
3097          * inode we are trying to get might be in xfs_inactive going
3098          * for a log reservation.  Since we'll have to wait for the
3099          * inactive code to complete before returning from xfs_iget,
3100          * we need to make sure that we don't have log space reserved
3101          * when we call xfs_iget.  Instead we get an unlocked reference
3102          * to the inode before getting our log reservation.
3103          */
3104         error = xfs_get_dir_entry(dentry, &cdp);
3105         if (error) {
3106                 REMOVE_DEBUG_TRACE(__LINE__);
3107                 goto std_return;
3108         }
3109         mp = dp->i_mount;
3110         dm_di_mode = cdp->i_d.di_mode;
3111
3112         /*
3113          * Get the dquots for the inodes.
3114          */
3115         error = XFS_QM_DQATTACH(mp, dp, 0);
3116         if (!error && dp != cdp)
3117                 error = XFS_QM_DQATTACH(mp, cdp, 0);
3118         if (error) {
3119                 IRELE(cdp);
3120                 REMOVE_DEBUG_TRACE(__LINE__);
3121                 goto std_return;
3122         }
3123
3124         tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
3125         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3126         /*
3127          * We try to get the real space reservation first,
3128          * allowing for directory btree deletion(s) implying
3129          * possible bmap insert(s).  If we can't get the space
3130          * reservation then we use 0 instead, and avoid the bmap
3131          * btree insert(s) in the directory code by, if the bmap
3132          * insert tries to happen, instead trimming the LAST
3133          * block from the directory.
3134          */
3135         resblks = XFS_REMOVE_SPACE_RES(mp);
3136         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
3137                         XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3138         if (error == ENOSPC) {
3139                 resblks = 0;
3140                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
3141                                 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3142         }
3143         if (error) {
3144                 ASSERT(error != ENOSPC);
3145                 cancel_flags = 0;
3146                 IRELE(cdp);
3147                 goto error_return;
3148         }
3149         XFS_BMAP_INIT(&free_list, &first_block);
3150
3151         /*
3152          * Now lock the child directory inode and the parent directory
3153          * inode in the proper order.  This will take care of validating
3154          * that the directory entry for the child directory inode has
3155          * not changed while we were obtaining a log reservation.
3156          */
3157         error = xfs_lock_dir_and_entry(dp, cdp);
3158         if (error) {
3159                 xfs_trans_cancel(tp, cancel_flags);
3160                 IRELE(cdp);
3161                 goto std_return;
3162         }
3163
3164         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3165         if (dp != cdp) {
3166                 /*
3167                  * Only increment the parent directory vnode count if
3168                  * we didn't bump it in looking up cdp.  The only time
3169                  * we don't bump it is when we're looking up ".".
3170                  */
3171                 VN_HOLD(dir_vp);
3172         }
3173
3174         ITRACE(cdp);
3175         xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
3176
3177         ASSERT(cdp->i_d.di_nlink >= 2);
3178         if (cdp->i_d.di_nlink != 2) {
3179                 error = XFS_ERROR(ENOTEMPTY);
3180                 goto error_return;
3181         }
3182         if (!xfs_dir_isempty(cdp)) {
3183                 error = XFS_ERROR(ENOTEMPTY);
3184                 goto error_return;
3185         }
3186
3187         error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino,
3188                                         &first_block, &free_list, resblks);
3189         if (error)
3190                 goto error1;
3191
3192         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3193
3194         /*
3195          * Bump the in memory generation count on the parent
3196          * directory so that other can know that it has changed.
3197          */
3198         dp->i_gen++;
3199
3200         /*
3201          * Drop the link from cdp's "..".
3202          */
3203         error = xfs_droplink(tp, dp);
3204         if (error) {
3205                 goto error1;
3206         }
3207
3208         /*
3209          * Drop the link from dp to cdp.
3210          */
3211         error = xfs_droplink(tp, cdp);
3212         if (error) {
3213                 goto error1;
3214         }
3215
3216         /*
3217          * Drop the "." link from cdp to self.
3218          */
3219         error = xfs_droplink(tp, cdp);
3220         if (error) {
3221                 goto error1;
3222         }
3223
3224         /* Determine these before committing transaction */
3225         last_cdp_link = (cdp)->i_d.di_nlink==0;
3226
3227         /*
3228          * Take an extra ref on the child vnode so that it
3229          * does not go to xfs_inactive() from within the commit.
3230          */
3231         IHOLD(cdp);
3232
3233         /*
3234          * If this is a synchronous mount, make sure that the
3235          * rmdir transaction goes to disk before returning to
3236          * the user.
3237          */
3238         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3239                 xfs_trans_set_sync(tp);
3240         }
3241
3242         error = xfs_bmap_finish (&tp, &free_list, &committed);
3243         if (error) {
3244                 xfs_bmap_cancel(&free_list);
3245                 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
3246                                  XFS_TRANS_ABORT));
3247                 IRELE(cdp);
3248                 goto std_return;
3249         }
3250
3251         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3252         if (error) {
3253                 IRELE(cdp);
3254                 goto std_return;
3255         }
3256
3257
3258         /*
3259          * Let interposed file systems know about removed links.
3260          */
3261         bhv_vop_link_removed(XFS_ITOV(cdp), dir_vp, last_cdp_link);
3262
3263         IRELE(cdp);
3264
3265         /* Fall through to std_return with error = 0 or the errno
3266          * from xfs_trans_commit. */
3267  std_return:
3268         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
3269                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3270                                         dir_vp, DM_RIGHT_NULL,
3271                                         NULL, DM_RIGHT_NULL,
3272                                         name, NULL, dm_di_mode,
3273                                         error, 0);
3274         }
3275         return error;
3276
3277  error1:
3278         xfs_bmap_cancel(&free_list);
3279         cancel_flags |= XFS_TRANS_ABORT;
3280         /* FALLTHROUGH */
3281
3282  error_return:
3283         xfs_trans_cancel(tp, cancel_flags);
3284         goto std_return;
3285 }
3286
3287 STATIC int
3288 xfs_symlink(
3289         bhv_desc_t              *dir_bdp,
3290         bhv_vname_t             *dentry,
3291         bhv_vattr_t             *vap,
3292         char                    *target_path,
3293         bhv_vnode_t             **vpp,
3294         cred_t                  *credp)
3295 {
3296         xfs_trans_t             *tp;
3297         xfs_mount_t             *mp;
3298         xfs_inode_t             *dp;
3299         xfs_inode_t             *ip;
3300         int                     error;
3301         int                     pathlen;
3302         xfs_bmap_free_t         free_list;
3303         xfs_fsblock_t           first_block;
3304         boolean_t               dp_joined_to_trans;
3305         bhv_vnode_t             *dir_vp;
3306         uint                    cancel_flags;
3307         int                     committed;
3308         xfs_fileoff_t           first_fsb;
3309         xfs_filblks_t           fs_blocks;
3310         int                     nmaps;
3311         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
3312         xfs_daddr_t             d;
3313         char                    *cur_chunk;
3314         int                     byte_cnt;
3315         int                     n;
3316         xfs_buf_t               *bp;
3317         xfs_prid_t              prid;
3318         struct xfs_dquot        *udqp, *gdqp;
3319         uint                    resblks;
3320         char                    *link_name = VNAME(dentry);
3321         int                     link_namelen;
3322
3323         *vpp = NULL;
3324         dir_vp = BHV_TO_VNODE(dir_bdp);
3325         dp = XFS_BHVTOI(dir_bdp);
3326         dp_joined_to_trans = B_FALSE;
3327         error = 0;
3328         ip = NULL;
3329         tp = NULL;
3330
3331         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3332
3333         mp = dp->i_mount;
3334
3335         if (XFS_FORCED_SHUTDOWN(mp))
3336                 return XFS_ERROR(EIO);
3337
3338         link_namelen = VNAMELEN(dentry);
3339
3340         /*
3341          * Check component lengths of the target path name.
3342          */
3343         pathlen = strlen(target_path);
3344         if (pathlen >= MAXPATHLEN)      /* total string too long */
3345                 return XFS_ERROR(ENAMETOOLONG);
3346         if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
3347                 int len, total;
3348                 char *path;
3349
3350                 for (total = 0, path = target_path; total < pathlen;) {
3351                         /*
3352                          * Skip any slashes.
3353                          */
3354                         while(*path == '/') {
3355                                 total++;
3356                                 path++;
3357                         }
3358
3359                         /*
3360                          * Count up to the next slash or end of path.
3361                          * Error out if the component is bigger than MAXNAMELEN.
3362                          */
3363                         for(len = 0; *path != '/' && total < pathlen;total++, path++) {
3364                                 if (++len >= MAXNAMELEN) {
3365                                         error = ENAMETOOLONG;
3366                                         return error;
3367                                 }
3368                         }
3369                 }
3370         }
3371
3372         if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
3373                 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
3374                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3375                                         link_name, target_path, 0, 0, 0);
3376                 if (error)
3377                         return error;
3378         }
3379
3380         /* Return through std_return after this point. */
3381
3382         udqp = gdqp = NULL;
3383         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
3384                 prid = dp->i_d.di_projid;
3385         else if (vap->va_mask & XFS_AT_PROJID)
3386                 prid = (xfs_prid_t)vap->va_projid;
3387         else
3388                 prid = (xfs_prid_t)dfltprid;
3389
3390         /*
3391          * Make sure that we have allocated dquot(s) on disk.
3392          */
3393         error = XFS_QM_DQVOPALLOC(mp, dp,
3394                         current_fsuid(credp), current_fsgid(credp), prid,
3395                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
3396         if (error)
3397                 goto std_return;
3398
3399         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
3400         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3401         /*
3402          * The symlink will fit into the inode data fork?
3403          * There can't be any attributes so we get the whole variable part.
3404          */
3405         if (pathlen <= XFS_LITINO(mp))
3406                 fs_blocks = 0;
3407         else
3408                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3409         resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
3410         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3411                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3412         if (error == ENOSPC && fs_blocks == 0) {
3413                 resblks = 0;
3414                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3415                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3416         }
3417         if (error) {
3418                 cancel_flags = 0;
3419                 dp = NULL;
3420                 goto error_return;
3421         }
3422
3423         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
3424
3425         /*
3426          * Check whether the directory allows new symlinks or not.
3427          */
3428         if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
3429                 error = XFS_ERROR(EPERM);
3430                 goto error_return;
3431         }
3432
3433         /*
3434          * Reserve disk quota : blocks and inode.
3435          */
3436         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3437         if (error)
3438                 goto error_return;
3439
3440         /*
3441          * Check for ability to enter directory entry, if no space reserved.
3442          */
3443         if (resblks == 0 &&
3444             (error = xfs_dir_canenter(tp, dp, link_name, link_namelen)))
3445                 goto error_return;
3446         /*
3447          * Initialize the bmap freelist prior to calling either
3448          * bmapi or the directory create code.
3449          */
3450         XFS_BMAP_INIT(&free_list, &first_block);
3451
3452         /*
3453          * Allocate an inode for the symlink.
3454          */
3455         error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (vap->va_mode&~S_IFMT),
3456                                1, 0, credp, prid, resblks > 0, &ip, NULL);
3457         if (error) {
3458                 if (error == ENOSPC)
3459                         goto error_return;
3460                 goto error1;
3461         }
3462         ITRACE(ip);
3463
3464         VN_HOLD(dir_vp);
3465         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3466         dp_joined_to_trans = B_TRUE;
3467
3468         /*
3469          * Also attach the dquot(s) to it, if applicable.
3470          */
3471         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3472
3473         if (resblks)
3474                 resblks -= XFS_IALLOC_SPACE_RES(mp);
3475         /*
3476          * If the symlink will fit into the inode, write it inline.
3477          */
3478         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3479                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3480                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3481                 ip->i_d.di_size = pathlen;
3482
3483                 /*
3484                  * The inode was initially created in extent format.
3485                  */
3486                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3487                 ip->i_df.if_flags |= XFS_IFINLINE;
3488
3489                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3490                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3491
3492         } else {
3493                 first_fsb = 0;
3494                 nmaps = SYMLINK_MAPS;
3495
3496                 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3497                                   XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3498                                   &first_block, resblks, mval, &nmaps,
3499                                   &free_list, NULL);
3500                 if (error) {
3501                         goto error1;
3502                 }
3503
3504                 if (resblks)
3505                         resblks -= fs_blocks;
3506                 ip->i_d.di_size = pathlen;
3507                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3508
3509                 cur_chunk = target_path;
3510                 for (n = 0; n < nmaps; n++) {
3511                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3512                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3513                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3514                                                BTOBB(byte_cnt), 0);
3515                         ASSERT(bp && !XFS_BUF_GETERROR(bp));
3516                         if (pathlen < byte_cnt) {
3517                                 byte_cnt = pathlen;
3518                         }
3519                         pathlen -= byte_cnt;
3520
3521                         memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3522                         cur_chunk += byte_cnt;
3523
3524                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3525                 }
3526         }
3527
3528         /*
3529          * Create the directory entry for the symlink.
3530          */
3531         error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino,
3532                                    &first_block, &free_list, resblks);
3533         if (error)
3534                 goto error1;
3535         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3536         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3537
3538         /*
3539          * Bump the in memory version number of the parent directory
3540          * so that other processes accessing it will recognize that
3541          * the directory has changed.
3542          */
3543         dp->i_gen++;
3544
3545         /*
3546          * If this is a synchronous mount, make sure that the
3547          * symlink transaction goes to disk before returning to
3548          * the user.
3549          */
3550         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3551                 xfs_trans_set_sync(tp);
3552         }
3553
3554         /*
3555          * xfs_trans_commit normally decrements the vnode ref count
3556          * when it unlocks the inode. Since we want to return the
3557          * vnode to the caller, we bump the vnode ref count now.
3558          */
3559         IHOLD(ip);
3560
3561         error = xfs_bmap_finish(&tp, &free_list, &committed);
3562         if (error) {
3563                 goto error2;
3564         }
3565         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3566         XFS_QM_DQRELE(mp, udqp);
3567         XFS_QM_DQRELE(mp, gdqp);
3568
3569         /* Fall through to std_return with error = 0 or errno from
3570          * xfs_trans_commit     */
3571 std_return:
3572         if (DM_EVENT_ENABLED(XFS_BHVTOI(dir_bdp), DM_EVENT_POSTSYMLINK)) {
3573                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3574                                         dir_vp, DM_RIGHT_NULL,
3575                                         error ? NULL : XFS_ITOV(ip),
3576                                         DM_RIGHT_NULL, link_name, target_path,
3577                                         0, error, 0);
3578         }
3579
3580         if (!error) {
3581                 bhv_vnode_t *vp;
3582
3583                 ASSERT(ip);
3584                 vp = XFS_ITOV(ip);
3585                 *vpp = vp;
3586         }
3587         return error;
3588
3589  error2:
3590         IRELE(ip);
3591  error1:
3592         xfs_bmap_cancel(&free_list);
3593         cancel_flags |= XFS_TRANS_ABORT;
3594  error_return:
3595         xfs_trans_cancel(tp, cancel_flags);
3596         XFS_QM_DQRELE(mp, udqp);
3597         XFS_QM_DQRELE(mp, gdqp);
3598
3599         if (!dp_joined_to_trans && (dp != NULL)) {
3600                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3601         }
3602
3603         goto std_return;
3604 }
3605
3606
3607 /*
3608  * xfs_fid2
3609  *
3610  * A fid routine that takes a pointer to a previously allocated
3611  * fid structure (like xfs_fast_fid) but uses a 64 bit inode number.
3612  */
3613 STATIC int
3614 xfs_fid2(
3615         bhv_desc_t      *bdp,
3616         fid_t           *fidp)
3617 {
3618         xfs_inode_t     *ip;
3619         xfs_fid2_t      *xfid;
3620
3621         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
3622                                        (inst_t *)__return_address);
3623         ASSERT(sizeof(fid_t) >= sizeof(xfs_fid2_t));
3624
3625         xfid = (xfs_fid2_t *)fidp;
3626         ip = XFS_BHVTOI(bdp);
3627         xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len);
3628         xfid->fid_pad = 0;
3629         /*
3630          * use memcpy because the inode is a long long and there's no
3631          * assurance that xfid->fid_ino is properly aligned.
3632          */
3633         memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino));
3634         xfid->fid_gen = ip->i_d.di_gen;
3635
3636         return 0;
3637 }
3638
3639
3640 /*
3641  * xfs_rwlock
3642  */
3643 int
3644 xfs_rwlock(
3645         bhv_desc_t      *bdp,
3646         bhv_vrwlock_t   locktype)
3647 {
3648         xfs_inode_t     *ip;
3649         bhv_vnode_t     *vp;
3650
3651         vp = BHV_TO_VNODE(bdp);
3652         if (VN_ISDIR(vp))
3653                 return 1;
3654         ip = XFS_BHVTOI(bdp);
3655         if (locktype == VRWLOCK_WRITE) {
3656                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3657         } else if (locktype == VRWLOCK_TRY_READ) {
3658                 return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
3659         } else if (locktype == VRWLOCK_TRY_WRITE) {
3660                 return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
3661         } else {
3662                 ASSERT((locktype == VRWLOCK_READ) ||
3663                        (locktype == VRWLOCK_WRITE_DIRECT));
3664                 xfs_ilock(ip, XFS_IOLOCK_SHARED);
3665         }
3666
3667         return 1;
3668 }
3669
3670
3671 /*
3672  * xfs_rwunlock
3673  */
3674 void
3675 xfs_rwunlock(
3676         bhv_desc_t      *bdp,
3677         bhv_vrwlock_t   locktype)
3678 {
3679         xfs_inode_t     *ip;
3680         bhv_vnode_t     *vp;
3681
3682         vp = BHV_TO_VNODE(bdp);
3683         if (VN_ISDIR(vp))
3684                 return;
3685         ip = XFS_BHVTOI(bdp);
3686         if (locktype == VRWLOCK_WRITE) {
3687                 /*
3688                  * In the write case, we may have added a new entry to
3689                  * the reference cache.  This might store a pointer to
3690                  * an inode to be released in this inode.  If it is there,
3691                  * clear the pointer and release the inode after unlocking
3692                  * this one.
3693                  */
3694                 xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
3695         } else {
3696                 ASSERT((locktype == VRWLOCK_READ) ||
3697                        (locktype == VRWLOCK_WRITE_DIRECT));
3698                 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3699         }
3700         return;
3701 }
3702
3703 STATIC int
3704 xfs_inode_flush(
3705         bhv_desc_t      *bdp,
3706         int             flags)
3707 {
3708         xfs_inode_t     *ip;
3709         xfs_mount_t     *mp;
3710         xfs_inode_log_item_t *iip;
3711         int             error = 0;
3712
3713         ip = XFS_BHVTOI(bdp);
3714         mp = ip->i_mount;
3715         iip = ip->i_itemp;
3716
3717         if (XFS_FORCED_SHUTDOWN(mp))
3718                 return XFS_ERROR(EIO);
3719
3720         /*
3721          * Bypass inodes which have already been cleaned by
3722          * the inode flush clustering code inside xfs_iflush
3723          */
3724         if ((ip->i_update_core == 0) &&
3725             ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
3726                 return 0;
3727
3728         if (flags & FLUSH_LOG) {
3729                 if (iip && iip->ili_last_lsn) {
3730                         xlog_t          *log = mp->m_log;
3731                         xfs_lsn_t       sync_lsn;
3732                         int             s, log_flags = XFS_LOG_FORCE;
3733
3734                         s = GRANT_LOCK(log);
3735                         sync_lsn = log->l_last_sync_lsn;
3736                         GRANT_UNLOCK(log, s);
3737
3738                         if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) > 0)) {
3739                                 if (flags & FLUSH_SYNC)
3740                                         log_flags |= XFS_LOG_SYNC;
3741                                 error = xfs_log_force(mp, iip->ili_last_lsn, log_flags);
3742                                 if (error)
3743                                         return error;
3744                         }
3745
3746                         if (ip->i_update_core == 0)
3747                                 return 0;
3748                 }
3749         }
3750
3751         /*
3752          * We make this non-blocking if the inode is contended,
3753          * return EAGAIN to indicate to the caller that they
3754          * did not succeed. This prevents the flush path from
3755          * blocking on inodes inside another operation right
3756          * now, they get caught later by xfs_sync.
3757          */
3758         if (flags & FLUSH_INODE) {
3759                 int     flush_flags;
3760
3761                 if (flags & FLUSH_SYNC) {
3762                         xfs_ilock(ip, XFS_ILOCK_SHARED);
3763                         xfs_iflock(ip);
3764                 } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3765                         if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3766                                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3767                                 return EAGAIN;
3768                         }
3769                 } else {
3770                         return EAGAIN;
3771                 }
3772
3773                 if (flags & FLUSH_SYNC)
3774                         flush_flags = XFS_IFLUSH_SYNC;
3775                 else
3776                         flush_flags = XFS_IFLUSH_ASYNC;
3777
3778                 error = xfs_iflush(ip, flush_flags);
3779                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3780         }
3781
3782         return error;
3783 }
3784
3785 int
3786 xfs_set_dmattrs (
3787         bhv_desc_t      *bdp,
3788         u_int           evmask,
3789         u_int16_t       state,
3790         cred_t          *credp)
3791 {
3792         xfs_inode_t     *ip;
3793         xfs_trans_t     *tp;
3794         xfs_mount_t     *mp;
3795         int             error;
3796
3797         if (!capable(CAP_SYS_ADMIN))
3798                 return XFS_ERROR(EPERM);
3799
3800         ip = XFS_BHVTOI(bdp);
3801         mp = ip->i_mount;
3802
3803         if (XFS_FORCED_SHUTDOWN(mp))
3804                 return XFS_ERROR(EIO);
3805
3806         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3807         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3808         if (error) {
3809                 xfs_trans_cancel(tp, 0);
3810                 return error;
3811         }
3812         xfs_ilock(ip, XFS_ILOCK_EXCL);
3813         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3814
3815         ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask;
3816         ip->i_iocore.io_dmstate  = ip->i_d.di_dmstate  = state;
3817
3818         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3819         IHOLD(ip);
3820         error = xfs_trans_commit(tp, 0);
3821
3822         return error;
3823 }
3824
3825 STATIC int
3826 xfs_reclaim(
3827         bhv_desc_t      *bdp)
3828 {
3829         xfs_inode_t     *ip;
3830         bhv_vnode_t     *vp;
3831
3832         vp = BHV_TO_VNODE(bdp);
3833         ip = XFS_BHVTOI(bdp);
3834
3835         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
3836
3837         ASSERT(!VN_MAPPED(vp));
3838
3839         /* bad inode, get out here ASAP */
3840         if (VN_BAD(vp)) {
3841                 xfs_ireclaim(ip);
3842                 return 0;
3843         }
3844
3845         vn_iowait(vp);
3846
3847         ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
3848
3849         /*
3850          * Make sure the atime in the XFS inode is correct before freeing the
3851          * Linux inode.
3852          */
3853         xfs_synchronize_atime(ip);
3854
3855         /*
3856          * If we have nothing to flush with this inode then complete the
3857          * teardown now, otherwise break the link between the xfs inode and the
3858          * linux inode and clean up the xfs inode later. This avoids flushing
3859          * the inode to disk during the delete operation itself.
3860          *
3861          * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
3862          * first to ensure that xfs_iunpin() will never see an xfs inode
3863          * that has a linux inode being reclaimed. Synchronisation is provided
3864          * by the i_flags_lock.
3865          */
3866         if (!ip->i_update_core && (ip->i_itemp == NULL)) {
3867                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3868                 xfs_iflock(ip);
3869                 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3870         } else {
3871                 xfs_mount_t     *mp = ip->i_mount;
3872
3873                 /* Protect sync and unpin from us */
3874                 XFS_MOUNT_ILOCK(mp);
3875                 spin_lock(&ip->i_flags_lock);
3876                 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
3877                 vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
3878                 spin_unlock(&ip->i_flags_lock);
3879                 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
3880                 XFS_MOUNT_IUNLOCK(mp);
3881         }
3882         return 0;
3883 }
3884
3885 int
3886 xfs_finish_reclaim(
3887         xfs_inode_t     *ip,
3888         int             locked,
3889         int             sync_mode)
3890 {
3891         xfs_ihash_t     *ih = ip->i_hash;
3892         bhv_vnode_t     *vp = XFS_ITOV_NULL(ip);
3893         int             error;
3894
3895         if (vp && VN_BAD(vp))
3896                 goto reclaim;
3897
3898         /* The hash lock here protects a thread in xfs_iget_core from
3899          * racing with us on linking the inode back with a vnode.
3900          * Once we have the XFS_IRECLAIM flag set it will not touch
3901          * us.
3902          */
3903         write_lock(&ih->ih_lock);
3904         spin_lock(&ip->i_flags_lock);
3905         if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
3906             (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
3907                 spin_unlock(&ip->i_flags_lock);
3908                 write_unlock(&ih->ih_lock);
3909                 if (locked) {
3910                         xfs_ifunlock(ip);
3911                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3912                 }
3913                 return 1;
3914         }
3915         __xfs_iflags_set(ip, XFS_IRECLAIM);
3916         spin_unlock(&ip->i_flags_lock);
3917         write_unlock(&ih->ih_lock);
3918
3919         /*
3920          * If the inode is still dirty, then flush it out.  If the inode
3921          * is not in the AIL, then it will be OK to flush it delwri as
3922          * long as xfs_iflush() does not keep any references to the inode.
3923          * We leave that decision up to xfs_iflush() since it has the
3924          * knowledge of whether it's OK to simply do a delwri flush of
3925          * the inode or whether we need to wait until the inode is
3926          * pulled from the AIL.
3927          * We get the flush lock regardless, though, just to make sure
3928          * we don't free it while it is being flushed.
3929          */
3930         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3931                 if (!locked) {
3932                         xfs_ilock(ip, XFS_ILOCK_EXCL);
3933                         xfs_iflock(ip);
3934                 }
3935
3936                 if (ip->i_update_core ||
3937                     ((ip->i_itemp != NULL) &&
3938                      (ip->i_itemp->ili_format.ilf_fields != 0))) {
3939                         error = xfs_iflush(ip, sync_mode);
3940                         /*
3941                          * If we hit an error, typically because of filesystem
3942                          * shutdown, we don't need to let vn_reclaim to know
3943                          * because we're gonna reclaim the inode anyway.
3944                          */
3945                         if (error) {
3946                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3947                                 goto reclaim;
3948                         }
3949                         xfs_iflock(ip); /* synchronize with xfs_iflush_done */
3950                 }
3951
3952                 ASSERT(ip->i_update_core == 0);
3953                 ASSERT(ip->i_itemp == NULL ||
3954                        ip->i_itemp->ili_format.ilf_fields == 0);
3955                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3956         } else if (locked) {
3957                 /*
3958                  * We are not interested in doing an iflush if we're
3959                  * in the process of shutting down the filesystem forcibly.
3960                  * So, just reclaim the inode.
3961                  */
3962                 xfs_ifunlock(ip);
3963                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3964         }
3965
3966  reclaim:
3967         xfs_ireclaim(ip);
3968         return 0;
3969 }
3970
3971 int
3972 xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
3973 {
3974         int             purged;
3975         xfs_inode_t     *ip, *n;
3976         int             done = 0;
3977
3978         while (!done) {
3979                 purged = 0;
3980                 XFS_MOUNT_ILOCK(mp);
3981                 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
3982                         if (noblock) {
3983                                 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
3984                                         continue;
3985                                 if (xfs_ipincount(ip) ||
3986                                     !xfs_iflock_nowait(ip)) {
3987                                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3988                                         continue;
3989                                 }
3990                         }
3991                         XFS_MOUNT_IUNLOCK(mp);
3992                         if (xfs_finish_reclaim(ip, noblock,
3993                                         XFS_IFLUSH_DELWRI_ELSE_ASYNC))
3994                                 delay(1);
3995                         purged = 1;
3996                         break;
3997                 }
3998
3999                 done = !purged;
4000         }
4001
4002         XFS_MOUNT_IUNLOCK(mp);
4003         return 0;
4004 }
4005
4006 /*
4007  * xfs_alloc_file_space()
4008  *      This routine allocates disk space for the given file.
4009  *
4010  *      If alloc_type == 0, this request is for an ALLOCSP type
4011  *      request which will change the file size.  In this case, no
4012  *      DMAPI event will be generated by the call.  A TRUNCATE event
4013  *      will be generated later by xfs_setattr.
4014  *
4015  *      If alloc_type != 0, this request is for a RESVSP type
4016  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
4017  *      lower block boundary byte address is less than the file's
4018  *      length.
4019  *
4020  * RETURNS:
4021  *       0 on success
4022  *      errno on error
4023  *
4024  */
4025 STATIC int
4026 xfs_alloc_file_space(
4027         xfs_inode_t             *ip,
4028         xfs_off_t               offset,
4029         xfs_off_t               len,
4030         int                     alloc_type,
4031         int                     attr_flags)
4032 {
4033         xfs_mount_t             *mp = ip->i_mount;
4034         xfs_off_t               count;
4035         xfs_filblks_t           allocated_fsb;
4036         xfs_filblks_t           allocatesize_fsb;
4037         xfs_extlen_t            extsz, temp;
4038         xfs_fileoff_t           startoffset_fsb;
4039         xfs_fsblock_t           firstfsb;
4040         int                     nimaps;
4041         int                     bmapi_flag;
4042         int                     quota_flag;
4043         int                     rt;
4044         xfs_trans_t             *tp;
4045         xfs_bmbt_irec_t         imaps[1], *imapp;
4046         xfs_bmap_free_t         free_list;
4047         uint                    qblocks, resblks, resrtextents;
4048         int                     committed;
4049         int                     error;
4050
4051         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
4052
4053         if (XFS_FORCED_SHUTDOWN(mp))
4054                 return XFS_ERROR(EIO);
4055
4056         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4057                 return error;
4058
4059         if (len <= 0)
4060                 return XFS_ERROR(EINVAL);
4061
4062         rt = XFS_IS_REALTIME_INODE(ip);
4063         extsz = xfs_get_extsz_hint(ip);
4064
4065         count = len;
4066         imapp = &imaps[0];
4067         nimaps = 1;
4068         bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
4069         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
4070         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
4071
4072         /*      Generate a DMAPI event if needed.       */
4073         if (alloc_type != 0 && offset < ip->i_size &&
4074                         (attr_flags&ATTR_DMI) == 0  &&
4075                         DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
4076                 xfs_off_t           end_dmi_offset;
4077
4078                 end_dmi_offset = offset+len;
4079                 if (end_dmi_offset > ip->i_size)
4080                         end_dmi_offset = ip->i_size;
4081                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
4082                         offset, end_dmi_offset - offset,
4083                         0, NULL);
4084                 if (error)
4085                         return error;
4086         }
4087
4088         /*
4089          * Allocate file space until done or until there is an error
4090          */
4091 retry:
4092         while (allocatesize_fsb && !error) {
4093                 xfs_fileoff_t   s, e;
4094
4095                 /*
4096                  * Determine space reservations for data/realtime.
4097                  */
4098                 if (unlikely(extsz)) {
4099                         s = startoffset_fsb;
4100                         do_div(s, extsz);
4101                         s *= extsz;
4102                         e = startoffset_fsb + allocatesize_fsb;
4103                         if ((temp = do_mod(startoffset_fsb, extsz)))
4104                                 e += temp;
4105                         if ((temp = do_mod(e, extsz)))
4106                                 e += extsz - temp;
4107                 } else {
4108                         s = 0;
4109                         e = allocatesize_fsb;
4110                 }
4111
4112                 if (unlikely(rt)) {
4113                         resrtextents = qblocks = (uint)(e - s);
4114                         resrtextents /= mp->m_sb.sb_rextsize;
4115                         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4116                         quota_flag = XFS_QMOPT_RES_RTBLKS;
4117                 } else {
4118                         resrtextents = 0;
4119                         resblks = qblocks = \
4120                                 XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
4121                         quota_flag = XFS_QMOPT_RES_REGBLKS;
4122                 }
4123
4124                 /*
4125                  * Allocate and setup the transaction.
4126                  */
4127                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4128                 error = xfs_trans_reserve(tp, resblks,
4129                                           XFS_WRITE_LOG_RES(mp), resrtextents,
4130                                           XFS_TRANS_PERM_LOG_RES,
4131                                           XFS_WRITE_LOG_COUNT);
4132                 /*
4133                  * Check for running out of space
4134                  */
4135                 if (error) {
4136                         /*
4137                          * Free the transaction structure.
4138                          */
4139                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4140                         xfs_trans_cancel(tp, 0);
4141                         break;
4142                 }
4143                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4144                 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
4145                                                       qblocks, 0, quota_flag);
4146                 if (error)
4147                         goto error1;
4148
4149                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4150                 xfs_trans_ihold(tp, ip);
4151
4152                 /*
4153                  * Issue the xfs_bmapi() call to allocate the blocks
4154                  */
4155                 XFS_BMAP_INIT(&free_list, &firstfsb);
4156                 error = XFS_BMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
4157                                   allocatesize_fsb, bmapi_flag,
4158                                   &firstfsb, 0, imapp, &nimaps,
4159                                   &free_list, NULL);
4160                 if (error) {
4161                         goto error0;
4162                 }
4163
4164                 /*
4165                  * Complete the transaction
4166                  */
4167                 error = xfs_bmap_finish(&tp, &free_list, &committed);
4168                 if (error) {
4169                         goto error0;
4170                 }
4171
4172                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
4173                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4174                 if (error) {
4175                         break;
4176                 }
4177
4178                 allocated_fsb = imapp->br_blockcount;
4179
4180                 if (nimaps == 0) {
4181                         error = XFS_ERROR(ENOSPC);
4182                         break;
4183                 }
4184
4185                 startoffset_fsb += allocated_fsb;
4186                 allocatesize_fsb -= allocated_fsb;
4187         }
4188 dmapi_enospc_check:
4189         if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 &&
4190             DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
4191                 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
4192                                 XFS_ITOV(ip), DM_RIGHT_NULL,
4193                                 XFS_ITOV(ip), DM_RIGHT_NULL,
4194                                 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
4195                 if (error == 0)
4196                         goto retry;     /* Maybe DMAPI app. has made space */
4197                 /* else fall through with error from XFS_SEND_DATA */
4198         }
4199
4200         return error;
4201
4202 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
4203         xfs_bmap_cancel(&free_list);
4204         XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
4205
4206 error1: /* Just cancel transaction */
4207         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4208         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4209         goto dmapi_enospc_check;
4210 }
4211
4212 /*
4213  * Zero file bytes between startoff and endoff inclusive.
4214  * The iolock is held exclusive and no blocks are buffered.
4215  */
4216 STATIC int
4217 xfs_zero_remaining_bytes(
4218         xfs_inode_t             *ip,
4219         xfs_off_t               startoff,
4220         xfs_off_t               endoff)
4221 {
4222         xfs_bmbt_irec_t         imap;
4223         xfs_fileoff_t           offset_fsb;
4224         xfs_off_t               lastoffset;
4225         xfs_off_t               offset;
4226         xfs_buf_t               *bp;
4227         xfs_mount_t             *mp = ip->i_mount;
4228         int                     nimap;
4229         int                     error = 0;
4230
4231         bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
4232                                 ip->i_d.di_flags & XFS_DIFLAG_REALTIME ?
4233                                 mp->m_rtdev_targp : mp->m_ddev_targp);
4234
4235         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
4236                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
4237                 nimap = 1;
4238                 error = XFS_BMAPI(mp, NULL, &ip->i_iocore, offset_fsb, 1, 0,
4239                         NULL, 0, &imap, &nimap, NULL, NULL);
4240                 if (error || nimap < 1)
4241                         break;
4242                 ASSERT(imap.br_blockcount >= 1);
4243                 ASSERT(imap.br_startoff == offset_fsb);
4244                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
4245                 if (lastoffset > endoff)
4246                         lastoffset = endoff;
4247                 if (imap.br_startblock == HOLESTARTBLOCK)
4248                         continue;
4249                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4250                 if (imap.br_state == XFS_EXT_UNWRITTEN)
4251                         continue;
4252                 XFS_BUF_UNDONE(bp);
4253                 XFS_BUF_UNWRITE(bp);
4254                 XFS_BUF_READ(bp);
4255                 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
4256                 xfsbdstrat(mp, bp);
4257                 if ((error = xfs_iowait(bp))) {
4258                         xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
4259                                           mp, bp, XFS_BUF_ADDR(bp));
4260                         break;
4261                 }
4262                 memset(XFS_BUF_PTR(bp) +
4263                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
4264                       0, lastoffset - offset + 1);
4265                 XFS_BUF_UNDONE(bp);
4266                 XFS_BUF_UNREAD(bp);
4267                 XFS_BUF_WRITE(bp);
4268                 xfsbdstrat(mp, bp);
4269                 if ((error = xfs_iowait(bp))) {
4270                         xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
4271                                           mp, bp, XFS_BUF_ADDR(bp));
4272                         break;
4273                 }
4274         }
4275         xfs_buf_free(bp);
4276         return error;
4277 }
4278
4279 /*
4280  * xfs_free_file_space()
4281  *      This routine frees disk space for the given file.
4282  *
4283  *      This routine is only called by xfs_change_file_space
4284  *      for an UNRESVSP type call.
4285  *
4286  * RETURNS:
4287  *       0 on success
4288  *      errno on error
4289  *
4290  */
4291 STATIC int
4292 xfs_free_file_space(
4293         xfs_inode_t             *ip,
4294         xfs_off_t               offset,
4295         xfs_off_t               len,
4296         int                     attr_flags)
4297 {
4298         bhv_vnode_t             *vp;
4299         int                     committed;
4300         int                     done;
4301         xfs_off_t               end_dmi_offset;
4302         xfs_fileoff_t           endoffset_fsb;
4303         int                     error;
4304         xfs_fsblock_t           firstfsb;
4305         xfs_bmap_free_t         free_list;
4306         xfs_bmbt_irec_t         imap;
4307         xfs_off_t               ioffset;
4308         xfs_extlen_t            mod=0;
4309         xfs_mount_t             *mp;
4310         int                     nimap;
4311         uint                    resblks;
4312         uint                    rounding;
4313         int                     rt;
4314         xfs_fileoff_t           startoffset_fsb;
4315         xfs_trans_t             *tp;
4316         int                     need_iolock = 1;
4317
4318         vp = XFS_ITOV(ip);
4319         mp = ip->i_mount;
4320
4321         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4322
4323         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4324                 return error;
4325
4326         error = 0;
4327         if (len <= 0)   /* if nothing being freed */
4328                 return error;
4329         rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
4330         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
4331         end_dmi_offset = offset + len;
4332         endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
4333
4334         if (offset < ip->i_size && (attr_flags & ATTR_DMI) == 0 &&
4335             DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
4336                 if (end_dmi_offset > ip->i_size)
4337                         end_dmi_offset = ip->i_size;
4338                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
4339                                 offset, end_dmi_offset - offset,
4340                                 AT_DELAY_FLAG(attr_flags), NULL);
4341                 if (error)
4342                         return error;
4343         }
4344
4345         if (attr_flags & ATTR_NOLOCK)
4346                 need_iolock = 0;
4347         if (need_iolock) {
4348                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
4349                 vn_iowait(vp);  /* wait for the completion of any pending DIOs */
4350         }
4351
4352         rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, NBPP);
4353         ioffset = offset & ~(rounding - 1);
4354
4355         if (VN_CACHED(vp) != 0) {
4356                 xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1,
4357                                 ctooff(offtoct(ioffset)), -1);
4358                 error = bhv_vop_flushinval_pages(vp, ctooff(offtoct(ioffset)),
4359                                 -1, FI_REMAPF_LOCKED);
4360                 if (error)
4361                         goto out_unlock_iolock;
4362         }
4363
4364         /*
4365          * Need to zero the stuff we're not freeing, on disk.
4366          * If its a realtime file & can't use unwritten extents then we
4367          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
4368          * will take care of it for us.
4369          */
4370         if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
4371                 nimap = 1;
4372                 error = XFS_BMAPI(mp, NULL, &ip->i_iocore, startoffset_fsb,
4373                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4374                 if (error)
4375                         goto out_unlock_iolock;
4376                 ASSERT(nimap == 0 || nimap == 1);
4377                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4378                         xfs_daddr_t     block;
4379
4380                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4381                         block = imap.br_startblock;
4382                         mod = do_div(block, mp->m_sb.sb_rextsize);
4383                         if (mod)
4384                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
4385                 }
4386                 nimap = 1;
4387                 error = XFS_BMAPI(mp, NULL, &ip->i_iocore, endoffset_fsb - 1,
4388                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4389                 if (error)
4390                         goto out_unlock_iolock;
4391                 ASSERT(nimap == 0 || nimap == 1);
4392                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4393                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4394                         mod++;
4395                         if (mod && (mod != mp->m_sb.sb_rextsize))
4396                                 endoffset_fsb -= mod;
4397                 }
4398         }
4399         if ((done = (endoffset_fsb <= startoffset_fsb)))
4400                 /*
4401                  * One contiguous piece to clear
4402                  */
4403                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
4404         else {
4405                 /*
4406                  * Some full blocks, possibly two pieces to clear
4407                  */
4408                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
4409                         error = xfs_zero_remaining_bytes(ip, offset,
4410                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
4411                 if (!error &&
4412                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
4413                         error = xfs_zero_remaining_bytes(ip,
4414                                 XFS_FSB_TO_B(mp, endoffset_fsb),
4415                                 offset + len - 1);
4416         }
4417
4418         /*
4419          * free file space until done or until there is an error
4420          */
4421         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4422         while (!error && !done) {
4423
4424                 /*
4425                  * allocate and setup the transaction. Allow this
4426                  * transaction to dip into the reserve blocks to ensure
4427                  * the freeing of the space succeeds at ENOSPC.
4428                  */
4429                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4430                 tp->t_flags |= XFS_TRANS_RESERVE;
4431                 error = xfs_trans_reserve(tp,
4432                                           resblks,
4433                                           XFS_WRITE_LOG_RES(mp),
4434                                           0,
4435                                           XFS_TRANS_PERM_LOG_RES,
4436                                           XFS_WRITE_LOG_COUNT);
4437
4438                 /*
4439                  * check for running out of space
4440                  */
4441                 if (error) {
4442                         /*
4443                          * Free the transaction structure.
4444                          */
4445                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4446                         xfs_trans_cancel(tp, 0);
4447                         break;
4448                 }
4449                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4450                 error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
4451                                 ip->i_udquot, ip->i_gdquot, resblks, 0,
4452                                 XFS_QMOPT_RES_REGBLKS);
4453                 if (error)
4454                         goto error1;
4455
4456                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4457                 xfs_trans_ihold(tp, ip);
4458
4459                 /*
4460                  * issue the bunmapi() call to free the blocks
4461                  */
4462                 XFS_BMAP_INIT(&free_list, &firstfsb);
4463                 error = XFS_BUNMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
4464                                   endoffset_fsb - startoffset_fsb,
4465                                   0, 2, &firstfsb, &free_list, NULL, &done);
4466                 if (error) {
4467                         goto error0;
4468                 }
4469
4470                 /*
4471                  * complete the transaction
4472                  */
4473                 error = xfs_bmap_finish(&tp, &free_list, &committed);
4474                 if (error) {
4475                         goto error0;
4476                 }
4477
4478                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
4479                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4480         }
4481
4482  out_unlock_iolock:
4483         if (need_iolock)
4484                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
4485         return error;
4486
4487  error0:
4488         xfs_bmap_cancel(&free_list);
4489  error1:
4490         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4491         xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
4492                     XFS_ILOCK_EXCL);
4493         return error;
4494 }
4495
4496 /*
4497  * xfs_change_file_space()
4498  *      This routine allocates or frees disk space for the given file.
4499  *      The user specified parameters are checked for alignment and size
4500  *      limitations.
4501  *
4502  * RETURNS:
4503  *       0 on success
4504  *      errno on error
4505  *
4506  */
4507 int
4508 xfs_change_file_space(
4509         bhv_desc_t      *bdp,
4510         int             cmd,
4511         xfs_flock64_t   *bf,
4512         xfs_off_t       offset,
4513         cred_t          *credp,
4514         int             attr_flags)
4515 {
4516         int             clrprealloc;
4517         int             error;
4518         xfs_fsize_t     fsize;
4519         xfs_inode_t     *ip;
4520         xfs_mount_t     *mp;
4521         int             setprealloc;
4522         xfs_off_t       startoffset;
4523         xfs_off_t       llen;
4524         xfs_trans_t     *tp;
4525         bhv_vattr_t     va;
4526         bhv_vnode_t     *vp;
4527
4528         vp = BHV_TO_VNODE(bdp);
4529         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4530
4531         ip = XFS_BHVTOI(bdp);
4532         mp = ip->i_mount;
4533
4534         /*
4535          * must be a regular file and have write permission
4536          */
4537         if (!VN_ISREG(vp))
4538                 return XFS_ERROR(EINVAL);
4539
4540         xfs_ilock(ip, XFS_ILOCK_SHARED);
4541
4542         if ((error = xfs_iaccess(ip, S_IWUSR, credp))) {
4543                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
4544                 return error;
4545         }
4546
4547         xfs_iunlock(ip, XFS_ILOCK_SHARED);
4548
4549         switch (bf->l_whence) {
4550         case 0: /*SEEK_SET*/
4551                 break;
4552         case 1: /*SEEK_CUR*/
4553                 bf->l_start += offset;
4554                 break;
4555         case 2: /*SEEK_END*/
4556                 bf->l_start += ip->i_size;
4557                 break;
4558         default:
4559                 return XFS_ERROR(EINVAL);
4560         }
4561
4562         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
4563
4564         if (   (bf->l_start < 0)
4565             || (bf->l_start > XFS_MAXIOFFSET(mp))
4566             || (bf->l_start + llen < 0)
4567             || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
4568                 return XFS_ERROR(EINVAL);
4569
4570         bf->l_whence = 0;
4571
4572         startoffset = bf->l_start;
4573         fsize = ip->i_size;
4574
4575         /*
4576          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
4577          * file space.
4578          * These calls do NOT zero the data space allocated to the file,
4579          * nor do they change the file size.
4580          *
4581          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
4582          * space.
4583          * These calls cause the new file data to be zeroed and the file
4584          * size to be changed.
4585          */
4586         setprealloc = clrprealloc = 0;
4587
4588         switch (cmd) {
4589         case XFS_IOC_RESVSP:
4590         case XFS_IOC_RESVSP64:
4591                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
4592                                                                 1, attr_flags);
4593                 if (error)
4594                         return error;
4595                 setprealloc = 1;
4596                 break;
4597
4598         case XFS_IOC_UNRESVSP:
4599         case XFS_IOC_UNRESVSP64:
4600                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
4601                                                                 attr_flags)))
4602                         return error;
4603                 break;
4604
4605         case XFS_IOC_ALLOCSP:
4606         case XFS_IOC_ALLOCSP64:
4607         case XFS_IOC_FREESP:
4608         case XFS_IOC_FREESP64:
4609                 if (startoffset > fsize) {
4610                         error = xfs_alloc_file_space(ip, fsize,
4611                                         startoffset - fsize, 0, attr_flags);
4612                         if (error)
4613                                 break;
4614                 }
4615
4616                 va.va_mask = XFS_AT_SIZE;
4617                 va.va_size = startoffset;
4618
4619                 error = xfs_setattr(bdp, &va, attr_flags, credp);
4620
4621                 if (error)
4622                         return error;
4623
4624                 clrprealloc = 1;
4625                 break;
4626
4627         default:
4628                 ASSERT(0);
4629                 return XFS_ERROR(EINVAL);
4630         }
4631
4632         /*
4633          * update the inode timestamp, mode, and prealloc flag bits
4634          */
4635         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
4636
4637         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
4638                                       0, 0, 0))) {
4639                 /* ASSERT(0); */
4640                 xfs_trans_cancel(tp, 0);
4641                 return error;
4642         }
4643
4644         xfs_ilock(ip, XFS_ILOCK_EXCL);
4645
4646         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4647         xfs_trans_ihold(tp, ip);
4648
4649         if ((attr_flags & ATTR_DMI) == 0) {
4650                 ip->i_d.di_mode &= ~S_ISUID;
4651
4652                 /*
4653                  * Note that we don't have to worry about mandatory
4654                  * file locking being disabled here because we only
4655                  * clear the S_ISGID bit if the Group execute bit is
4656                  * on, but if it was on then mandatory locking wouldn't
4657                  * have been enabled.
4658                  */
4659                 if (ip->i_d.di_mode & S_IXGRP)
4660                         ip->i_d.di_mode &= ~S_ISGID;
4661
4662                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
4663         }
4664         if (setprealloc)
4665                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
4666         else if (clrprealloc)
4667                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
4668
4669         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
4670         xfs_trans_set_sync(tp);
4671
4672         error = xfs_trans_commit(tp, 0);
4673
4674         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4675
4676         return error;
4677 }
4678
4679 bhv_vnodeops_t xfs_vnodeops = {
4680         BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
4681         .vop_open               = xfs_open,
4682         .vop_read               = xfs_read,
4683 #ifdef HAVE_SPLICE
4684         .vop_splice_read        = xfs_splice_read,
4685         .vop_splice_write       = xfs_splice_write,
4686 #endif
4687         .vop_write              = xfs_write,
4688         .vop_ioctl              = xfs_ioctl,
4689         .vop_getattr            = xfs_getattr,
4690         .vop_setattr            = xfs_setattr,
4691         .vop_access             = xfs_access,
4692         .vop_lookup             = xfs_lookup,
4693         .vop_create             = xfs_create,
4694         .vop_remove             = xfs_remove,
4695         .vop_link               = xfs_link,
4696         .vop_rename             = xfs_rename,
4697         .vop_mkdir              = xfs_mkdir,
4698         .vop_rmdir              = xfs_rmdir,
4699         .vop_readdir            = xfs_readdir,
4700         .vop_symlink            = xfs_symlink,
4701         .vop_readlink           = xfs_readlink,
4702         .vop_fsync              = xfs_fsync,
4703         .vop_inactive           = xfs_inactive,
4704         .vop_fid2               = xfs_fid2,
4705         .vop_rwlock             = xfs_rwlock,
4706         .vop_rwunlock           = xfs_rwunlock,
4707         .vop_bmap               = xfs_bmap,
4708         .vop_reclaim            = xfs_reclaim,
4709         .vop_attr_get           = xfs_attr_get,
4710         .vop_attr_set           = xfs_attr_set,
4711         .vop_attr_remove        = xfs_attr_remove,
4712         .vop_attr_list          = xfs_attr_list,
4713         .vop_link_removed       = (vop_link_removed_t)fs_noval,
4714         .vop_vnode_change       = (vop_vnode_change_t)fs_noval,
4715         .vop_tosspages          = fs_tosspages,
4716         .vop_flushinval_pages   = fs_flushinval_pages,
4717         .vop_flush_pages        = fs_flush_pages,
4718         .vop_release            = xfs_release,
4719         .vop_iflush             = xfs_inode_flush,
4720 };