fs/xfs/xfs_vnodeops.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18
  19 #include "xfs.h"
  20 #include "xfs_fs.h"
  21 #include "xfs_types.h"
  22 #include "xfs_bit.h"
  23 #include "xfs_log.h"
  24 #include "xfs_inum.h"
  25 #include "xfs_trans.h"
  26 #include "xfs_sb.h"
  27 #include "xfs_ag.h"
  28 #include "xfs_dir2.h"
  29 #include "xfs_dmapi.h"
  30 #include "xfs_mount.h"
  31 #include "xfs_da_btree.h"
  32 #include "xfs_bmap_btree.h"
  33 #include "xfs_alloc_btree.h"
  34 #include "xfs_ialloc_btree.h"
  35 #include "xfs_dir2_sf.h"
  36 #include "xfs_attr_sf.h"
  37 #include "xfs_dinode.h"
  38 #include "xfs_inode.h"
  39 #include "xfs_inode_item.h"
  40 #include "xfs_itable.h"
  41 #include "xfs_btree.h"
  42 #include "xfs_ialloc.h"
  43 #include "xfs_alloc.h"
  44 #include "xfs_bmap.h"
  45 #include "xfs_attr.h"
  46 #include "xfs_rw.h"
  47 #include "xfs_error.h"
  48 #include "xfs_quota.h"
  49 #include "xfs_utils.h"
  50 #include "xfs_rtalloc.h"
  51 #include "xfs_refcache.h"
  52 #include "xfs_trans_space.h"
  53 #include "xfs_log_priv.h"
  54 #include "xfs_filestream.h"
  55
  56 STATIC int
  57 xfs_open(
  58         bhv_desc_t      *bdp,
  59         cred_t          *credp)
  60 {
  61         int             mode;
  62         bhv_vnode_t     *vp = BHV_TO_VNODE(bdp);
  63         xfs_inode_t     *ip = XFS_BHVTOI(bdp);
  64
  65         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
  66                 return XFS_ERROR(EIO);
  67
  68         /*
  69          * If it's a directory with any blocks, read-ahead block 0
  70          * as we're almost certain to have the next operation be a read there.
  71          */
  72         if (VN_ISDIR(vp) && ip->i_d.di_nextents > 0) {
  73                 mode = xfs_ilock_map_shared(ip);
  74                 if (ip->i_d.di_nextents > 0)
  75                         (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
  76                 xfs_iunlock(ip, mode);
  77         }
  78         return 0;
  79 }
  80
  81 /*
  82  * xfs_getattr
  83  */
  84 STATIC int
  85 xfs_getattr(
  86         bhv_desc_t      *bdp,
  87         bhv_vattr_t     *vap,
  88         int             flags,
  89         cred_t          *credp)
  90 {
  91         xfs_inode_t     *ip;
  92         xfs_mount_t     *mp;
  93         bhv_vnode_t     *vp;
  94
  95         vp  = BHV_TO_VNODE(bdp);
  96         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
  97
  98         ip = XFS_BHVTOI(bdp);
  99         mp = ip->i_mount;
 100
 101         if (XFS_FORCED_SHUTDOWN(mp))
 102                 return XFS_ERROR(EIO);
 103
 104         if (!(flags & ATTR_LAZY))
 105                 xfs_ilock(ip, XFS_ILOCK_SHARED);
 106
 107         vap->va_size = XFS_ISIZE(ip);
 108         if (vap->va_mask == XFS_AT_SIZE)
 109                 goto all_done;
 110
 111         vap->va_nblocks =
 112                 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
 113         vap->va_nodeid = ip->i_ino;
 114 #if XFS_BIG_INUMS
 115         vap->va_nodeid += mp->m_inoadd;
 116 #endif
 117         vap->va_nlink = ip->i_d.di_nlink;
 118
 119         /*
 120          * Quick exit for non-stat callers
 121          */
 122         if ((vap->va_mask &
 123             ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
 124               XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
 125                 goto all_done;
 126
 127         /*
 128          * Copy from in-core inode.
 129          */
 130         vap->va_mode = ip->i_d.di_mode;
 131         vap->va_uid = ip->i_d.di_uid;
 132         vap->va_gid = ip->i_d.di_gid;
 133         vap->va_projid = ip->i_d.di_projid;
 134
 135         /*
 136          * Check vnode type block/char vs. everything else.
 137          */
 138         switch (ip->i_d.di_mode & S_IFMT) {
 139         case S_IFBLK:
 140         case S_IFCHR:
 141                 vap->va_rdev = ip->i_df.if_u2.if_rdev;
 142                 vap->va_blocksize = BLKDEV_IOSIZE;
 143                 break;
 144         default:
 145                 vap->va_rdev = 0;
 146
 147                 if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
 148                         vap->va_blocksize = xfs_preferred_iosize(mp);
 149                 } else {
 150
 151                         /*
 152                          * If the file blocks are being allocated from a
 153                          * realtime partition, then return the inode's
 154                          * realtime extent size or the realtime volume's
 155                          * extent size.
 156                          */
 157                         vap->va_blocksize =
 158                                 xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
 159                 }
 160                 break;
 161         }
 162
 163         vn_atime_to_timespec(vp, &vap->va_atime);
 164         vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
 165         vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
 166         vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
 167         vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
 168
 169         /*
 170          * Exit for stat callers.  See if any of the rest of the fields
 171          * to be filled in are needed.
 172          */
 173         if ((vap->va_mask &
 174              (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
 175               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
 176                 goto all_done;
 177
 178         /*
 179          * Convert di_flags to xflags.
 180          */
 181         vap->va_xflags = xfs_ip2xflags(ip);
 182
 183         /*
 184          * Exit for inode revalidate.  See if any of the rest of
 185          * the fields to be filled in are needed.
 186          */
 187         if ((vap->va_mask &
 188              (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
 189               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
 190                 goto all_done;
 191
 192         vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
 193         vap->va_nextents =
 194                 (ip->i_df.if_flags & XFS_IFEXTENTS) ?
 195                         ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
 196                         ip->i_d.di_nextents;
 197         if (ip->i_afp)
 198                 vap->va_anextents =
 199                         (ip->i_afp->if_flags & XFS_IFEXTENTS) ?
 200                                 ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
 201                                  ip->i_d.di_anextents;
 202         else
 203                 vap->va_anextents = 0;
 204         vap->va_gen = ip->i_d.di_gen;
 205
 206  all_done:
 207         if (!(flags & ATTR_LAZY))
 208                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
 209         return 0;
 210 }
 211
 212
 213 /*
 214  * xfs_setattr
 215  */
 216 int
 217 xfs_setattr(
 218         bhv_desc_t              *bdp,
 219         bhv_vattr_t             *vap,
 220         int                     flags,
 221         cred_t                  *credp)
 222 {
 223         xfs_inode_t             *ip;
 224         xfs_trans_t             *tp;
 225         xfs_mount_t             *mp;
 226         int                     mask;
 227         int                     code;
 228         uint                    lock_flags;
 229         uint                    commit_flags=0;
 230         uid_t                   uid=0, iuid=0;
 231         gid_t                   gid=0, igid=0;
 232         int                     timeflags = 0;
 233         bhv_vnode_t             *vp;
 234         xfs_prid_t              projid=0, iprojid=0;
 235         int                     mandlock_before, mandlock_after;
 236         struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
 237         int                     file_owner;
 238         int                     need_iolock = 1;
 239
 240         vp = BHV_TO_VNODE(bdp);
 241         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 242
 243         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
 244                 return XFS_ERROR(EROFS);
 245
 246         /*
 247          * Cannot set certain attributes.
 248          */
 249         mask = vap->va_mask;
 250         if (mask & XFS_AT_NOSET) {
 251                 return XFS_ERROR(EINVAL);
 252         }
 253
 254         ip = XFS_BHVTOI(bdp);
 255         mp = ip->i_mount;
 256
 257         if (XFS_FORCED_SHUTDOWN(mp))
 258                 return XFS_ERROR(EIO);
 259
 260         /*
 261          * Timestamps do not need to be logged and hence do not
 262          * need to be done within a transaction.
 263          */
 264         if (mask & XFS_AT_UPDTIMES) {
 265                 ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
 266                 timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
 267                             ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
 268                             ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
 269                 xfs_ichgtime(ip, timeflags);
 270                 return 0;
 271         }
 272
 273         olddquot1 = olddquot2 = NULL;
 274         udqp = gdqp = NULL;
 275
 276         /*
 277          * If disk quotas is on, we make sure that the dquots do exist on disk,
 278          * before we start any other transactions. Trying to do this later
 279          * is messy. We don't care to take a readlock to look at the ids
 280          * in inode here, because we can't hold it across the trans_reserve.
 281          * If the IDs do change before we take the ilock, we're covered
 282          * because the i_*dquot fields will get updated anyway.
 283          */
 284         if (XFS_IS_QUOTA_ON(mp) &&
 285             (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
 286                 uint    qflags = 0;
 287
 288                 if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
 289                         uid = vap->va_uid;
 290                         qflags |= XFS_QMOPT_UQUOTA;
 291                 } else {
 292                         uid = ip->i_d.di_uid;
 293                 }
 294                 if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
 295                         gid = vap->va_gid;
 296                         qflags |= XFS_QMOPT_GQUOTA;
 297                 }  else {
 298                         gid = ip->i_d.di_gid;
 299                 }
 300                 if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
 301                         projid = vap->va_projid;
 302                         qflags |= XFS_QMOPT_PQUOTA;
 303                 }  else {
 304                         projid = ip->i_d.di_projid;
 305                 }
 306                 /*
 307                  * We take a reference when we initialize udqp and gdqp,
 308                  * so it is important that we never blindly double trip on
 309                  * the same variable. See xfs_create() for an example.
 310                  */
 311                 ASSERT(udqp == NULL);
 312                 ASSERT(gdqp == NULL);
 313                 code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
 314                                          &udqp, &gdqp);
 315                 if (code)
 316                         return code;
 317         }
 318
 319         /*
 320          * For the other attributes, we acquire the inode lock and
 321          * first do an error checking pass.
 322          */
 323         tp = NULL;
 324         lock_flags = XFS_ILOCK_EXCL;
 325         if (flags & ATTR_NOLOCK)
 326                 need_iolock = 0;
 327         if (!(mask & XFS_AT_SIZE)) {
 328                 if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
 329                     (mp->m_flags & XFS_MOUNT_WSYNC)) {
 330                         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
 331                         commit_flags = 0;
 332                         if ((code = xfs_trans_reserve(tp, 0,
 333                                                      XFS_ICHANGE_LOG_RES(mp), 0,
 334                                                      0, 0))) {
 335                                 lock_flags = 0;
 336                                 goto error_return;
 337                         }
 338                 }
 339         } else {
 340                 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
 341                     !(flags & ATTR_DMI)) {
 342                         int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
 343                         code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
 344                                 vap->va_size, 0, dmflags, NULL);
 345                         if (code) {
 346                                 lock_flags = 0;
 347                                 goto error_return;
 348                         }
 349                 }
 350                 if (need_iolock)
 351                         lock_flags |= XFS_IOLOCK_EXCL;
 352         }
 353
 354         xfs_ilock(ip, lock_flags);
 355
 356         /* boolean: are we the file owner? */
 357         file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
 358
 359         /*
 360          * Change various properties of a file.
 361          * Only the owner or users with CAP_FOWNER
 362          * capability may do these things.
 363          */
 364         if (mask &
 365             (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
 366              XFS_AT_GID|XFS_AT_PROJID)) {
 367                 /*
 368                  * CAP_FOWNER overrides the following restrictions:
 369                  *
 370                  * The user ID of the calling process must be equal
 371                  * to the file owner ID, except in cases where the
 372                  * CAP_FSETID capability is applicable.
 373                  */
 374                 if (!file_owner && !capable(CAP_FOWNER)) {
 375                         code = XFS_ERROR(EPERM);
 376                         goto error_return;
 377                 }
 378
 379                 /*
 380                  * CAP_FSETID overrides the following restrictions:
 381                  *
 382                  * The effective user ID of the calling process shall match
 383                  * the file owner when setting the set-user-ID and
 384                  * set-group-ID bits on that file.
 385                  *
 386                  * The effective group ID or one of the supplementary group
 387                  * IDs of the calling process shall match the group owner of
 388                  * the file when setting the set-group-ID bit on that file
 389                  */
 390                 if (mask & XFS_AT_MODE) {
 391                         mode_t m = 0;
 392
 393                         if ((vap->va_mode & S_ISUID) && !file_owner)
 394                                 m |= S_ISUID;
 395                         if ((vap->va_mode & S_ISGID) &&
 396                             !in_group_p((gid_t)ip->i_d.di_gid))
 397                                 m |= S_ISGID;
 398 #if 0
 399                         /* Linux allows this, Irix doesn't. */
 400                         if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp))
 401                                 m |= S_ISVTX;
 402 #endif
 403                         if (m && !capable(CAP_FSETID))
 404                                 vap->va_mode &= ~m;
 405                 }
 406         }
 407
 408         /*
 409          * Change file ownership.  Must be the owner or privileged.
 410          * If the system was configured with the "restricted_chown"
 411          * option, the owner is not permitted to give away the file,
 412          * and can change the group id only to a group of which he
 413          * or she is a member.
 414          */
 415         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
 416                 /*
 417                  * These IDs could have changed since we last looked at them.
 418                  * But, we're assured that if the ownership did change
 419                  * while we didn't have the inode locked, inode's dquot(s)
 420                  * would have changed also.
 421                  */
 422                 iuid = ip->i_d.di_uid;
 423                 iprojid = ip->i_d.di_projid;
 424                 igid = ip->i_d.di_gid;
 425                 gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
 426                 uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
 427                 projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
 428                          iprojid;
 429
 430                 /*
 431                  * CAP_CHOWN overrides the following restrictions:
 432                  *
 433                  * If _POSIX_CHOWN_RESTRICTED is defined, this capability
 434                  * shall override the restriction that a process cannot
 435                  * change the user ID of a file it owns and the restriction
 436                  * that the group ID supplied to the chown() function
 437                  * shall be equal to either the group ID or one of the
 438                  * supplementary group IDs of the calling process.
 439                  */
 440                 if (restricted_chown &&
 441                     (iuid != uid || (igid != gid &&
 442                                      !in_group_p((gid_t)gid))) &&
 443                     !capable(CAP_CHOWN)) {
 444                         code = XFS_ERROR(EPERM);
 445                         goto error_return;
 446                 }
 447                 /*
 448                  * Do a quota reservation only if uid/projid/gid is actually
 449                  * going to change.
 450                  */
 451                 if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
 452                     (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
 453                     (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
 454                         ASSERT(tp);
 455                         code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
 456                                                 capable(CAP_FOWNER) ?
 457                                                 XFS_QMOPT_FORCE_RES : 0);
 458                         if (code)       /* out of quota */
 459                                 goto error_return;
 460                 }
 461         }
 462
 463         /*
 464          * Truncate file.  Must have write permission and not be a directory.
 465          */
 466         if (mask & XFS_AT_SIZE) {
 467                 /* Short circuit the truncate case for zero length files */
 468                 if ((vap->va_size == 0) &&
 469                    (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) {
 470                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 471                         lock_flags &= ~XFS_ILOCK_EXCL;
 472                         if (mask & XFS_AT_CTIME)
 473                                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 474                         code = 0;
 475                         goto error_return;
 476                 }
 477
 478                 if (VN_ISDIR(vp)) {
 479                         code = XFS_ERROR(EISDIR);
 480                         goto error_return;
 481                 } else if (!VN_ISREG(vp)) {
 482                         code = XFS_ERROR(EINVAL);
 483                         goto error_return;
 484                 }
 485                 /*
 486                  * Make sure that the dquots are attached to the inode.
 487                  */
 488                 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
 489                         goto error_return;
 490         }
 491
 492         /*
 493          * Change file access or modified times.
 494          */
 495         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 496                 if (!file_owner) {
 497                         if ((flags & ATTR_UTIME) &&
 498                             !capable(CAP_FOWNER)) {
 499                                 code = XFS_ERROR(EPERM);
 500                                 goto error_return;
 501                         }
 502                 }
 503         }
 504
 505         /*
 506          * Change extent size or realtime flag.
 507          */
 508         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 509                 /*
 510                  * Can't change extent size if any extents are allocated.
 511                  */
 512                 if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
 513                     ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
 514                      vap->va_extsize) ) {
 515                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
 516                         goto error_return;
 517                 }
 518
 519                 /*
 520                  * Can't change realtime flag if any extents are allocated.
 521                  */
 522                 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
 523                     (mask & XFS_AT_XFLAGS) &&
 524                     (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
 525                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 526                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
 527                         goto error_return;
 528                 }
 529                 /*
 530                  * Extent size must be a multiple of the appropriate block
 531                  * size, if set at all.
 532                  */
 533                 if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
 534                         xfs_extlen_t    size;
 535
 536                         if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
 537                             ((mask & XFS_AT_XFLAGS) &&
 538                             (vap->va_xflags & XFS_XFLAG_REALTIME))) {
 539                                 size = mp->m_sb.sb_rextsize <<
 540                                        mp->m_sb.sb_blocklog;
 541                         } else {
 542                                 size = mp->m_sb.sb_blocksize;
 543                         }
 544                         if (vap->va_extsize % size) {
 545                                 code = XFS_ERROR(EINVAL);
 546                                 goto error_return;
 547                         }
 548                 }
 549                 /*
 550                  * If realtime flag is set then must have realtime data.
 551                  */
 552                 if ((mask & XFS_AT_XFLAGS) &&
 553                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 554                         if ((mp->m_sb.sb_rblocks == 0) ||
 555                             (mp->m_sb.sb_rextsize == 0) ||
 556                             (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
 557                                 code = XFS_ERROR(EINVAL);
 558                                 goto error_return;
 559                         }
 560                 }
 561
 562                 /*
 563                  * Can't modify an immutable/append-only file unless
 564                  * we have appropriate permission.
 565                  */
 566                 if ((mask & XFS_AT_XFLAGS) &&
 567                     (ip->i_d.di_flags &
 568                                 (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
 569                      (vap->va_xflags &
 570                                 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
 571                     !capable(CAP_LINUX_IMMUTABLE)) {
 572                         code = XFS_ERROR(EPERM);
 573                         goto error_return;
 574                 }
 575         }
 576
 577         /*
 578          * Now we can make the changes.  Before we join the inode
 579          * to the transaction, if XFS_AT_SIZE is set then take care of
 580          * the part of the truncation that must be done without the
 581          * inode lock.  This needs to be done before joining the inode
 582          * to the transaction, because the inode cannot be unlocked
 583          * once it is a part of the transaction.
 584          */
 585         if (mask & XFS_AT_SIZE) {
 586                 code = 0;
 587                 if ((vap->va_size > ip->i_size) &&
 588                     (flags & ATTR_NOSIZETOK) == 0) {
 589                         code = xfs_igrow_start(ip, vap->va_size, credp);
 590                 }
 591                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 592
 593                 /*
 594                  * We are going to log the inode size change in this
 595                  * transaction so any previous writes that are beyond the on
 596                  * disk EOF and the new EOF that have not been written out need
 597                  * to be written here. If we do not write the data out, we
 598                  * expose ourselves to the null files problem.
 599                  *
 600                  * Only flush from the on disk size to the smaller of the in
 601                  * memory file size or the new size as that's the range we
 602                  * really care about here and prevents waiting for other data
 603                  * not within the range we care about here.
 604                  */
 605                 if (!code &&
 606                     (ip->i_size != ip->i_d.di_size) &&
 607                     (vap->va_size > ip->i_d.di_size)) {
 608                         code = bhv_vop_flush_pages(XFS_ITOV(ip),
 609                                         ip->i_d.di_size, vap->va_size,
 610                                         XFS_B_ASYNC, FI_NONE);
 611                 }
 612
 613                 /* wait for all I/O to complete */
 614                 vn_iowait(vp);
 615
 616                 if (!code)
 617                         code = xfs_itruncate_data(ip, vap->va_size);
 618                 if (code) {
 619                         ASSERT(tp == NULL);
 620                         lock_flags &= ~XFS_ILOCK_EXCL;
 621                         ASSERT(lock_flags == XFS_IOLOCK_EXCL);
 622                         goto error_return;
 623                 }
 624                 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 625                 if ((code = xfs_trans_reserve(tp, 0,
 626                                              XFS_ITRUNCATE_LOG_RES(mp), 0,
 627                                              XFS_TRANS_PERM_LOG_RES,
 628                                              XFS_ITRUNCATE_LOG_COUNT))) {
 629                         xfs_trans_cancel(tp, 0);
 630                         if (need_iolock)
 631                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 632                         return code;
 633                 }
 634                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
 635                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 636         }
 637
 638         if (tp) {
 639                 xfs_trans_ijoin(tp, ip, lock_flags);
 640                 xfs_trans_ihold(tp, ip);
 641         }
 642
 643         /* determine whether mandatory locking mode changes */
 644         mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);
 645
 646         /*
 647          * Truncate file.  Must have write permission and not be a directory.
 648          */
 649         if (mask & XFS_AT_SIZE) {
 650                 if (vap->va_size > ip->i_size) {
 651                         xfs_igrow_finish(tp, ip, vap->va_size,
 652                             !(flags & ATTR_DMI));
 653                 } else if ((vap->va_size <= ip->i_size) ||
 654                            ((vap->va_size == 0) && ip->i_d.di_nextents)) {
 655                         /*
 656                          * signal a sync transaction unless
 657                          * we're truncating an already unlinked
 658                          * file on a wsync filesystem
 659                          */
 660                         code = xfs_itruncate_finish(&tp, ip,
 661                                             (xfs_fsize_t)vap->va_size,
 662                                             XFS_DATA_FORK,
 663                                             ((ip->i_d.di_nlink != 0 ||
 664                                               !(mp->m_flags & XFS_MOUNT_WSYNC))
 665                                              ? 1 : 0));
 666                         if (code)
 667                                 goto abort_return;
 668                         /*
 669                          * Truncated "down", so we're removing references
 670                          * to old data here - if we now delay flushing for
 671                          * a long time, we expose ourselves unduly to the
 672                          * notorious NULL files problem.  So, we mark this
 673                          * vnode and flush it when the file is closed, and
 674                          * do not wait the usual (long) time for writeout.
 675                          */
 676                         VTRUNCATE(vp);
 677                 }
 678                 /*
 679                  * Have to do this even if the file's size doesn't change.
 680                  */
 681                 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
 682         }
 683
 684         /*
 685          * Change file access modes.
 686          */
 687         if (mask & XFS_AT_MODE) {
 688                 ip->i_d.di_mode &= S_IFMT;
 689                 ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
 690
 691                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 692                 timeflags |= XFS_ICHGTIME_CHG;
 693         }
 694
 695         /*
 696          * Change file ownership.  Must be the owner or privileged.
 697          * If the system was configured with the "restricted_chown"
 698          * option, the owner is not permitted to give away the file,
 699          * and can change the group id only to a group of which he
 700          * or she is a member.
 701          */
 702         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
 703                 /*
 704                  * CAP_FSETID overrides the following restrictions:
 705                  *
 706                  * The set-user-ID and set-group-ID bits of a file will be
 707                  * cleared upon successful return from chown()
 708                  */
 709                 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
 710                     !capable(CAP_FSETID)) {
 711                         ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
 712                 }
 713
 714                 /*
 715                  * Change the ownerships and register quota modifications
 716                  * in the transaction.
 717                  */
 718                 if (iuid != uid) {
 719                         if (XFS_IS_UQUOTA_ON(mp)) {
 720                                 ASSERT(mask & XFS_AT_UID);
 721                                 ASSERT(udqp);
 722                                 olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 723                                                         &ip->i_udquot, udqp);
 724                         }
 725                         ip->i_d.di_uid = uid;
 726                 }
 727                 if (igid != gid) {
 728                         if (XFS_IS_GQUOTA_ON(mp)) {
 729                                 ASSERT(!XFS_IS_PQUOTA_ON(mp));
 730                                 ASSERT(mask & XFS_AT_GID);
 731                                 ASSERT(gdqp);
 732                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 733                                                         &ip->i_gdquot, gdqp);
 734                         }
 735                         ip->i_d.di_gid = gid;
 736                 }
 737                 if (iprojid != projid) {
 738                         if (XFS_IS_PQUOTA_ON(mp)) {
 739                                 ASSERT(!XFS_IS_GQUOTA_ON(mp));
 740                                 ASSERT(mask & XFS_AT_PROJID);
 741                                 ASSERT(gdqp);
 742                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 743                                                         &ip->i_gdquot, gdqp);
 744                         }
 745                         ip->i_d.di_projid = projid;
 746                         /*
 747                          * We may have to rev the inode as well as
 748                          * the superblock version number since projids didn't
 749                          * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
 750                          */
 751                         if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
 752                                 xfs_bump_ino_vers2(tp, ip);
 753                 }
 754
 755                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 756                 timeflags |= XFS_ICHGTIME_CHG;
 757         }
 758
 759
 760         /*
 761          * Change file access or modified times.
 762          */
 763         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 764                 if (mask & XFS_AT_ATIME) {
 765                         ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
 766                         ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
 767                         ip->i_update_core = 1;
 768                         timeflags &= ~XFS_ICHGTIME_ACC;
 769                 }
 770                 if (mask & XFS_AT_MTIME) {
 771                         ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
 772                         ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
 773                         timeflags &= ~XFS_ICHGTIME_MOD;
 774                         timeflags |= XFS_ICHGTIME_CHG;
 775                 }
 776                 if (tp && (flags & ATTR_UTIME))
 777                         xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 778         }
 779
 780         /*
 781          * Change XFS-added attributes.
 782          */
 783         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 784                 if (mask & XFS_AT_EXTSIZE) {
 785                         /*
 786                          * Converting bytes to fs blocks.
 787                          */
 788                         ip->i_d.di_extsize = vap->va_extsize >>
 789                                 mp->m_sb.sb_blocklog;
 790                 }
 791                 if (mask & XFS_AT_XFLAGS) {
 792                         uint    di_flags;
 793
 794                         /* can't set PREALLOC this way, just preserve it */
 795                         di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
 796                         if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
 797                                 di_flags |= XFS_DIFLAG_IMMUTABLE;
 798                         if (vap->va_xflags & XFS_XFLAG_APPEND)
 799                                 di_flags |= XFS_DIFLAG_APPEND;
 800                         if (vap->va_xflags & XFS_XFLAG_SYNC)
 801                                 di_flags |= XFS_DIFLAG_SYNC;
 802                         if (vap->va_xflags & XFS_XFLAG_NOATIME)
 803                                 di_flags |= XFS_DIFLAG_NOATIME;
 804                         if (vap->va_xflags & XFS_XFLAG_NODUMP)
 805                                 di_flags |= XFS_DIFLAG_NODUMP;
 806                         if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
 807                                 di_flags |= XFS_DIFLAG_PROJINHERIT;
 808                         if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
 809                                 di_flags |= XFS_DIFLAG_NODEFRAG;
 810                         if (vap->va_xflags & XFS_XFLAG_FILESTREAM)
 811                                 di_flags |= XFS_DIFLAG_FILESTREAM;
 812                         if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
 813                                 if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
 814                                         di_flags |= XFS_DIFLAG_RTINHERIT;
 815                                 if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
 816                                         di_flags |= XFS_DIFLAG_NOSYMLINKS;
 817                                 if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
 818                                         di_flags |= XFS_DIFLAG_EXTSZINHERIT;
 819                         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
 820                                 if (vap->va_xflags & XFS_XFLAG_REALTIME) {
 821                                         di_flags |= XFS_DIFLAG_REALTIME;
 822                                         ip->i_iocore.io_flags |= XFS_IOCORE_RT;
 823                                 } else {
 824                                         ip->i_iocore.io_flags &= ~XFS_IOCORE_RT;
 825                                 }
 826                                 if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
 827                                         di_flags |= XFS_DIFLAG_EXTSIZE;
 828                         }
 829                         ip->i_d.di_flags = di_flags;
 830                 }
 831                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 832                 timeflags |= XFS_ICHGTIME_CHG;
 833         }
 834
 835         /*
 836          * Change file inode change time only if XFS_AT_CTIME set
 837          * AND we have been called by a DMI function.
 838          */
 839
 840         if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
 841                 ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
 842                 ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
 843                 ip->i_update_core = 1;
 844                 timeflags &= ~XFS_ICHGTIME_CHG;
 845         }
 846
 847         /*
 848          * Send out timestamp changes that need to be set to the
 849          * current time.  Not done when called by a DMI function.
 850          */
 851         if (timeflags && !(flags & ATTR_DMI))
 852                 xfs_ichgtime(ip, timeflags);
 853
 854         XFS_STATS_INC(xs_ig_attrchg);
 855
 856         /*
 857          * If this is a synchronous mount, make sure that the
 858          * transaction goes to disk before returning to the user.
 859          * This is slightly sub-optimal in that truncates require
 860          * two sync transactions instead of one for wsync filesystems.
 861          * One for the truncate and one for the timestamps since we
 862          * don't want to change the timestamps unless we're sure the
 863          * truncate worked.  Truncates are less than 1% of the laddis
 864          * mix so this probably isn't worth the trouble to optimize.
 865          */
 866         code = 0;
 867         if (tp) {
 868                 if (mp->m_flags & XFS_MOUNT_WSYNC)
 869                         xfs_trans_set_sync(tp);
 870
 871                 code = xfs_trans_commit(tp, commit_flags);
 872         }
 873
 874         /*
 875          * If the (regular) file's mandatory locking mode changed, then
 876          * notify the vnode.  We do this under the inode lock to prevent
 877          * racing calls to vop_vnode_change.
 878          */
 879         mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
 880         if (mandlock_before != mandlock_after) {
 881                 bhv_vop_vnode_change(vp, VCHANGE_FLAGS_ENF_LOCKING,
 882                                  mandlock_after);
 883         }
 884
 885         xfs_iunlock(ip, lock_flags);
 886
 887         /*
 888          * Release any dquot(s) the inode had kept before chown.
 889          */
 890         XFS_QM_DQRELE(mp, olddquot1);
 891         XFS_QM_DQRELE(mp, olddquot2);
 892         XFS_QM_DQRELE(mp, udqp);
 893         XFS_QM_DQRELE(mp, gdqp);
 894
 895         if (code) {
 896                 return code;
 897         }
 898
 899         if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
 900             !(flags & ATTR_DMI)) {
 901                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
 902                                         NULL, DM_RIGHT_NULL, NULL, NULL,
 903                                         0, 0, AT_DELAY_FLAG(flags));
 904         }
 905         return 0;
 906
 907  abort_return:
 908         commit_flags |= XFS_TRANS_ABORT;
 909         /* FALLTHROUGH */
 910  error_return:
 911         XFS_QM_DQRELE(mp, udqp);
 912         XFS_QM_DQRELE(mp, gdqp);
 913         if (tp) {
 914                 xfs_trans_cancel(tp, commit_flags);
 915         }
 916         if (lock_flags != 0) {
 917                 xfs_iunlock(ip, lock_flags);
 918         }
 919         return code;
 920 }
 921
 922
 923 /*
 924  * xfs_access
 925  * Null conversion from vnode mode bits to inode mode bits, as in efs.
 926  */
 927 STATIC int
 928 xfs_access(
 929         bhv_desc_t      *bdp,
 930         int             mode,
 931         cred_t          *credp)
 932 {
 933         xfs_inode_t     *ip;
 934         int             error;
 935
 936         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
 937                                                (inst_t *)__return_address);
 938
 939         ip = XFS_BHVTOI(bdp);
 940         xfs_ilock(ip, XFS_ILOCK_SHARED);
 941         error = xfs_iaccess(ip, mode, credp);
 942         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 943         return error;
 944 }
 945
 946
 /*
  * Upper bound on the number of extent mappings requested from bmapi
  * for the blocks of a symlink; it sizes the mval[] arrays used by
  * xfs_readlink_bmap() and xfs_inactive_symlink_rmt().
  *
  * The maximum pathlen is 1024 bytes. Since the minimum file system
  * blocksize is 512 bytes, we can get a max of 2 extents back from
  * bmapi.
  */
#define SYMLINK_MAPS 2
 953
 954 STATIC int
 955 xfs_readlink_bmap(
 956         xfs_inode_t     *ip,
 957         char            *link)
 958 {
 959         xfs_mount_t     *mp = ip->i_mount;
 960         int             pathlen = ip->i_d.di_size;
 961         int             nmaps = SYMLINK_MAPS;
 962         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
 963         xfs_daddr_t     d;
 964         int             byte_cnt;
 965         int             n;
 966         xfs_buf_t       *bp;
 967         int             error = 0;
 968
 969         error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
 970                         mval, &nmaps, NULL, NULL);
 971         if (error)
 972                 goto out;
 973
 974         for (n = 0; n < nmaps; n++) {
 975                 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
 976                 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 977
 978                 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
 979                 error = XFS_BUF_GETERROR(bp);
 980                 if (error) {
 981                         xfs_ioerror_alert("xfs_readlink",
 982                                   ip->i_mount, bp, XFS_BUF_ADDR(bp));
 983                         xfs_buf_relse(bp);
 984                         goto out;
 985                 }
 986                 if (pathlen < byte_cnt)
 987                         byte_cnt = pathlen;
 988                 pathlen -= byte_cnt;
 989
 990                 memcpy(link, XFS_BUF_PTR(bp), byte_cnt);
 991                 xfs_buf_relse(bp);
 992         }
 993
 994         link[ip->i_d.di_size] = '\0';
 995         error = 0;
 996
 997  out:
 998         return error;
 999 }
1000
1001 /*
1002  * xfs_readlink
1003  *
1004  */
1005 STATIC int
1006 xfs_readlink(
1007         bhv_desc_t      *bdp,
1008         char            *link)
1009 {
1010         xfs_inode_t     *ip = XFS_BHVTOI(bdp);
1011         xfs_mount_t     *mp = ip->i_mount;
1012         int             pathlen;
1013         int             error = 0;
1014
1015         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
1016
1017         if (XFS_FORCED_SHUTDOWN(mp))
1018                 return XFS_ERROR(EIO);
1019
1020         xfs_ilock(ip, XFS_ILOCK_SHARED);
1021
1022         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
1023         ASSERT(ip->i_d.di_size <= MAXPATHLEN);
1024
1025         pathlen = ip->i_d.di_size;
1026         if (!pathlen)
1027                 goto out;
1028
1029         if (ip->i_df.if_flags & XFS_IFINLINE) {
1030                 memcpy(link, ip->i_df.if_u1.if_data, pathlen);
1031                 link[pathlen] = '\0';
1032         } else {
1033                 error = xfs_readlink_bmap(ip, link);
1034         }
1035
1036  out:
1037         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1038         return error;
1039 }
1040
1041 /*
1042  * xfs_fsync
1043  *
1044  * This is called to sync the inode and its data out to disk.
1045  * We need to hold the I/O lock while flushing the data, and
1046  * the inode lock while flushing the inode.  The inode lock CANNOT
1047  * be held while flushing the data, so acquire after we're done
1048  * with that.
1049  */
1050 STATIC int
1051 xfs_fsync(
1052         bhv_desc_t      *bdp,
1053         int             flag,
1054         cred_t          *credp,
1055         xfs_off_t       start,
1056         xfs_off_t       stop)
1057 {
1058         xfs_inode_t     *ip;
1059         xfs_trans_t     *tp;
1060         int             error;
1061         int             log_flushed = 0, changed = 1;
1062
1063         vn_trace_entry(BHV_TO_VNODE(bdp),
1064                         __FUNCTION__, (inst_t *)__return_address);
1065
1066         ip = XFS_BHVTOI(bdp);
1067
1068         ASSERT(start >= 0 && stop >= -1);
1069
1070         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1071                 return XFS_ERROR(EIO);
1072
1073         if (flag & FSYNC_DATA)
1074                 filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
1075
1076         /*
1077          * We always need to make sure that the required inode state
1078          * is safe on disk.  The vnode might be clean but because
1079          * of committed transactions that haven't hit the disk yet.
1080          * Likewise, there could be unflushed non-transactional
1081          * changes to the inode core that have to go to disk.
1082          *
1083          * The following code depends on one assumption:  that
1084          * any transaction that changes an inode logs the core
1085          * because it has to change some field in the inode core
1086          * (typically nextents or nblocks).  That assumption
1087          * implies that any transactions against an inode will
1088          * catch any non-transactional updates.  If inode-altering
1089          * transactions exist that violate this assumption, the
1090          * code breaks.  Right now, it figures that if the involved
1091          * update_* field is clear and the inode is unpinned, the
1092          * inode is clean.  Either it's been flushed or it's been
1093          * committed and the commit has hit the disk unpinning the inode.
1094          * (Note that xfs_inode_item_format() called at commit clears
1095          * the update_* fields.)
1096          */
1097         xfs_ilock(ip, XFS_ILOCK_SHARED);
1098
1099         /* If we are flushing data then we care about update_size
1100          * being set, otherwise we care about update_core
1101          */
1102         if ((flag & FSYNC_DATA) ?
1103                         (ip->i_update_size == 0) :
1104                         (ip->i_update_core == 0)) {
1105                 /*
1106                  * Timestamps/size haven't changed since last inode
1107                  * flush or inode transaction commit.  That means
1108                  * either nothing got written or a transaction
1109                  * committed which caught the updates.  If the
1110                  * latter happened and the transaction hasn't
1111                  * hit the disk yet, the inode will be still
1112                  * be pinned.  If it is, force the log.
1113                  */
1114
1115                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1116
1117                 if (xfs_ipincount(ip)) {
1118                         _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
1119                                       XFS_LOG_FORCE |
1120                                       ((flag & FSYNC_WAIT)
1121                                        ? XFS_LOG_SYNC : 0),
1122                                       &log_flushed);
1123                 } else {
1124                         /*
1125                          * If the inode is not pinned and nothing
1126                          * has changed we don't need to flush the
1127                          * cache.
1128                          */
1129                         changed = 0;
1130                 }
1131                 error = 0;
1132         } else  {
1133                 /*
1134                  * Kick off a transaction to log the inode
1135                  * core to get the updates.  Make it
1136                  * sync if FSYNC_WAIT is passed in (which
1137                  * is done by everybody but specfs).  The
1138                  * sync transaction will also force the log.
1139                  */
1140                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1141                 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
1142                 if ((error = xfs_trans_reserve(tp, 0,
1143                                 XFS_FSYNC_TS_LOG_RES(ip->i_mount),
1144                                 0, 0, 0)))  {
1145                         xfs_trans_cancel(tp, 0);
1146                         return error;
1147                 }
1148                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1149
1150                 /*
1151                  * Note - it's possible that we might have pushed
1152                  * ourselves out of the way during trans_reserve
1153                  * which would flush the inode.  But there's no
1154                  * guarantee that the inode buffer has actually
1155                  * gone out yet (it's delwri).  Plus the buffer
1156                  * could be pinned anyway if it's part of an
1157                  * inode in another recent transaction.  So we
1158                  * play it safe and fire off the transaction anyway.
1159                  */
1160                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1161                 xfs_trans_ihold(tp, ip);
1162                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1163                 if (flag & FSYNC_WAIT)
1164                         xfs_trans_set_sync(tp);
1165                 error = _xfs_trans_commit(tp, 0, &log_flushed);
1166
1167                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1168         }
1169
1170         if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
1171                 /*
1172                  * If the log write didn't issue an ordered tag we need
1173                  * to flush the disk cache for the data device now.
1174                  */
1175                 if (!log_flushed)
1176                         xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
1177
1178                 /*
1179                  * If this inode is on the RT dev we need to flush that
1180                  * cache as well.
1181                  */
1182                 if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
1183                         xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
1184         }
1185
1186         return error;
1187 }
1188
1189 /*
1190  * This is called by xfs_inactive to free any blocks beyond eof
1191  * when the link count isn't zero and by xfs_dm_punch_hole() when
1192  * punching a hole to EOF.
1193  */
1194 int
1195 xfs_free_eofblocks(
1196         xfs_mount_t     *mp,
1197         xfs_inode_t     *ip,
1198         int             flags)
1199 {
1200         xfs_trans_t     *tp;
1201         int             error;
1202         xfs_fileoff_t   end_fsb;
1203         xfs_fileoff_t   last_fsb;
1204         xfs_filblks_t   map_len;
1205         int             nimaps;
1206         xfs_bmbt_irec_t imap;
1207         int             use_iolock = (flags & XFS_FREE_EOF_LOCK);
1208
1209         /*
1210          * Figure out if there are any blocks beyond the end
1211          * of the file.  If not, then there is nothing to do.
1212          */
1213         end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
1214         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1215         map_len = last_fsb - end_fsb;
1216         if (map_len <= 0)
1217                 return 0;
1218
1219         nimaps = 1;
1220         xfs_ilock(ip, XFS_ILOCK_SHARED);
1221         error = XFS_BMAPI(mp, NULL, &ip->i_iocore, end_fsb, map_len, 0,
1222                           NULL, 0, &imap, &nimaps, NULL, NULL);
1223         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1224
1225         if (!error && (nimaps != 0) &&
1226             (imap.br_startblock != HOLESTARTBLOCK ||
1227              ip->i_delayed_blks)) {
1228                 /*
1229                  * Attach the dquots to the inode up front.
1230                  */
1231                 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1232                         return error;
1233
1234                 /*
1235                  * There are blocks after the end of file.
1236                  * Free them up now by truncating the file to
1237                  * its current size.
1238                  */
1239                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1240
1241                 /*
1242                  * Do the xfs_itruncate_start() call before
1243                  * reserving any log space because
1244                  * itruncate_start will call into the buffer
1245                  * cache and we can't
1246                  * do that within a transaction.
1247                  */
1248                 if (use_iolock)
1249                         xfs_ilock(ip, XFS_IOLOCK_EXCL);
1250                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1251                                     ip->i_size);
1252                 if (error) {
1253                         xfs_trans_cancel(tp, 0);
1254                         if (use_iolock)
1255                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1256                         return error;
1257                 }
1258
1259                 error = xfs_trans_reserve(tp, 0,
1260                                           XFS_ITRUNCATE_LOG_RES(mp),
1261                                           0, XFS_TRANS_PERM_LOG_RES,
1262                                           XFS_ITRUNCATE_LOG_COUNT);
1263                 if (error) {
1264                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1265                         xfs_trans_cancel(tp, 0);
1266                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1267                         return error;
1268                 }
1269
1270                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1271                 xfs_trans_ijoin(tp, ip,
1272                                 XFS_IOLOCK_EXCL |
1273                                 XFS_ILOCK_EXCL);
1274                 xfs_trans_ihold(tp, ip);
1275
1276                 error = xfs_itruncate_finish(&tp, ip,
1277                                              ip->i_size,
1278                                              XFS_DATA_FORK,
1279                                              0);
1280                 /*
1281                  * If we get an error at this point we
1282                  * simply don't bother truncating the file.
1283                  */
1284                 if (error) {
1285                         xfs_trans_cancel(tp,
1286                                          (XFS_TRANS_RELEASE_LOG_RES |
1287                                           XFS_TRANS_ABORT));
1288                 } else {
1289                         error = xfs_trans_commit(tp,
1290                                                 XFS_TRANS_RELEASE_LOG_RES);
1291                 }
1292                 xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
1293                                             : XFS_ILOCK_EXCL));
1294         }
1295         return error;
1296 }
1297
1298 /*
1299  * Free a symlink that has blocks associated with it.
1300  */
1301 STATIC int
1302 xfs_inactive_symlink_rmt(
1303         xfs_inode_t     *ip,
1304         xfs_trans_t     **tpp)
1305 {
1306         xfs_buf_t       *bp;
1307         int             committed;
1308         int             done;
1309         int             error;
1310         xfs_fsblock_t   first_block;
1311         xfs_bmap_free_t free_list;
1312         int             i;
1313         xfs_mount_t     *mp;
1314         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
1315         int             nmaps;
1316         xfs_trans_t     *ntp;
1317         int             size;
1318         xfs_trans_t     *tp;
1319
1320         tp = *tpp;
1321         mp = ip->i_mount;
1322         ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
1323         /*
1324          * We're freeing a symlink that has some
1325          * blocks allocated to it.  Free the
1326          * blocks here.  We know that we've got
1327          * either 1 or 2 extents and that we can
1328          * free them all in one bunmapi call.
1329          */
1330         ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
1331         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1332                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1333                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1334                 xfs_trans_cancel(tp, 0);
1335                 *tpp = NULL;
1336                 return error;
1337         }
1338         /*
1339          * Lock the inode, fix the size, and join it to the transaction.
1340          * Hold it so in the normal path, we still have it locked for
1341          * the second transaction.  In the error paths we need it
1342          * held so the cancel won't rele it, see below.
1343          */
1344         xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1345         size = (int)ip->i_d.di_size;
1346         ip->i_d.di_size = 0;
1347         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1348         xfs_trans_ihold(tp, ip);
1349         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1350         /*
1351          * Find the block(s) so we can inval and unmap them.
1352          */
1353         done = 0;
1354         XFS_BMAP_INIT(&free_list, &first_block);
1355         nmaps = ARRAY_SIZE(mval);
1356         if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
1357                         XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
1358                         &free_list, NULL)))
1359                 goto error0;
1360         /*
1361          * Invalidate the block(s).
1362          */
1363         for (i = 0; i < nmaps; i++) {
1364                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
1365                         XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
1366                         XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
1367                 xfs_trans_binval(tp, bp);
1368         }
1369         /*
1370          * Unmap the dead block(s) to the free_list.
1371          */
1372         if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
1373                         &first_block, &free_list, NULL, &done)))
1374                 goto error1;
1375         ASSERT(done);
1376         /*
1377          * Commit the first transaction.  This logs the EFI and the inode.
1378          */
1379         if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
1380                 goto error1;
1381         /*
1382          * The transaction must have been committed, since there were
1383          * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
1384          * The new tp has the extent freeing and EFDs.
1385          */
1386         ASSERT(committed);
1387         /*
1388          * The first xact was committed, so add the inode to the new one.
1389          * Mark it dirty so it will be logged and moved forward in the log as
1390          * part of every commit.
1391          */
1392         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1393         xfs_trans_ihold(tp, ip);
1394         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1395         /*
1396          * Get a new, empty transaction to return to our caller.
1397          */
1398         ntp = xfs_trans_dup(tp);
1399         /*
1400          * Commit the transaction containing extent freeing and EFDs.
1401          * If we get an error on the commit here or on the reserve below,
1402          * we need to unlock the inode since the new transaction doesn't
1403          * have the inode attached.
1404          */
1405         error = xfs_trans_commit(tp, 0);
1406         tp = ntp;
1407         if (error) {
1408                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1409                 goto error0;
1410         }
1411         /*
1412          * Remove the memory for extent descriptions (just bookkeeping).
1413          */
1414         if (ip->i_df.if_bytes)
1415                 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
1416         ASSERT(ip->i_df.if_bytes == 0);
1417         /*
1418          * Put an itruncate log reservation in the new transaction
1419          * for our caller.
1420          */
1421         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1422                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1423                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1424                 goto error0;
1425         }
1426         /*
1427          * Return with the inode locked but not joined to the transaction.
1428          */
1429         *tpp = tp;
1430         return 0;
1431
1432  error1:
1433         xfs_bmap_cancel(&free_list);
1434  error0:
1435         /*
1436          * Have to come here with the inode locked and either
1437          * (held and in the transaction) or (not in the transaction).
1438          * If the inode isn't held then cancel would iput it, but
1439          * that's wrong since this is inactive and the vnode ref
1440          * count is 0 already.
1441          * Cancel won't do anything to the inode if held, but it still
1442          * needs to be locked until the cancel is done, if it was
1443          * joined to the transaction.
1444          */
1445         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1446         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1447         *tpp = NULL;
1448         return error;
1449
1450 }
1451
1452 STATIC int
1453 xfs_inactive_symlink_local(
1454         xfs_inode_t     *ip,
1455         xfs_trans_t     **tpp)
1456 {
1457         int             error;
1458
1459         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1460         /*
1461          * We're freeing a symlink which fit into
1462          * the inode.  Just free the memory used
1463          * to hold the old symlink.
1464          */
1465         error = xfs_trans_reserve(*tpp, 0,
1466                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1467                                   0, XFS_TRANS_PERM_LOG_RES,
1468                                   XFS_ITRUNCATE_LOG_COUNT);
1469
1470         if (error) {
1471                 xfs_trans_cancel(*tpp, 0);
1472                 *tpp = NULL;
1473                 return error;
1474         }
1475         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1476
1477         /*
1478          * Zero length symlinks _can_ exist.
1479          */
1480         if (ip->i_df.if_bytes > 0) {
1481                 xfs_idata_realloc(ip,
1482                                   -(ip->i_df.if_bytes),
1483                                   XFS_DATA_FORK);
1484                 ASSERT(ip->i_df.if_bytes == 0);
1485         }
1486         return 0;
1487 }
1488
1489 STATIC int
1490 xfs_inactive_attrs(
1491         xfs_inode_t     *ip,
1492         xfs_trans_t     **tpp)
1493 {
1494         xfs_trans_t     *tp;
1495         int             error;
1496         xfs_mount_t     *mp;
1497
1498         ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
1499         tp = *tpp;
1500         mp = ip->i_mount;
1501         ASSERT(ip->i_d.di_forkoff != 0);
1502         xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1503         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1504
1505         error = xfs_attr_inactive(ip);
1506         if (error) {
1507                 *tpp = NULL;
1508                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1509                 return error; /* goto out */
1510         }
1511
1512         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1513         error = xfs_trans_reserve(tp, 0,
1514                                   XFS_IFREE_LOG_RES(mp),
1515                                   0, XFS_TRANS_PERM_LOG_RES,
1516                                   XFS_INACTIVE_LOG_COUNT);
1517         if (error) {
1518                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1519                 xfs_trans_cancel(tp, 0);
1520                 *tpp = NULL;
1521                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1522                 return error;
1523         }
1524
1525         xfs_ilock(ip, XFS_ILOCK_EXCL);
1526         xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1527         xfs_trans_ihold(tp, ip);
1528         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1529
1530         ASSERT(ip->i_d.di_anextents == 0);
1531
1532         *tpp = tp;
1533         return 0;
1534 }
1535
1536 STATIC int
1537 xfs_release(
1538         bhv_desc_t      *bdp)
1539 {
1540         xfs_inode_t     *ip;
1541         bhv_vnode_t     *vp;
1542         xfs_mount_t     *mp;
1543         int             error;
1544
1545         vp = BHV_TO_VNODE(bdp);
1546         ip = XFS_BHVTOI(bdp);
1547         mp = ip->i_mount;
1548
1549         if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0))
1550                 return 0;
1551
1552         /* If this is a read-only mount, don't do this (would generate I/O) */
1553         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1554                 return 0;
1555
1556         if (!XFS_FORCED_SHUTDOWN(mp)) {
1557                 /*
1558                  * If we are using filestreams, and we have an unlinked
1559                  * file that we are processing the last close on, then nothing
1560                  * will be able to reopen and write to this file. Purge this
1561                  * inode from the filestreams cache so that it doesn't delay
1562                  * teardown of the inode.
1563                  */
1564                 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
1565                         xfs_filestream_deassociate(ip);
1566
1567                 /*
1568                  * If we previously truncated this file and removed old data
1569                  * in the process, we want to initiate "early" writeout on
1570                  * the last close.  This is an attempt to combat the notorious
1571                  * NULL files problem which is particularly noticable from a
1572                  * truncate down, buffered (re-)write (delalloc), followed by
1573                  * a crash.  What we are effectively doing here is
1574                  * significantly reducing the time window where we'd otherwise
1575                  * be exposed to that problem.
1576                  */
1577                 if (VUNTRUNCATE(vp) && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
1578                         bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE);
1579         }
1580
1581 #ifdef HAVE_REFCACHE
1582         /* If we are in the NFS reference cache then don't do this now */
1583         if (ip->i_refcache)
1584                 return 0;
1585 #endif
1586
1587         if (ip->i_d.di_nlink != 0) {
1588                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1589                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1590                        ip->i_delayed_blks > 0)) &&
1591                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1592                     (!(ip->i_d.di_flags &
1593                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1594                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1595                         if (error)
1596                                 return error;
1597                         /* Update linux inode block count after free above */
1598                         vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1599                                 ip->i_d.di_nblocks + ip->i_delayed_blks);
1600                 }
1601         }
1602
1603         return 0;
1604 }
1605
/*
 * xfs_inactive
 *
 * This is called when the vnode reference count for the vnode
 * goes to zero.  If the file has been unlinked, then it must
 * now be truncated and its inode freed.  If it is still linked,
 * the only work done here is a possible xfs_free_eofblocks() call.
 *
 * bdp   - behavior descriptor of the inode going inactive
 * credp - caller credentials; not referenced by this function
 *
 * Every path returns VN_INACTIVE_CACHE; errors from the helpers
 * called here are not passed back to the caller.  Each early error
 * return below first cancels the transaction it owns (if any) and
 * drops whatever inode locks were taken on that path.
 *
 * NOTE(review): older versions of this comment also said that the
 * inode's read-ahead state is cleared here; nothing in the code
 * below does that.
 */
STATIC int
xfs_inactive(
        bhv_desc_t      *bdp,
        cred_t          *credp)
{
        xfs_inode_t     *ip;
        bhv_vnode_t     *vp;
        xfs_bmap_free_t free_list;
        xfs_fsblock_t   first_block;
        int             committed;
        xfs_trans_t     *tp;
        xfs_mount_t     *mp;
        int             error;
        int             truncate;

        vp = BHV_TO_VNODE(bdp);
        vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);

        ip = XFS_BHVTOI(bdp);

        /*
         * If the inode is already free, then there can be nothing
         * to clean up here.
         */
        if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
                ASSERT(ip->i_df.if_real_bytes == 0);
                ASSERT(ip->i_df.if_broot_bytes == 0);
                return VN_INACTIVE_CACHE;
        }

        /*
         * Only do a truncate if it's a regular file with
         * some actual space in it.  It's OK to look at the
         * inode's fields without the lock because we're the
         * only one with a reference to the inode.
         */
        truncate = ((ip->i_d.di_nlink == 0) &&
            ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
             (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
            ((ip->i_d.di_mode & S_IFMT) == S_IFREG));

        mp = ip->i_mount;

        /* Send the DMAPI destroy event for unlinked inodes; result ignored. */
        if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY)) {
                (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
        }

        error = 0;

        /* If this is a read-only mount, don't do this (would generate I/O) */
        if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
                goto out;

        /*
         * Still linked: the inode stays.  For a regular file with size,
         * cached pages or delayed allocations and an in-core extent list,
         * free the blocks past EOF - unless the file is preallocated or
         * append-only and has no delayed allocation blocks.
         */
        if (ip->i_d.di_nlink != 0) {
                if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
                     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
                       ip->i_delayed_blks > 0)) &&
                      (ip->i_df.if_flags & XFS_IFEXTENTS) &&
                     (!(ip->i_d.di_flags &
                                (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
                      (ip->i_delayed_blks != 0)))) {
                        error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
                        if (error)
                                return VN_INACTIVE_CACHE;
                        /* Update linux inode block count after free above */
                        vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
                                ip->i_d.di_nblocks + ip->i_delayed_blks);
                }
                goto out;
        }

        /* From here on the inode is unlinked and will be freed. */
        ASSERT(ip->i_d.di_nlink == 0);

        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
                return VN_INACTIVE_CACHE;

        /*
         * One of the three branches below reserves log space on tp and
         * leaves the inode locked (IOLOCK and ILOCK exclusive), joined
         * to tp and held in it.
         */
        tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
        if (truncate) {
                /*
                 * Do the xfs_itruncate_start() call before
                 * reserving any log space because itruncate_start
                 * will call into the buffer cache and we can't
                 * do that within a transaction.
                 */
                xfs_ilock(ip, XFS_IOLOCK_EXCL);

                error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
                if (error) {
                        xfs_trans_cancel(tp, 0);
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                        return VN_INACTIVE_CACHE;
                }

                error = xfs_trans_reserve(tp, 0,
                                          XFS_ITRUNCATE_LOG_RES(mp),
                                          0, XFS_TRANS_PERM_LOG_RES,
                                          XFS_ITRUNCATE_LOG_COUNT);
                if (error) {
                        /* Don't call itruncate_cleanup */
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
                        xfs_trans_cancel(tp, 0);
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                        return VN_INACTIVE_CACHE;
                }

                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);

                /*
                 * normally, we have to run xfs_itruncate_finish sync.
                 * But if filesystem is wsync and we're in the inactive
                 * path, then we know that nlink == 0, and that the
                 * xaction that made nlink == 0 is permanently committed
                 * since xfs_remove runs as a synchronous transaction.
                 */
                error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
                                (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));

                if (error) {
                        /* tp is passed by address above, so cancel the current value. */
                        xfs_trans_cancel(tp,
                                XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                        return VN_INACTIVE_CACHE;
                }
        } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {

                /*
                 * If we get an error while cleaning up a
                 * symlink we bail out.
                 *
                 * The target is remote when it does not fit in the
                 * inode's data fork, local otherwise.  On failure the
                 * helper has already set tp to NULL (asserted below).
                 */
                error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
                        xfs_inactive_symlink_rmt(ip, &tp) :
                        xfs_inactive_symlink_local(ip, &tp);

                if (error) {
                        ASSERT(tp == NULL);
                        return VN_INACTIVE_CACHE;
                }

                /* The helpers return with the inode locked but not joined. */
                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);
        } else {
                /* Nothing to truncate: just reserve space for the inode free. */
                error = xfs_trans_reserve(tp, 0,
                                          XFS_IFREE_LOG_RES(mp),
                                          0, XFS_TRANS_PERM_LOG_RES,
                                          XFS_INACTIVE_LOG_COUNT);
                if (error) {
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
                        xfs_trans_cancel(tp, 0);
                        return VN_INACTIVE_CACHE;
                }

                xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);
        }

        /*
         * If there are attributes associated with the file
         * then blow them away now.  The code calls a routine
         * that recursively deconstructs the attribute fork.
         * We need to just commit the current transaction
         * because we can't use it for xfs_attr_inactive().
         */
        if (ip->i_d.di_anextents > 0) {
                error = xfs_inactive_attrs(ip, &tp);
                /*
                 * If we got an error, the transaction is already
                 * cancelled, and the inode is unlocked. Just get out.
                 */
                 if (error)
                         return VN_INACTIVE_CACHE;
        } else if (ip->i_afp) {
                /* No attribute extents, only an in-core fork to discard. */
                xfs_idestroy_fork(ip, XFS_ATTR_FORK);
        }

        /*
         * Free the inode.
         */
        XFS_BMAP_INIT(&free_list, &first_block);
        error = xfs_ifree(tp, ip, &free_list);
        if (error) {
                /*
                 * If we fail to free the inode, shut down.  The cancel
                 * might do that, we need to make sure.  Otherwise the
                 * inode might be lost for a long time or forever.
                 */
                if (!XFS_FORCED_SHUTDOWN(mp)) {
                        cmn_err(CE_NOTE,
                "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
                                error, mp->m_fsname);
                        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
                }
                xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
        } else {
                /*
                 * Credit the quota account(s). The inode is gone.
                 */
                XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);

                /*
                 * Just ignore errors at this point.  There is
                 * nothing we can do except to try to keep going.
                 */
                (void) xfs_bmap_finish(&tp,  &free_list, &committed);
                (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
        }
        /*
         * Release the dquots held by inode, if any.
         */
        XFS_QM_DQDETACH(mp, ip);

        /* The inode was held in the transaction, so unlock it by hand. */
        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);

 out:
        return VN_INACTIVE_CACHE;
}
1832
1833
1834 /*
1835  * xfs_lookup
1836  */
1837 STATIC int
1838 xfs_lookup(
1839         bhv_desc_t              *dir_bdp,
1840         bhv_vname_t             *dentry,
1841         bhv_vnode_t             **vpp,
1842         int                     flags,
1843         bhv_vnode_t             *rdir,
1844         cred_t                  *credp)
1845 {
1846         xfs_inode_t             *dp, *ip;
1847         xfs_ino_t               e_inum;
1848         int                     error;
1849         uint                    lock_mode;
1850         bhv_vnode_t             *dir_vp;
1851
1852         dir_vp = BHV_TO_VNODE(dir_bdp);
1853         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1854
1855         dp = XFS_BHVTOI(dir_bdp);
1856
1857         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1858                 return XFS_ERROR(EIO);
1859
1860         lock_mode = xfs_ilock_map_shared(dp);
1861         error = xfs_dir_lookup_int(dir_bdp, lock_mode, dentry, &e_inum, &ip);
1862         if (!error) {
1863                 *vpp = XFS_ITOV(ip);
1864                 ITRACE(ip);
1865         }
1866         xfs_iunlock_map_shared(dp, lock_mode);
1867         return error;
1868 }
1869
1870
/*
 * xfs_create (create a new file).
 *
 * dir_bdp - behavior descriptor of the parent directory
 * dentry  - supplies the new entry's name and name length
 * vap     - attributes for the new inode; va_mode is always used,
 *           va_projid and va_rdev only when the matching bit is set
 *           in va_mask
 * vpp     - out: vnode of the new inode; *vpp must be NULL on entry
 *           (asserted) and is only written on success
 * credp   - credentials used for the quota lookup and inode allocation
 *
 * Returns 0 on success or an error code; errors are compared against
 * positive errno values such as ENOSPC below.
 *
 * Exit paths:
 *   std_return   - sends the DMAPI post-create event when appropriate
 *                  and returns 'error'
 *   error_return - cancels tp (if non-NULL) with cancel_flags, unlocks
 *                  dp if it was locked but not yet joined to the
 *                  transaction, releases the dquots
 *   abort_return - as error_return, with XFS_TRANS_ABORT added
 *   abort_rele   - aborts the transaction first, then drops the extra
 *                  reference taken on the new inode
 */
STATIC int
xfs_create(
        bhv_desc_t              *dir_bdp,
        bhv_vname_t             *dentry,
        bhv_vattr_t             *vap,
        bhv_vnode_t             **vpp,
        cred_t                  *credp)
{
        char                    *name = VNAME(dentry);
        bhv_vnode_t             *dir_vp;
        xfs_inode_t             *dp, *ip;
        bhv_vnode_t             *vp = NULL;
        xfs_trans_t             *tp;
        xfs_mount_t             *mp;
        xfs_dev_t               rdev;
        int                     error;
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
        boolean_t               dp_joined_to_trans;
        int                     dm_event_sent = 0;
        uint                    cancel_flags;
        int                     committed;
        xfs_prid_t              prid;
        struct xfs_dquot        *udqp, *gdqp;
        uint                    resblks;
        int                     dm_di_mode;
        int                     namelen;

        ASSERT(!*vpp);
        dir_vp = BHV_TO_VNODE(dir_bdp);
        vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);

        dp = XFS_BHVTOI(dir_bdp);
        mp = dp->i_mount;

        dm_di_mode = vap->va_mode;
        namelen = VNAMELEN(dentry);

        /*
         * Send the DMAPI create event first.  If sending fails nothing
         * has been set up yet, so return directly.
         */
        if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
                error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
                                dir_vp, DM_RIGHT_NULL, NULL,
                                DM_RIGHT_NULL, name, NULL,
                                dm_di_mode, 0, 0);

                if (error)
                        return error;
                dm_event_sent = 1;
        }

        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);

        /* Return through std_return after this point. */

        /*
         * Project id: inherited from the directory when it has
         * PROJINHERIT set, else taken from vap if supplied, else the
         * default.
         */
        udqp = gdqp = NULL;
        if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
                prid = dp->i_d.di_projid;
        else if (vap->va_mask & XFS_AT_PROJID)
                prid = (xfs_prid_t)vap->va_projid;
        else
                prid = (xfs_prid_t)dfltprid;

        /*
         * Make sure that we have allocated dquot(s) on disk.
         */
        error = XFS_QM_DQVOPALLOC(mp, dp,
                        current_fsuid(credp), current_fsgid(credp), prid,
                        XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
        if (error)
                goto std_return;

        ip = NULL;
        dp_joined_to_trans = B_FALSE;

        tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
        resblks = XFS_CREATE_SPACE_RES(mp, namelen);
        /*
         * Initially assume that the file does not exist and
         * reserve the resources for that case.  If that is not
         * the case we'll drop the one we have and get a more
         * appropriate transaction later.
         */
        error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
                        XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
        if (error == ENOSPC) {
                /*
                 * Retry without a block reservation.  resblks == 0 makes
                 * the code below check xfs_dir_canenter() before going on.
                 */
                resblks = 0;
                error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
                                XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
        }
        if (error) {
                /*
                 * No log reservation to release, and dp has not been
                 * locked yet: clearing dp keeps error_return from
                 * unlocking it.
                 */
                cancel_flags = 0;
                dp = NULL;
                goto error_return;
        }

        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);

        XFS_BMAP_INIT(&free_list, &first_block);

        ASSERT(ip == NULL);

        /*
         * Reserve disk quota and the inode.
         */
        error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
        if (error)
                goto error_return;

        if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen)))
                goto error_return;
        rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
        error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 1,
                        rdev, credp, prid, resblks > 0,
                        &ip, &committed);
        if (error) {
                /* ENOSPC takes the plain cancel path, anything else aborts. */
                if (error == ENOSPC)
                        goto error_return;
                goto abort_return;
        }
        ITRACE(ip);

        /*
         * At this point, we've gotten a newly allocated inode.
         * It is locked (and joined to the transaction).
         */

        ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));

        /*
         * Now we join the directory inode to the transaction.
         * We do not do it earlier because xfs_dir_ialloc
         * might commit the previous transaction (and release
         * all the locks).
         */

        VN_HOLD(dir_vp);
        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
        dp_joined_to_trans = B_TRUE;

        error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino,
                                        &first_block, &free_list, resblks ?
                                        resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
        if (error) {
                ASSERT(error != ENOSPC);
                goto abort_return;
        }
        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

        /*
         * If this is a synchronous mount, make sure that the
         * create transaction goes to disk before returning to
         * the user.
         */
        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
                xfs_trans_set_sync(tp);
        }

        dp->i_gen++;

        /*
         * Attach the dquot(s) to the inodes and modify them incore.
         * These ids of the inode couldn't have changed since the new
         * inode has been locked ever since it was created.
         */
        XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);

        /*
         * xfs_trans_commit normally decrements the vnode ref count
         * when it unlocks the inode. Since we want to return the
         * vnode to the caller, we bump the vnode ref count now.
         */
        IHOLD(ip);
        vp = XFS_ITOV(ip);

        error = xfs_bmap_finish(&tp, &free_list, &committed);
        if (error) {
                xfs_bmap_cancel(&free_list);
                goto abort_rele;
        }

        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
        if (error) {
                /*
                 * Drop the reference taken by IHOLD() above, and clear
                 * tp so that error_return skips xfs_trans_cancel().
                 */
                IRELE(ip);
                tp = NULL;
                goto error_return;
        }

        XFS_QM_DQRELE(mp, udqp);
        XFS_QM_DQRELE(mp, gdqp);

        /*
         * Propagate the fact that the vnode changed after the
         * xfs_inode locks have been released.
         */
        bhv_vop_vnode_change(vp, VCHANGE_FLAGS_TRUNCATED, 3);

        *vpp = vp;

        /* Fallthrough to std_return with error = 0  */

std_return:
        /*
         * Send the post-create event on success (*vpp set), or on
         * failure if the create event had already been sent.  The new
         * vnode is only passed along on success.
         */
        if ((*vpp || (error != 0 && dm_event_sent != 0)) &&
            DM_EVENT_ENABLED(XFS_BHVTOI(dir_bdp), DM_EVENT_POSTCREATE)) {
                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
                        dir_vp, DM_RIGHT_NULL,
                        *vpp ? vp:NULL,
                        DM_RIGHT_NULL, name, NULL,
                        dm_di_mode, error, 0);
        }
        return error;

 abort_return:
        cancel_flags |= XFS_TRANS_ABORT;
        /* FALLTHROUGH */

 error_return:
        if (tp != NULL)
                xfs_trans_cancel(tp, cancel_flags);

        /*
         * dp is only unlocked by hand if it was locked but never
         * joined to the transaction.
         */
        if (!dp_joined_to_trans && (dp != NULL))
                xfs_iunlock(dp, XFS_ILOCK_EXCL);
        XFS_QM_DQRELE(mp, udqp);
        XFS_QM_DQRELE(mp, gdqp);

        goto std_return;

 abort_rele:
        /*
         * Wait until after the current transaction is aborted to
         * release the inode.  This prevents recursive transactions
         * and deadlocks from xfs_inactive.
         */
        cancel_flags |= XFS_TRANS_ABORT;
        xfs_trans_cancel(tp, cancel_flags);
        IRELE(ip);

        XFS_QM_DQRELE(mp, udqp);
        XFS_QM_DQRELE(mp, gdqp);

        goto std_return;
}
2117
#ifdef DEBUG
/*
 * Some counters to see if (and how often) we are hitting some deadlock
 * prevention code paths.  All three are bumped by
 * xfs_lock_dir_and_entry() and only exist in DEBUG builds.
 */

/* Number of calls to xfs_lock_dir_and_entry(). */
int xfs_rm_locks;
/* Number of times a retry was preceded by a delay(1) back-off. */
int xfs_rm_lock_delays;
/* Number of failed xfs_ilock_nowait() attempts on the entry's inode. */
int xfs_rm_attempts;
#endif
2128
2129 /*
2130  * The following routine will lock the inodes associated with the
2131  * directory and the named entry in the directory. The locks are
2132  * acquired in increasing inode number.
2133  *
2134  * If the entry is "..", then only the directory is locked. The
2135  * vnode ref count will still include that from the .. entry in
2136  * this case.
2137  *
2138  * There is a deadlock we need to worry about. If the locked directory is
2139  * in the AIL, it might be blocking up the log. The next inode we lock
2140  * could be already locked by another thread waiting for log space (e.g
2141  * a permanent log reservation with a long running transaction (see
2142  * xfs_itruncate_finish)). To solve this, we must check if the directory
2143  * is in the ail and use lock_nowait. If we can't lock, we need to
2144  * drop the inode lock on the directory and try again. xfs_iunlock will
2145  * potentially push the tail if we were holding up the log.
2146  */
2147 STATIC int
2148 xfs_lock_dir_and_entry(
2149         xfs_inode_t     *dp,
2150         xfs_inode_t     *ip)    /* inode of entry 'name' */
2151 {
2152         int             attempts;
2153         xfs_ino_t       e_inum;
2154         xfs_inode_t     *ips[2];
2155         xfs_log_item_t  *lp;
2156
2157 #ifdef DEBUG
2158         xfs_rm_locks++;
2159 #endif
2160         attempts = 0;
2161
2162 again:
2163         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2164
2165         e_inum = ip->i_ino;
2166
2167         ITRACE(ip);
2168
2169         /*
2170          * We want to lock in increasing inum. Since we've already
2171          * acquired the lock on the directory, we may need to release
2172          * if if the inum of the entry turns out to be less.
2173          */
2174         if (e_inum > dp->i_ino) {
2175                 /*
2176                  * We are already in the right order, so just
2177                  * lock on the inode of the entry.
2178                  * We need to use nowait if dp is in the AIL.
2179                  */
2180
2181                 lp = (xfs_log_item_t *)dp->i_itemp;
2182                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2183                         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2184                                 attempts++;
2185 #ifdef DEBUG
2186                                 xfs_rm_attempts++;
2187 #endif
2188
2189                                 /*
2190                                  * Unlock dp and try again.
2191                                  * xfs_iunlock will try to push the tail
2192                                  * if the inode is in the AIL.
2193                                  */
2194
2195                                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2196
2197                                 if ((attempts % 5) == 0) {
2198                                         delay(1); /* Don't just spin the CPU */
2199 #ifdef DEBUG
2200                                         xfs_rm_lock_delays++;
2201 #endif
2202                                 }
2203                                 goto again;
2204                         }
2205                 } else {
2206                         xfs_ilock(ip, XFS_ILOCK_EXCL);
2207                 }
2208         } else if (e_inum < dp->i_ino) {
2209                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2210
2211                 ips[0] = ip;
2212                 ips[1] = dp;
2213                 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2214         }
2215         /* else  e_inum == dp->i_ino */
2216         /*     This can happen if we're asked to lock /x/..
2217          *     the entry is "..", which is also the parent directory.
2218          */
2219
2220         return 0;
2221 }
2222
#ifdef DEBUG
/*
 * DEBUG-only statistics counters.
 * NOTE(review): going by the names these presumably track lock and
 * retry/delay counts for xfs_lock_inodes(); the code that updates them
 * is not visible in this part of the file - confirm before relying on
 * their meaning.
 */
int xfs_locked_n;
int xfs_small_retries;
int xfs_middle_retries;
int xfs_lots_retries;
int xfs_lock_delays;
#endif
2230
2231 /*
2232  * Bump the subclass so xfs_lock_inodes() acquires each lock with
2233  * a different value
2234  */
2235 static inline int
2236 xfs_lock_inumorder(int lock_mode, int subclass)
2237 {
2238         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
2239                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
2240         if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
2241                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
2242
2243         return lock_mode;
2244 }
2245
2246 /*
2247  * The following routine will lock n inodes in exclusive mode.
2248  * We assume the caller calls us with the inodes in i_ino order.
2249  *
2250  * We need to detect deadlock where an inode that we lock
2251  * is in the AIL and we start waiting for another inode that is locked
2252  * by a thread in a long running transaction (such as truncate). This can
2253  * result in deadlock since the long running trans might need to wait
2254  * for the inode we just locked in order to push the tail and free space
2255  * in the log.
      *
      * Parameters:
      *   ips          - array of inode pointers to lock.  An entry equal to
      *                  its predecessor is skipped, so adjacent duplicates
      *                  are locked only once.
      *   inodes       - number of entries in ips (asserted to be >= 2).
      *   first_locked - when non-zero, the first pass starts at ips[1] in
      *                  try-lock mode and does not lock ips[0] (presumably
      *                  the caller already holds it - confirm against the
      *                  callers).  If a retry is needed, ips[0] is unlocked
      *                  and then re-locked together with the others.
      *   lock_mode    - lock flags.  Each acquisition uses
      *                  xfs_lock_inumorder(lock_mode, i); each release on
      *                  the retry path uses lock_mode as given.
      *
      * NOTE(review): although the first sentence says "exclusive mode", the
      * mode actually taken is whatever the caller passes in lock_mode.
      *
      * Returns nothing; the routine loops until every inode is locked.
2256  */
2257 void
2258 xfs_lock_inodes(
2259         xfs_inode_t     **ips,
2260         int             inodes,
2261         int             first_locked,
2262         uint            lock_mode)
2263 {
2264         int             attempts = 0, i, j, try_lock;
2265         xfs_log_item_t  *lp;
2266
2267         ASSERT(ips && (inodes >= 2)); /* we need at least two */
2268
2269         if (first_locked) {
2270                 try_lock = 1;
2271                 i = 1;
2272         } else {
2273                 try_lock = 0;
2274                 i = 0;
2275         }
2276
             /*
              * Entered by falling through on the first pass, and again via
              * goto (with i and try_lock both reset to 0) whenever a
              * try-lock below fails.
              */
2277 again:
2278         for (; i < inodes; i++) {
2279                 ASSERT(ips[i]);
2280
2281                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
2282                         continue;
2283
2284                 /*
2285                  * If try_lock is not set yet, make sure all locked inodes
2286                  * are not in the AIL.
2287                  * If any are, set try_lock to be used later.
2288                  */
2289
2290                 if (!try_lock) {
2291                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
2292                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
2293                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2294                                         try_lock++;
2295                                 }
2296                         }
2297                 }
2298
2299                 /*
2300                  * If any of the previous locks we have locked is in the AIL,
2301                  * we must TRY to get the second and subsequent locks. If
2302                  * we can't get any, we must release all we have
2303                  * and try again.
2304                  */
2305
2306                 if (try_lock) {
2307                         /* try_lock must be 0 if i is 0. */
2308                         /*
2309                          * try_lock means we have an inode locked
2310                          * that is in the AIL.
2311                          */
2312                         ASSERT(i != 0);
2313                         if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
2314                                 attempts++;
2315
2316                                 /*
2317                                  * Unlock all previous guys and try again.
2318                                  * xfs_iunlock will try to push the tail
2319                                  * if the inode is in the AIL.
2320                                  */
2321
2322                                 for(j = i - 1; j >= 0; j--) {
2323
2324                                         /*
2325                                          * Check to see if we've already
2326                                          * unlocked this one.
2327                                          * Not the first one going back,
2328                                          * and the inode ptr is the same.
2329                                          */
2330                                         if ((j != (i - 1)) && ips[j] ==
2331                                                                 ips[j+1])
2332                                                 continue;
2333
2334                                         xfs_iunlock(ips[j], lock_mode);
2335                                 }
2336
                                     /* Back off on every fifth failed attempt. */
2337                                 if ((attempts % 5) == 0) {
2338                                         delay(1); /* Don't just spin the CPU */
2339 #ifdef DEBUG
2340                                         xfs_lock_delays++;
2341 #endif
2342                                 }
                                     /*
                                      * Start over from the first inode with
                                      * blocking locks; try_lock is
                                      * re-derived from the AIL check above.
                                      */
2343                                 i = 0;
2344                                 try_lock = 0;
2345                                 goto again;
2346                         }
2347                 } else {
2348                         xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
2349                 }
2350         }
2351
             /*
              * DEBUG-only statistics: classify this call by the number of
              * failed try-lock attempts it needed (none, < 5, < 100, more).
              */
2352 #ifdef DEBUG
2353         if (attempts) {
2354                 if (attempts < 5) xfs_small_retries++;
2355                 else if (attempts < 100) xfs_middle_retries++;
2356                 else xfs_lots_retries++;
2357         } else {
2358                 xfs_locked_n++;
2359         }
2360 #endif
2361 }
2362
2363 #ifdef  DEBUG
     /*
      * DEBUG builds: REMOVE_DEBUG_TRACE(x) stores x in the global
      * remove_which_error_return.  The callers visible in this file pass
      * __LINE__, so the variable holds the source line of the most recently
      * traced error exit.  The expansion is a bare { } block, so it is only
      * safe where a compound statement followed by ';' is acceptable.
      */
2364 #define REMOVE_DEBUG_TRACE(x)   {remove_which_error_return = (x);}
2365 int remove_which_error_return = 0;
2366 #else /* ! DEBUG */
     /* Non-DEBUG builds: the trace macro expands to nothing. */
2367 #define REMOVE_DEBUG_TRACE(x)
2368 #endif  /* ! DEBUG */
2369
2370
2371 /*
2372  * xfs_remove
2373  *
      * Remove the directory entry named by dentry from the directory behind
      * dir_bdp and drop one link from the inode that entry referred to.
      *
      *   dir_bdp - behavior descriptor of the parent directory.
      *   dentry  - supplies the entry name (VNAME) and its length (VNAMELEN);
      *             also used to look up the entry's inode.
      *   credp   - not referenced in this function.
      *
      * Returns 0 on success or an error code: EIO after a forced shutdown,
      * otherwise whatever the failing callee returned.
      *
      * DMAPI: when DM_EVENT_REMOVE is enabled on the directory the event is
      * sent up front and a non-zero reply aborts the call.  Every exit after
      * that point goes through std_return, which sends DM_EVENT_POSTREMOVE
      * (when enabled) carrying the final error value.
2374  */
2375 STATIC int
2376 xfs_remove(
2377         bhv_desc_t              *dir_bdp,
2378         bhv_vname_t             *dentry,
2379         cred_t                  *credp)
2380 {
2381         bhv_vnode_t             *dir_vp;
2382         char                    *name = VNAME(dentry);
2383         xfs_inode_t             *dp, *ip;
2384         xfs_trans_t             *tp = NULL;
2385         xfs_mount_t             *mp;
2386         int                     error = 0;
2387         xfs_bmap_free_t         free_list;
2388         xfs_fsblock_t           first_block;
2389         int                     cancel_flags;
2390         int                     committed;
2391         int                     dm_di_mode = 0;
2392         int                     link_zero;
2393         uint                    resblks;
2394         int                     namelen;
2395
2396         dir_vp = BHV_TO_VNODE(dir_bdp);
2397         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2398
2399         dp = XFS_BHVTOI(dir_bdp);
2400         mp = dp->i_mount;
2401
2402         if (XFS_FORCED_SHUTDOWN(mp))
2403                 return XFS_ERROR(EIO);
2404
2405         namelen = VNAMELEN(dentry);
2406
             /*
              * Preliminary lookup, used only to fetch the entry's mode for
              * the DM_EVENT_REMOVE notification; the reference is dropped
              * again immediately.  dm_di_mode stays 0 if the lookup fails.
              */
2407         if (!xfs_get_dir_entry(dentry, &ip)) {
2408                 dm_di_mode = ip->i_d.di_mode;
2409                 IRELE(ip);
2410         }
2411
2412         if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
2413                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
2414                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2415                                         name, NULL, dm_di_mode, 0, 0);
2416                 if (error)
2417                         return error;
2418         }
2419
2420         /* From this point on, return through std_return */
2421         ip = NULL;
2422
2423         /*
2424          * We need to get a reference to ip before we get our log
2425          * reservation. The reason for this is that we cannot call
2426          * xfs_iget for an inode for which we do not have a reference
2427          * once we've acquired a log reservation. This is because the
2428          * inode we are trying to get might be in xfs_inactive going
2429          * for a log reservation. Since we'll have to wait for the
2430          * inactive code to complete before returning from xfs_iget,
2431          * we need to make sure that we don't have log space reserved
2432          * when we call xfs_iget.  Instead we get an unlocked reference
2433          * to the inode before getting our log reservation.
2434          */
2435         error = xfs_get_dir_entry(dentry, &ip);
2436         if (error) {
2437                 REMOVE_DEBUG_TRACE(__LINE__);
2438                 goto std_return;
2439         }
2440
2441         dm_di_mode = ip->i_d.di_mode;
2442
2443         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2444
2445         ITRACE(ip);
2446
             /* Attach dquots to the directory and, if distinct, the entry. */
2447         error = XFS_QM_DQATTACH(mp, dp, 0);
2448         if (!error && dp != ip)
2449                 error = XFS_QM_DQATTACH(mp, ip, 0);
2450         if (error) {
2451                 REMOVE_DEBUG_TRACE(__LINE__);
2452                 IRELE(ip);
2453                 goto std_return;
2454         }
2455
2456         tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2457         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2458         /*
2459          * We try to get the real space reservation first,
2460          * allowing for directory btree deletion(s) implying
2461          * possible bmap insert(s).  If we can't get the space
2462          * reservation then we use 0 instead, and avoid the bmap
2463          * btree insert(s) in the directory code by, if the bmap
2464          * insert tries to happen, instead trimming the LAST
2465          * block from the directory.
2466          */
2467         resblks = XFS_REMOVE_SPACE_RES(mp);
2468         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2469                         XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2470         if (error == ENOSPC) {
2471                 resblks = 0;
2472                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2473                                 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2474         }
2475         if (error) {
2476                 ASSERT(error != ENOSPC);
2477                 REMOVE_DEBUG_TRACE(__LINE__);
                     /* No reservation was obtained, so cancel with flags 0. */
2478                 xfs_trans_cancel(tp, 0);
2479                 IRELE(ip);
                     /*
                      * Leave through std_return like every other failure
                      * after the DM_EVENT_REMOVE notification, so that a
                      * DM_EVENT_POSTREMOVE carrying the error is still sent.
                      */
2480                 goto std_return;
2481         }
2482
2483         error = xfs_lock_dir_and_entry(dp, ip);
2484         if (error) {
2485                 REMOVE_DEBUG_TRACE(__LINE__);
2486                 xfs_trans_cancel(tp, cancel_flags);
2487                 IRELE(ip);
2488                 goto std_return;
2489         }
2490
2491         /*
2492          * At this point, we've gotten both the directory and the entry
2493          * inodes locked.
2494          */
2495         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2496         if (dp != ip) {
2497                 /*
2498                  * Increment vnode ref count only in this case since
2499                  * there's an extra vnode reference in the case where
2500                  * dp == ip.
2501                  */
2502                 IHOLD(dp);
2503                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2504         }
2505
2506         /*
2507          * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2508          */
2509         XFS_BMAP_INIT(&free_list, &first_block);
2510         error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino,
2511                                         &first_block, &free_list, 0);
2512         if (error) {
2513                 ASSERT(error != ENOENT);
2514                 REMOVE_DEBUG_TRACE(__LINE__);
2515                 goto error1;
2516         }
2517         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2518
2519         dp->i_gen++;
2520         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2521
2522         error = xfs_droplink(tp, ip);
2523         if (error) {
2524                 REMOVE_DEBUG_TRACE(__LINE__);
2525                 goto error1;
2526         }
2527
2528         /* Determine if this is the last link while
2529          * we are in the transaction.
2530          */
2531         link_zero = (ip)->i_d.di_nlink==0;
2532
2533         /*
2534          * Take an extra ref on the inode so that it doesn't
2535          * go to xfs_inactive() from within the commit.
2536          */
2537         IHOLD(ip);
2538
2539         /*
2540          * If this is a synchronous mount, make sure that the
2541          * remove transaction goes to disk before returning to
2542          * the user.
2543          */
2544         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2545                 xfs_trans_set_sync(tp);
2546         }
2547
2548         error = xfs_bmap_finish(&tp, &free_list, &committed);
2549         if (error) {
2550                 REMOVE_DEBUG_TRACE(__LINE__);
2551                 goto error_rele;
2552         }
2553
2554         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2555         if (error) {
                     /* Drop the extra reference taken by IHOLD(ip) above. */
2556                 IRELE(ip);
2557                 goto std_return;
2558         }
2559
2560         /*
2561          * Before we drop our extra reference to the inode, purge it
2562          * from the refcache if it is there.  By waiting until afterwards
2563          * to do the IRELE, we ensure that we won't go inactive in the
2564          * xfs_refcache_purge_ip routine (although that would be OK).
2565          */
2566         xfs_refcache_purge_ip(ip);
2567
2568         /*
2569          * If we are using filestreams, kill the stream association.
2570          * If the file is still open it may get a new one but that
2571          * will get killed on last close in xfs_close() so we don't
2572          * have to worry about that.
2573          */
2574         if (link_zero && xfs_inode_is_filestream(ip))
2575                 xfs_filestream_deassociate(ip);
2576
2577         vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2578
2579         /*
2580          * Let interposed file systems know about removed links.
2581          */
2582         bhv_vop_link_removed(XFS_ITOV(ip), dir_vp, link_zero);
2583
2584         IRELE(ip);
2585
2586 /*      Fall through to std_return with error = 0 */
2587  std_return:
             /* Common exit: report the outcome to DMAPI when it is listening. */
2588         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
2589                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2590                                 dir_vp, DM_RIGHT_NULL,
2591                                 NULL, DM_RIGHT_NULL,
2592                                 name, NULL, dm_di_mode, error, 0);
2593         }
2594         return error;
2595
             /*
              * Failure before the extra IHOLD(ip): discard the pending free
              * list and abort the transaction.
              */
2596  error1:
2597         xfs_bmap_cancel(&free_list);
2598         cancel_flags |= XFS_TRANS_ABORT;
2599         xfs_trans_cancel(tp, cancel_flags);
2600         goto std_return;
2601
2602  error_rele:
2603         /*
2604          * In this case make sure to not release the inode until after
2605          * the current transaction is aborted.  Releasing it beforehand
2606          * can cause us to go to xfs_inactive and start a recursive
2607          * transaction which can easily deadlock with the current one.
2608          */
2609         xfs_bmap_cancel(&free_list);
2610         cancel_flags |= XFS_TRANS_ABORT;
2611         xfs_trans_cancel(tp, cancel_flags);
2612
2613         /*
2614          * Before we drop our extra reference to the inode, purge it
2615          * from the refcache if it is there.  By waiting until afterwards
2616          * to do the IRELE, we ensure that we won't go inactive in the
2617          * xfs_refcache_purge_ip routine (although that would be OK).
2618          */
2619         xfs_refcache_purge_ip(ip);
2620
2621         IRELE(ip);
2622
2623         goto std_return;
2624 }
2625
2626
2627 /*
2628  * xfs_link
2629  *
      * Add a new name for an existing inode: create the directory entry
      * named by dentry in the target directory, pointing at the inode behind
      * src_vp, and bump that inode's link count under an XFS_TRANS_LINK
      * transaction.
      *
      *   target_dir_bdp - behavior descriptor of the directory that receives
      *                    the new entry.
      *   src_vp         - vnode of the existing file; asserted not to be a
      *                    directory.
      *   dentry         - supplies the new entry's name and name length.
      *   credp          - not referenced in this function.
      *
      * Returns 0 on success or an error code: EIO after a forced shutdown,
      * EMLINK when the source already has XFS_MAXLINK links, EXDEV when the
      * target directory has XFS_DIFLAG_PROJINHERIT set and the project ids
      * differ, otherwise whatever the failing callee returned.
      *
      * DMAPI: DM_EVENT_LINK is sent up front when enabled on the target
      * directory; every later exit goes through std_return, which sends
      * DM_EVENT_POSTLINK (when enabled) carrying the final error value.
2630  */
2631 STATIC int
2632 xfs_link(
2633         bhv_desc_t              *target_dir_bdp,
2634         bhv_vnode_t             *src_vp,
2635         bhv_vname_t             *dentry,
2636         cred_t                  *credp)
2637 {
2638         xfs_inode_t             *tdp, *sip;
2639         xfs_trans_t             *tp;
2640         xfs_mount_t             *mp;
2641         xfs_inode_t             *ips[2];
2642         int                     error;
2643         xfs_bmap_free_t         free_list;
2644         xfs_fsblock_t           first_block;
2645         int                     cancel_flags;
2646         int                     committed;
2647         bhv_vnode_t             *target_dir_vp;
2648         int                     resblks;
2649         char                    *target_name = VNAME(dentry);
2650         int                     target_namelen;
2651
2652         target_dir_vp = BHV_TO_VNODE(target_dir_bdp);
2653         vn_trace_entry(target_dir_vp, __FUNCTION__, (inst_t *)__return_address);
2654         vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);
2655
2656         target_namelen = VNAMELEN(dentry);
2657         ASSERT(!VN_ISDIR(src_vp));
2658
2659         sip = xfs_vtoi(src_vp);
2660         tdp = XFS_BHVTOI(target_dir_bdp);
2661         mp = tdp->i_mount;
2662         if (XFS_FORCED_SHUTDOWN(mp))
2663                 return XFS_ERROR(EIO);
2664
2665         if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
2666                 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2667                                         target_dir_vp, DM_RIGHT_NULL,
2668                                         src_vp, DM_RIGHT_NULL,
2669                                         target_name, NULL, 0, 0, 0);
2670                 if (error)
2671                         return error;
2672         }
2673
2674         /* Return through std_return after this point. */
2675
             /* Attach dquots to the source and, if distinct, the target dir. */
2676         error = XFS_QM_DQATTACH(mp, sip, 0);
2677         if (!error && sip != tdp)
2678                 error = XFS_QM_DQATTACH(mp, tdp, 0);
2679         if (error)
2680                 goto std_return;
2681
2682         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2683         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2684         resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
2685         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2686                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
             /* On ENOSPC retry with a zero block reservation. */
2687         if (error == ENOSPC) {
2688                 resblks = 0;
2689                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2690                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2691         }
2692         if (error) {
                     /* No reservation was obtained, so cancel with flags 0. */
2693                 cancel_flags = 0;
2694                 goto error_return;
2695         }
2696
             /* Order the two inodes by inode number before locking them. */
2697         if (sip->i_ino < tdp->i_ino) {
2698                 ips[0] = sip;
2699                 ips[1] = tdp;
2700         } else {
2701                 ips[0] = tdp;
2702                 ips[1] = sip;
2703         }
2704
2705         xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2706
2707         /*
2708          * Increment vnode ref counts since xfs_trans_commit &
2709          * xfs_trans_cancel will both unlock the inodes and
2710          * decrement the associated ref counts.
2711          */
2712         VN_HOLD(src_vp);
2713         VN_HOLD(target_dir_vp);
2714         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2715         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2716
2717         /*
2718          * If the source has too many links, we can't make any more to it.
2719          */
2720         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2721                 error = XFS_ERROR(EMLINK);
2722                 goto error_return;
2723         }
2724
2725         /*
2726          * If we are using project inheritance, we only allow hard link
2727          * creation in our tree when the project IDs are the same; else
2728          * the tree quota mechanism could be circumvented.
2729          */
2730         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2731                      (tdp->i_d.di_projid != sip->i_d.di_projid))) {
2732                 error = XFS_ERROR(EXDEV);
2733                 goto error_return;
2734         }
2735
             /*
              * Without a block reservation, check first that the name can be
              * entered in the directory.
              */
2736         if (resblks == 0 &&
2737             (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen)))
2738                 goto error_return;
2739
2740         XFS_BMAP_INIT(&free_list, &first_block);
2741
2742         error = xfs_dir_createname(tp, tdp, target_name, target_namelen,
2743                                    sip->i_ino, &first_block, &free_list,
2744                                    resblks);
2745         if (error)
2746                 goto abort_return;
2747         xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2748         tdp->i_gen++;
2749         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2750
2751         error = xfs_bumplink(tp, sip);
2752         if (error)
2753                 goto abort_return;
2754
2755         /*
2756          * If this is a synchronous mount, make sure that the
2757          * link transaction goes to disk before returning to
2758          * the user.
2759          */
2760         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2761                 xfs_trans_set_sync(tp);
2762         }
2763
2764         error = xfs_bmap_finish (&tp, &free_list, &committed);
2765         if (error) {
2766                 xfs_bmap_cancel(&free_list);
2767                 goto abort_return;
2768         }
2769
2770         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2771         if (error)
2772                 goto std_return;
2773
2774         /* Fall through to std_return with error = 0. */
2775 std_return:
             /*
              * NOTE(review): the pre-event above is gated on the target
              * directory (tdp) but this post-event is gated on the source
              * inode (sip) - confirm that the asymmetry is intended.
              */
2776         if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
2777                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2778                                 target_dir_vp, DM_RIGHT_NULL,
2779                                 src_vp, DM_RIGHT_NULL,
2780                                 target_name, NULL, 0, error, 0);
2781         }
2782         return error;
2783
             /* abort_return: cancel with XFS_TRANS_ABORT added to the flags. */
2784  abort_return:
2785         cancel_flags |= XFS_TRANS_ABORT;
2786         /* FALLTHROUGH */
2787
             /* error_return: cancel with the current cancel_flags. */
2788  error_return:
2789         xfs_trans_cancel(tp, cancel_flags);
2790         goto std_return;
2791 }
2792
2793
2794 /*
2795  * xfs_mkdir
2796  *
      * Create a new directory named by dentry inside the directory behind
      * dir_bdp: allocate the inode, enter its name in the parent, initialise
      * the new directory and bump the parent's link count under an
      * XFS_TRANS_MKDIR transaction.
      *
      *   dir_bdp - behavior descriptor of the parent directory.
      *   dentry  - supplies the new directory's name and name length.
      *   vap     - attributes: va_mode is used for the new inode and for the
      *             DMAPI events; va_mask/va_projid are consulted for the
      *             project id when the parent does not have
      *             XFS_DIFLAG_PROJINHERIT set.
      *   vpp     - receives the vnode of the created directory.
      *   credp   - passed to current_fsuid()/current_fsgid() and to
      *             xfs_dir_ialloc().
      *
      * Returns 0 on success or an error code: EIO after a forced shutdown,
      * EMLINK when the parent already has XFS_MAXLINK links, otherwise
      * whatever the failing callee returned.
      *
      * DMAPI: DM_EVENT_CREATE is sent up front when enabled; std_return sends
      * DM_EVENT_POSTCREATE (when enabled) if the directory was created, or if
      * the call failed after the pre-event had been sent.
2797  */
2798 STATIC int
2799 xfs_mkdir(
2800         bhv_desc_t              *dir_bdp,
2801         bhv_vname_t             *dentry,
2802         bhv_vattr_t             *vap,
2803         bhv_vnode_t             **vpp,
2804         cred_t                  *credp)
2805 {
2806         char                    *dir_name = VNAME(dentry);
2807         xfs_inode_t             *dp;
2808         xfs_inode_t             *cdp;   /* inode of created dir */
2809         bhv_vnode_t             *cvp;   /* vnode of created dir */
2810         xfs_trans_t             *tp;
2811         xfs_mount_t             *mp;
2812         int                     cancel_flags;
2813         int                     error;
2814         int                     committed;
2815         xfs_bmap_free_t         free_list;
2816         xfs_fsblock_t           first_block;
2817         bhv_vnode_t             *dir_vp;
2818         boolean_t               dp_joined_to_trans;
2819         boolean_t               created = B_FALSE;
2820         int                     dm_event_sent = 0;
2821         xfs_prid_t              prid;
2822         struct xfs_dquot        *udqp, *gdqp;
2823         uint                    resblks;
2824         int                     dm_di_mode;
2825         int                     dir_namelen;
2826
2827         dir_vp = BHV_TO_VNODE(dir_bdp);
2828         dp = XFS_BHVTOI(dir_bdp);
2829         mp = dp->i_mount;
2830
2831         if (XFS_FORCED_SHUTDOWN(mp))
2832                 return XFS_ERROR(EIO);
2833
2834         dir_namelen = VNAMELEN(dentry);
2835
2836         tp = NULL;
2837         dp_joined_to_trans = B_FALSE;
2838         dm_di_mode = vap->va_mode;
2839
2840         if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
2841                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2842                                         dir_vp, DM_RIGHT_NULL, NULL,
2843                                         DM_RIGHT_NULL, dir_name, NULL,
2844                                         dm_di_mode, 0, 0);
2845                 if (error)
2846                         return error;
2847                 dm_event_sent = 1;
2848         }
2849
2850         /* Return through std_return after this point. */
2851
2852         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2853
             /* NOTE(review): mp was already set from dp->i_mount above. */
2854         mp = dp->i_mount;
2855         udqp = gdqp = NULL;
             /*
              * Project id: inherited from the parent when it has
              * XFS_DIFLAG_PROJINHERIT, else taken from vap when supplied,
              * else dfltprid.
              */
2856         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2857                 prid = dp->i_d.di_projid;
2858         else if (vap->va_mask & XFS_AT_PROJID)
2859                 prid = (xfs_prid_t)vap->va_projid;
2860         else
2861                 prid = (xfs_prid_t)dfltprid;
2862
2863         /*
2864          * Make sure that we have allocated dquot(s) on disk.
2865          */
2866         error = XFS_QM_DQVOPALLOC(mp, dp,
2867                         current_fsuid(credp), current_fsgid(credp), prid,
2868                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2869         if (error)
2870                 goto std_return;
2871
2872         tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2873         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2874         resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
2875         error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2876                                   XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
             /* On ENOSPC retry with a zero block reservation. */
2877         if (error == ENOSPC) {
2878                 resblks = 0;
2879                 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2880                                           XFS_TRANS_PERM_LOG_RES,
2881                                           XFS_MKDIR_LOG_COUNT);
2882         }
2883         if (error) {
                     /*
                      * The parent is not locked yet: clearing dp makes
                      * error_return skip its xfs_iunlock(), and cancel_flags
                      * is zeroed because no reservation was obtained.
                      */
2884                 cancel_flags = 0;
2885                 dp = NULL;
2886                 goto error_return;
2887         }
2888
2889         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2890
2891         /*
2892          * Check for directory link count overflow.
2893          */
2894         if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2895                 error = XFS_ERROR(EMLINK);
2896                 goto error_return;
2897         }
2898
2899         /*
2900          * Reserve disk quota and the inode.
2901          */
2902         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2903         if (error)
2904                 goto error_return;
2905
             /*
              * Without a block reservation, check first that the name can be
              * entered in the directory.
              */
2906         if (resblks == 0 &&
2907             (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen)))
2908                 goto error_return;
2909         /*
2910          * create the directory inode.
2911          */
2912         error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 2,
2913                         0, credp, prid, resblks > 0,
2914                 &cdp, NULL);
2915         if (error) {
                     /* ENOSPC cancels without XFS_TRANS_ABORT; others abort. */
2916                 if (error == ENOSPC)
2917                         goto error_return;
2918                 goto abort_return;
2919         }
2920         ITRACE(cdp);
2921
2922         /*
2923          * Now we add the directory inode to the transaction.
2924          * We waited until now since xfs_dir_ialloc might start
2925          * a new transaction.  Had we joined the transaction
2926          * earlier, the locks might have gotten released.
2927          */
2928         VN_HOLD(dir_vp);
2929         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2930         dp_joined_to_trans = B_TRUE;
2931
2932         XFS_BMAP_INIT(&free_list, &first_block);
2933
2934         error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino,
2935                                    &first_block, &free_list, resblks ?
2936                                    resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2937         if (error) {
2938                 ASSERT(error != ENOSPC);
2939                 goto error1;
2940         }
2941         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2942
2943         /*
2944          * Bump the in memory version number of the parent directory
2945          * so that other processes accessing it will recognize that
2946          * the directory has changed.
2947          */
2948         dp->i_gen++;
2949
2950         error = xfs_dir_init(tp, cdp, dp);
2951         if (error)
2952                 goto error2;
2953
2954         cdp->i_gen = 1;
2955         error = xfs_bumplink(tp, dp);
2956         if (error)
2957                 goto error2;
2958
2959         cvp = XFS_ITOV(cdp);
2960
2961         created = B_TRUE;
2962
             /*
              * NOTE(review): *vpp is written and an extra reference taken
              * here, before xfs_bmap_finish()/xfs_trans_commit().  If either
              * fails the reference is dropped again, but *vpp and created
              * keep their values - callers should rely on the return code.
              */
2963         *vpp = cvp;
2964         IHOLD(cdp);
2965
2966         /*
2967          * Attach the dquots to the new inode and modify the icount incore.
2968          */
2969         XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2970
2971         /*
2972          * If this is a synchronous mount, make sure that the
2973          * mkdir transaction goes to disk before returning to
2974          * the user.
2975          */
2976         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2977                 xfs_trans_set_sync(tp);
2978         }
2979
2980         error = xfs_bmap_finish(&tp, &free_list, &committed);
2981         if (error) {
2982                 IRELE(cdp);
2983                 goto error2;
2984         }
2985
2986         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2987         XFS_QM_DQRELE(mp, udqp);
2988         XFS_QM_DQRELE(mp, gdqp);
2989         if (error) {
2990                 IRELE(cdp);
2991         }
2992
2993         /* Fall through to std_return with error = 0 or errno from
2994          * xfs_trans_commit. */
2995
2996 std_return:
2997         if ((created || (error != 0 && dm_event_sent != 0)) &&
2998             DM_EVENT_ENABLED(XFS_BHVTOI(dir_bdp), DM_EVENT_POSTCREATE)) {
2999                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
3000                                         dir_vp, DM_RIGHT_NULL,
3001                                         created ? XFS_ITOV(cdp):NULL,
3002                                         DM_RIGHT_NULL,
3003                                         dir_name, NULL,
3004                                         dm_di_mode, error, 0);
3005         }
3006         return error;
3007
             /*
              * Error labels share one cleanup sequence, entered at different
              * depths: error2/error1 also cancel the bmap free list,
              * abort_return adds XFS_TRANS_ABORT, and error_return cancels
              * the transaction, releases the dquots and unlocks the parent
              * if it was locked but never joined to the transaction.
              */
3008  error2:
3009  error1:
3010         xfs_bmap_cancel(&free_list);
3011  abort_return:
3012         cancel_flags |= XFS_TRANS_ABORT;
3013  error_return:
3014         xfs_trans_cancel(tp, cancel_flags);
3015         XFS_QM_DQRELE(mp, udqp);
3016         XFS_QM_DQRELE(mp, gdqp);
3017
3018         if (!dp_joined_to_trans && (dp != NULL)) {
3019                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3020         }
3021
3022         goto std_return;
3023 }
3024
3025
3026 /*
3027  * xfs_rmdir
3028  *
3029  */
3030 STATIC int
3031 xfs_rmdir(
3032         bhv_desc_t              *dir_bdp,
3033         bhv_vname_t             *dentry,
3034         cred_t                  *credp)
3035 {
3036         char                    *name = VNAME(dentry);
3037         xfs_inode_t             *dp;
3038         xfs_inode_t             *cdp;   /* child directory */
3039         xfs_trans_t             *tp;
3040         xfs_mount_t             *mp;
3041         int                     error;
3042         xfs_bmap_free_t         free_list;
3043         xfs_fsblock_t           first_block;
3044         int                     cancel_flags;
3045         int                     committed;
3046         bhv_vnode_t             *dir_vp;
3047         int                     dm_di_mode = S_IFDIR;
3048         int                     last_cdp_link;
3049         int                     namelen;
3050         uint                    resblks;
3051
3052         dir_vp = BHV_TO_VNODE(dir_bdp);
3053         dp = XFS_BHVTOI(dir_bdp);
3054         mp = dp->i_mount;
3055
3056         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3057
3058         if (XFS_FORCED_SHUTDOWN(XFS_BHVTOI(dir_bdp)->i_mount))
3059                 return XFS_ERROR(EIO);
3060         namelen = VNAMELEN(dentry);
3061
3062         if (!xfs_get_dir_entry(dentry, &cdp)) {
3063                 dm_di_mode = cdp->i_d.di_mode;
3064                 IRELE(cdp);
3065         }
3066
3067         if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
3068                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
3069                                         dir_vp, DM_RIGHT_NULL,
3070                                         NULL, DM_RIGHT_NULL,
3071                                         name, NULL, dm_di_mode, 0, 0);
3072                 if (error)
3073                         return XFS_ERROR(error);
3074         }
3075
3076         /* Return through std_return after this point. */
3077
3078         cdp = NULL;
3079
3080         /*
3081          * We need to get a reference to cdp before we get our log
3082          * reservation.  The reason for this is that we cannot call
3083          * xfs_iget for an inode for which we do not have a reference
3084          * once we've acquired a log reservation.  This is because the
3085          * inode we are trying to get might be in xfs_inactive going
3086          * for a log reservation.  Since we'll have to wait for the
3087          * inactive code to complete before returning from xfs_iget,
3088          * we need to make sure that we don't have log space reserved
3089          * when we call xfs_iget.  Instead we get an unlocked reference
3090          * to the inode before getting our log reservation.
3091          */
3092         error = xfs_get_dir_entry(dentry, &cdp);
3093         if (error) {
3094                 REMOVE_DEBUG_TRACE(__LINE__);
3095                 goto std_return;
3096         }
3097         mp = dp->i_mount;
3098         dm_di_mode = cdp->i_d.di_mode;
3099
3100         /*
3101          * Get the dquots for the inodes.
3102          */
3103         error = XFS_QM_DQATTACH(mp, dp, 0);
3104         if (!error && dp != cdp)
3105                 error = XFS_QM_DQATTACH(mp, cdp, 0);
3106         if (error) {
3107                 IRELE(cdp);
3108                 REMOVE_DEBUG_TRACE(__LINE__);
3109                 goto std_return;
3110         }
3111
3112         tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
3113         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3114         /*
3115          * We try to get the real space reservation first,
3116          * allowing for directory btree deletion(s) implying
3117          * possible bmap insert(s).  If we can't get the space
3118          * reservation then we use 0 instead, and avoid the bmap
3119          * btree insert(s) in the directory code by, if the bmap
3120          * insert tries to happen, instead trimming the LAST
3121          * block from the directory.
3122          */
3123         resblks = XFS_REMOVE_SPACE_RES(mp);
3124         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
3125                         XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3126         if (error == ENOSPC) {
3127                 resblks = 0;
3128                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
3129                                 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3130         }
3131         if (error) {
3132                 ASSERT(error != ENOSPC);
3133                 cancel_flags = 0;
3134                 IRELE(cdp);
3135                 goto error_return;
3136         }
3137         XFS_BMAP_INIT(&free_list, &first_block);
3138
3139         /*
3140          * Now lock the child directory inode and the parent directory
3141          * inode in the proper order.  This will take care of validating
3142          * that the directory entry for the child directory inode has
3143          * not changed while we were obtaining a log reservation.
3144          */
3145         error = xfs_lock_dir_and_entry(dp, cdp);
3146         if (error) {
3147                 xfs_trans_cancel(tp, cancel_flags);
3148                 IRELE(cdp);
3149                 goto std_return;
3150         }
3151
3152         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3153         if (dp != cdp) {
3154                 /*
3155                  * Only increment the parent directory vnode count if
3156                  * we didn't bump it in looking up cdp.  The only time
3157                  * we don't bump it is when we're looking up ".".
3158                  */
3159                 VN_HOLD(dir_vp);
3160         }
3161
3162         ITRACE(cdp);
3163         xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
3164
3165         ASSERT(cdp->i_d.di_nlink >= 2);
3166         if (cdp->i_d.di_nlink != 2) {
3167                 error = XFS_ERROR(ENOTEMPTY);
3168                 goto error_return;
3169         }
3170         if (!xfs_dir_isempty(cdp)) {
3171                 error = XFS_ERROR(ENOTEMPTY);
3172                 goto error_return;
3173         }
3174
3175         error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino,
3176                                         &first_block, &free_list, resblks);
3177         if (error)
3178                 goto error1;
3179
3180         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3181
3182         /*
3183          * Bump the in memory generation count on the parent
3184          * directory so that other can know that it has changed.
3185          */
3186         dp->i_gen++;
3187
3188         /*
3189          * Drop the link from cdp's "..".
3190          */
3191         error = xfs_droplink(tp, dp);
3192         if (error) {
3193                 goto error1;
3194         }
3195
3196         /*
3197          * Drop the link from dp to cdp.
3198          */
3199         error = xfs_droplink(tp, cdp);
3200         if (error) {
3201                 goto error1;
3202         }
3203
3204         /*
3205          * Drop the "." link from cdp to self.
3206          */
3207         error = xfs_droplink(tp, cdp);
3208         if (error) {
3209                 goto error1;
3210         }
3211
3212         /* Determine these before committing transaction */
3213         last_cdp_link = (cdp)->i_d.di_nlink==0;
3214
3215         /*
3216          * Take an extra ref on the child vnode so that it
3217          * does not go to xfs_inactive() from within the commit.
3218          */
3219         IHOLD(cdp);
3220
3221         /*
3222          * If this is a synchronous mount, make sure that the
3223          * rmdir transaction goes to disk before returning to
3224          * the user.
3225          */
3226         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3227                 xfs_trans_set_sync(tp);
3228         }
3229
3230         error = xfs_bmap_finish (&tp, &free_list, &committed);
3231         if (error) {
3232                 xfs_bmap_cancel(&free_list);
3233                 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
3234                                  XFS_TRANS_ABORT));
3235                 IRELE(cdp);
3236                 goto std_return;
3237         }
3238
3239         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3240         if (error) {
3241                 IRELE(cdp);
3242                 goto std_return;
3243         }
3244
3245
3246         /*
3247          * Let interposed file systems know about removed links.
3248          */
3249         bhv_vop_link_removed(XFS_ITOV(cdp), dir_vp, last_cdp_link);
3250
3251         IRELE(cdp);
3252
3253         /* Fall through to std_return with error = 0 or the errno
3254          * from xfs_trans_commit. */
3255  std_return:
3256         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
3257                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3258                                         dir_vp, DM_RIGHT_NULL,
3259                                         NULL, DM_RIGHT_NULL,
3260                                         name, NULL, dm_di_mode,
3261                                         error, 0);
3262         }
3263         return error;
3264
3265  error1:
3266         xfs_bmap_cancel(&free_list);
3267         cancel_flags |= XFS_TRANS_ABORT;
3268         /* FALLTHROUGH */
3269
3270  error_return:
3271         xfs_trans_cancel(tp, cancel_flags);
3272         goto std_return;
3273 }
3274
3275 STATIC int
3276 xfs_symlink(
3277         bhv_desc_t              *dir_bdp,
3278         bhv_vname_t             *dentry,
3279         bhv_vattr_t             *vap,
3280         char                    *target_path,
3281         bhv_vnode_t             **vpp,
3282         cred_t                  *credp)
3283 {
3284         xfs_trans_t             *tp;
3285         xfs_mount_t             *mp;
3286         xfs_inode_t             *dp;
3287         xfs_inode_t             *ip;
3288         int                     error;
3289         int                     pathlen;
3290         xfs_bmap_free_t         free_list;
3291         xfs_fsblock_t           first_block;
3292         boolean_t               dp_joined_to_trans;
3293         bhv_vnode_t             *dir_vp;
3294         uint                    cancel_flags;
3295         int                     committed;
3296         xfs_fileoff_t           first_fsb;
3297         xfs_filblks_t           fs_blocks;
3298         int                     nmaps;
3299         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
3300         xfs_daddr_t             d;
3301         char                    *cur_chunk;
3302         int                     byte_cnt;
3303         int                     n;
3304         xfs_buf_t               *bp;
3305         xfs_prid_t              prid;
3306         struct xfs_dquot        *udqp, *gdqp;
3307         uint                    resblks;
3308         char                    *link_name = VNAME(dentry);
3309         int                     link_namelen;
3310
3311         *vpp = NULL;
3312         dir_vp = BHV_TO_VNODE(dir_bdp);
3313         dp = XFS_BHVTOI(dir_bdp);
3314         dp_joined_to_trans = B_FALSE;
3315         error = 0;
3316         ip = NULL;
3317         tp = NULL;
3318
3319         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3320
3321         mp = dp->i_mount;
3322
3323         if (XFS_FORCED_SHUTDOWN(mp))
3324                 return XFS_ERROR(EIO);
3325
3326         link_namelen = VNAMELEN(dentry);
3327
3328         /*
3329          * Check component lengths of the target path name.
3330          */
3331         pathlen = strlen(target_path);
3332         if (pathlen >= MAXPATHLEN)      /* total string too long */
3333                 return XFS_ERROR(ENAMETOOLONG);
3334         if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
3335                 int len, total;
3336                 char *path;
3337
3338                 for (total = 0, path = target_path; total < pathlen;) {
3339                         /*
3340                          * Skip any slashes.
3341                          */
3342                         while(*path == '/') {
3343                                 total++;
3344                                 path++;
3345                         }
3346
3347                         /*
3348                          * Count up to the next slash or end of path.
3349                          * Error out if the component is bigger than MAXNAMELEN.
3350                          */
3351                         for(len = 0; *path != '/' && total < pathlen;total++, path++) {
3352                                 if (++len >= MAXNAMELEN) {
3353                                         error = ENAMETOOLONG;
3354                                         return error;
3355                                 }
3356                         }
3357                 }
3358         }
3359
3360         if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
3361                 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
3362                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3363                                         link_name, target_path, 0, 0, 0);
3364                 if (error)
3365                         return error;
3366         }
3367
3368         /* Return through std_return after this point. */
3369
3370         udqp = gdqp = NULL;
3371         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
3372                 prid = dp->i_d.di_projid;
3373         else if (vap->va_mask & XFS_AT_PROJID)
3374                 prid = (xfs_prid_t)vap->va_projid;
3375         else
3376                 prid = (xfs_prid_t)dfltprid;
3377
3378         /*
3379          * Make sure that we have allocated dquot(s) on disk.
3380          */
3381         error = XFS_QM_DQVOPALLOC(mp, dp,
3382                         current_fsuid(credp), current_fsgid(credp), prid,
3383                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
3384         if (error)
3385                 goto std_return;
3386
3387         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
3388         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3389         /*
3390          * The symlink will fit into the inode data fork?
3391          * There can't be any attributes so we get the whole variable part.
3392          */
3393         if (pathlen <= XFS_LITINO(mp))
3394                 fs_blocks = 0;
3395         else
3396                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3397         resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
3398         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3399                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3400         if (error == ENOSPC && fs_blocks == 0) {
3401                 resblks = 0;
3402                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3403                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3404         }
3405         if (error) {
3406                 cancel_flags = 0;
3407                 dp = NULL;
3408                 goto error_return;
3409         }
3410
3411         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
3412
3413         /*
3414          * Check whether the directory allows new symlinks or not.
3415          */
3416         if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
3417                 error = XFS_ERROR(EPERM);
3418                 goto error_return;
3419         }
3420
3421         /*
3422          * Reserve disk quota : blocks and inode.
3423          */
3424         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3425         if (error)
3426                 goto error_return;
3427
3428         /*
3429          * Check for ability to enter directory entry, if no space reserved.
3430          */
3431         if (resblks == 0 &&
3432             (error = xfs_dir_canenter(tp, dp, link_name, link_namelen)))
3433                 goto error_return;
3434         /*
3435          * Initialize the bmap freelist prior to calling either
3436          * bmapi or the directory create code.
3437          */
3438         XFS_BMAP_INIT(&free_list, &first_block);
3439
3440         /*
3441          * Allocate an inode for the symlink.
3442          */
3443         error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (vap->va_mode&~S_IFMT),
3444                                1, 0, credp, prid, resblks > 0, &ip, NULL);
3445         if (error) {
3446                 if (error == ENOSPC)
3447                         goto error_return;
3448                 goto error1;
3449         }
3450         ITRACE(ip);
3451
3452         VN_HOLD(dir_vp);
3453         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3454         dp_joined_to_trans = B_TRUE;
3455
3456         /*
3457          * Also attach the dquot(s) to it, if applicable.
3458          */
3459         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3460
3461         if (resblks)
3462                 resblks -= XFS_IALLOC_SPACE_RES(mp);
3463         /*
3464          * If the symlink will fit into the inode, write it inline.
3465          */
3466         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3467                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3468                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3469                 ip->i_d.di_size = pathlen;
3470
3471                 /*
3472                  * The inode was initially created in extent format.
3473                  */
3474                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3475                 ip->i_df.if_flags |= XFS_IFINLINE;
3476
3477                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3478                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3479
3480         } else {
3481                 first_fsb = 0;
3482                 nmaps = SYMLINK_MAPS;
3483
3484                 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3485                                   XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3486                                   &first_block, resblks, mval, &nmaps,
3487                                   &free_list, NULL);
3488                 if (error) {
3489                         goto error1;
3490                 }
3491
3492                 if (resblks)
3493                         resblks -= fs_blocks;
3494                 ip->i_d.di_size = pathlen;
3495                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3496
3497                 cur_chunk = target_path;
3498                 for (n = 0; n < nmaps; n++) {
3499                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3500                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3501                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3502                                                BTOBB(byte_cnt), 0);
3503                         ASSERT(bp && !XFS_BUF_GETERROR(bp));
3504                         if (pathlen < byte_cnt) {
3505                                 byte_cnt = pathlen;
3506                         }
3507                         pathlen -= byte_cnt;
3508
3509                         memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3510                         cur_chunk += byte_cnt;
3511
3512                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3513                 }
3514         }
3515
3516         /*
3517          * Create the directory entry for the symlink.
3518          */
3519         error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino,
3520                                    &first_block, &free_list, resblks);
3521         if (error)
3522                 goto error1;
3523         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3524         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3525
3526         /*
3527          * Bump the in memory version number of the parent directory
3528          * so that other processes accessing it will recognize that
3529          * the directory has changed.
3530          */
3531         dp->i_gen++;
3532
3533         /*
3534          * If this is a synchronous mount, make sure that the
3535          * symlink transaction goes to disk before returning to
3536          * the user.
3537          */
3538         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3539                 xfs_trans_set_sync(tp);
3540         }
3541
3542         /*
3543          * xfs_trans_commit normally decrements the vnode ref count
3544          * when it unlocks the inode. Since we want to return the
3545          * vnode to the caller, we bump the vnode ref count now.
3546          */
3547         IHOLD(ip);
3548
3549         error = xfs_bmap_finish(&tp, &free_list, &committed);
3550         if (error) {
3551                 goto error2;
3552         }
3553         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3554         XFS_QM_DQRELE(mp, udqp);
3555         XFS_QM_DQRELE(mp, gdqp);
3556
3557         /* Fall through to std_return with error = 0 or errno from
3558          * xfs_trans_commit     */
3559 std_return:
3560         if (DM_EVENT_ENABLED(XFS_BHVTOI(dir_bdp), DM_EVENT_POSTSYMLINK)) {
3561                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3562                                         dir_vp, DM_RIGHT_NULL,
3563                                         error ? NULL : XFS_ITOV(ip),
3564                                         DM_RIGHT_NULL, link_name, target_path,
3565                                         0, error, 0);
3566         }
3567
3568         if (!error) {
3569                 bhv_vnode_t *vp;
3570
3571                 ASSERT(ip);
3572                 vp = XFS_ITOV(ip);
3573                 *vpp = vp;
3574         }
3575         return error;
3576
3577  error2:
3578         IRELE(ip);
3579  error1:
3580         xfs_bmap_cancel(&free_list);
3581         cancel_flags |= XFS_TRANS_ABORT;
3582  error_return:
3583         xfs_trans_cancel(tp, cancel_flags);
3584         XFS_QM_DQRELE(mp, udqp);
3585         XFS_QM_DQRELE(mp, gdqp);
3586
3587         if (!dp_joined_to_trans && (dp != NULL)) {
3588                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3589         }
3590
3591         goto std_return;
3592 }
3593
3594
3595 /*
3596  * xfs_fid2
3597  *
3598  * A fid routine that takes a pointer to a previously allocated
3599  * fid structure (like xfs_fast_fid) but uses a 64 bit inode number.
3600  */
3601 STATIC int
3602 xfs_fid2(
3603         bhv_desc_t      *bdp,
3604         fid_t           *fidp)
3605 {
3606         xfs_inode_t     *ip;
3607         xfs_fid2_t      *xfid;
3608
3609         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
3610                                        (inst_t *)__return_address);
3611         ASSERT(sizeof(fid_t) >= sizeof(xfs_fid2_t));
3612
3613         xfid = (xfs_fid2_t *)fidp;
3614         ip = XFS_BHVTOI(bdp);
3615         xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len);
3616         xfid->fid_pad = 0;
3617         /*
3618          * use memcpy because the inode is a long long and there's no
3619          * assurance that xfid->fid_ino is properly aligned.
3620          */
3621         memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino));
3622         xfid->fid_gen = ip->i_d.di_gen;
3623
3624         return 0;
3625 }
3626
3627
3628 /*
3629  * xfs_rwlock
3630  */
3631 int
3632 xfs_rwlock(
3633         bhv_desc_t      *bdp,
3634         bhv_vrwlock_t   locktype)
3635 {
3636         xfs_inode_t     *ip;
3637         bhv_vnode_t     *vp;
3638
3639         vp = BHV_TO_VNODE(bdp);
3640         if (VN_ISDIR(vp))
3641                 return 1;
3642         ip = XFS_BHVTOI(bdp);
3643         if (locktype == VRWLOCK_WRITE) {
3644                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3645         } else if (locktype == VRWLOCK_TRY_READ) {
3646                 return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
3647         } else if (locktype == VRWLOCK_TRY_WRITE) {
3648                 return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
3649         } else {
3650                 ASSERT((locktype == VRWLOCK_READ) ||
3651                        (locktype == VRWLOCK_WRITE_DIRECT));
3652                 xfs_ilock(ip, XFS_IOLOCK_SHARED);
3653         }
3654
3655         return 1;
3656 }
3657
3658
3659 /*
3660  * xfs_rwunlock
3661  */
3662 void
3663 xfs_rwunlock(
3664         bhv_desc_t      *bdp,
3665         bhv_vrwlock_t   locktype)
3666 {
3667         xfs_inode_t     *ip;
3668         bhv_vnode_t     *vp;
3669
3670         vp = BHV_TO_VNODE(bdp);
3671         if (VN_ISDIR(vp))
3672                 return;
3673         ip = XFS_BHVTOI(bdp);
3674         if (locktype == VRWLOCK_WRITE) {
3675                 /*
3676                  * In the write case, we may have added a new entry to
3677                  * the reference cache.  This might store a pointer to
3678                  * an inode to be released in this inode.  If it is there,
3679                  * clear the pointer and release the inode after unlocking
3680                  * this one.
3681                  */
3682                 xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
3683         } else {
3684                 ASSERT((locktype == VRWLOCK_READ) ||
3685                        (locktype == VRWLOCK_WRITE_DIRECT));
3686                 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3687         }
3688         return;
3689 }
3690
3691 STATIC int
3692 xfs_inode_flush(
3693         bhv_desc_t      *bdp,
3694         int             flags)
3695 {
3696         xfs_inode_t     *ip;
3697         xfs_mount_t     *mp;
3698         xfs_inode_log_item_t *iip;
3699         int             error = 0;
3700
3701         ip = XFS_BHVTOI(bdp);
3702         mp = ip->i_mount;
3703         iip = ip->i_itemp;
3704
3705         if (XFS_FORCED_SHUTDOWN(mp))
3706                 return XFS_ERROR(EIO);
3707
3708         /*
3709          * Bypass inodes which have already been cleaned by
3710          * the inode flush clustering code inside xfs_iflush
3711          */
3712         if ((ip->i_update_core == 0) &&
3713             ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
3714                 return 0;
3715
3716         if (flags & FLUSH_LOG) {
3717                 if (iip && iip->ili_last_lsn) {
3718                         xlog_t          *log = mp->m_log;
3719                         xfs_lsn_t       sync_lsn;
3720                         int             s, log_flags = XFS_LOG_FORCE;
3721
3722                         s = GRANT_LOCK(log);
3723                         sync_lsn = log->l_last_sync_lsn;
3724                         GRANT_UNLOCK(log, s);
3725
3726                         if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) > 0)) {
3727                                 if (flags & FLUSH_SYNC)
3728                                         log_flags |= XFS_LOG_SYNC;
3729                                 error = xfs_log_force(mp, iip->ili_last_lsn, log_flags);
3730                                 if (error)
3731                                         return error;
3732                         }
3733
3734                         if (ip->i_update_core == 0)
3735                                 return 0;
3736                 }
3737         }
3738
3739         /*
3740          * We make this non-blocking if the inode is contended,
3741          * return EAGAIN to indicate to the caller that they
3742          * did not succeed. This prevents the flush path from
3743          * blocking on inodes inside another operation right
3744          * now, they get caught later by xfs_sync.
3745          */
3746         if (flags & FLUSH_INODE) {
3747                 int     flush_flags;
3748
3749                 if (flags & FLUSH_SYNC) {
3750                         xfs_ilock(ip, XFS_ILOCK_SHARED);
3751                         xfs_iflock(ip);
3752                 } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3753                         if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3754                                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3755                                 return EAGAIN;
3756                         }
3757                 } else {
3758                         return EAGAIN;
3759                 }
3760
3761                 if (flags & FLUSH_SYNC)
3762                         flush_flags = XFS_IFLUSH_SYNC;
3763                 else
3764                         flush_flags = XFS_IFLUSH_ASYNC;
3765
3766                 error = xfs_iflush(ip, flush_flags);
3767                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3768         }
3769
3770         return error;
3771 }
3772
3773 int
3774 xfs_set_dmattrs (
3775         bhv_desc_t      *bdp,
3776         u_int           evmask,
3777         u_int16_t       state,
3778         cred_t          *credp)
3779 {
3780         xfs_inode_t     *ip;
3781         xfs_trans_t     *tp;
3782         xfs_mount_t     *mp;
3783         int             error;
3784
3785         if (!capable(CAP_SYS_ADMIN))
3786                 return XFS_ERROR(EPERM);
3787
3788         ip = XFS_BHVTOI(bdp);
3789         mp = ip->i_mount;
3790
3791         if (XFS_FORCED_SHUTDOWN(mp))
3792                 return XFS_ERROR(EIO);
3793
3794         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3795         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3796         if (error) {
3797                 xfs_trans_cancel(tp, 0);
3798                 return error;
3799         }
3800         xfs_ilock(ip, XFS_ILOCK_EXCL);
3801         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3802
3803         ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask;
3804         ip->i_iocore.io_dmstate  = ip->i_d.di_dmstate  = state;
3805
3806         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3807         IHOLD(ip);
3808         error = xfs_trans_commit(tp, 0);
3809
3810         return error;
3811 }
3812
3813 STATIC int
3814 xfs_reclaim(
3815         bhv_desc_t      *bdp)
3816 {
3817         xfs_inode_t     *ip;
3818         bhv_vnode_t     *vp;
3819
3820         vp = BHV_TO_VNODE(bdp);
3821         ip = XFS_BHVTOI(bdp);
3822
3823         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
3824
3825         ASSERT(!VN_MAPPED(vp));
3826
3827         /* bad inode, get out here ASAP */
3828         if (VN_BAD(vp)) {
3829                 xfs_ireclaim(ip);
3830                 return 0;
3831         }
3832
3833         vn_iowait(vp);
3834
3835         ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
3836
3837         /*
3838          * Make sure the atime in the XFS inode is correct before freeing the
3839          * Linux inode.
3840          */
3841         xfs_synchronize_atime(ip);
3842
3843         /*
3844          * If we have nothing to flush with this inode then complete the
3845          * teardown now, otherwise break the link between the xfs inode and the
3846          * linux inode and clean up the xfs inode later. This avoids flushing
3847          * the inode to disk during the delete operation itself.
3848          *
3849          * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
3850          * first to ensure that xfs_iunpin() will never see an xfs inode
3851          * that has a linux inode being reclaimed. Synchronisation is provided
3852          * by the i_flags_lock.
3853          */
3854         if (!ip->i_update_core && (ip->i_itemp == NULL)) {
3855                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3856                 xfs_iflock(ip);
3857                 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3858         } else {
3859                 xfs_mount_t     *mp = ip->i_mount;
3860
3861                 /* Protect sync and unpin from us */
3862                 XFS_MOUNT_ILOCK(mp);
3863                 spin_lock(&ip->i_flags_lock);
3864                 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
3865                 vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
3866                 spin_unlock(&ip->i_flags_lock);
3867                 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
3868                 XFS_MOUNT_IUNLOCK(mp);
3869         }
3870         return 0;
3871 }
3872
3873 int
3874 xfs_finish_reclaim(
3875         xfs_inode_t     *ip,
3876         int             locked,
3877         int             sync_mode)
3878 {
3879         xfs_ihash_t     *ih = ip->i_hash;
3880         bhv_vnode_t     *vp = XFS_ITOV_NULL(ip);
3881         int             error;
3882
3883         if (vp && VN_BAD(vp))
3884                 goto reclaim;
3885
3886         /* The hash lock here protects a thread in xfs_iget_core from
3887          * racing with us on linking the inode back with a vnode.
3888          * Once we have the XFS_IRECLAIM flag set it will not touch
3889          * us.
3890          */
3891         write_lock(&ih->ih_lock);
3892         spin_lock(&ip->i_flags_lock);
3893         if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
3894             (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
3895                 spin_unlock(&ip->i_flags_lock);
3896                 write_unlock(&ih->ih_lock);
3897                 if (locked) {
3898                         xfs_ifunlock(ip);
3899                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3900                 }
3901                 return 1;
3902         }
3903         __xfs_iflags_set(ip, XFS_IRECLAIM);
3904         spin_unlock(&ip->i_flags_lock);
3905         write_unlock(&ih->ih_lock);
3906
3907         /*
3908          * If the inode is still dirty, then flush it out.  If the inode
3909          * is not in the AIL, then it will be OK to flush it delwri as
3910          * long as xfs_iflush() does not keep any references to the inode.
3911          * We leave that decision up to xfs_iflush() since it has the
3912          * knowledge of whether it's OK to simply do a delwri flush of
3913          * the inode or whether we need to wait until the inode is
3914          * pulled from the AIL.
3915          * We get the flush lock regardless, though, just to make sure
3916          * we don't free it while it is being flushed.
3917          */
3918         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3919                 if (!locked) {
3920                         xfs_ilock(ip, XFS_ILOCK_EXCL);
3921                         xfs_iflock(ip);
3922                 }
3923
3924                 if (ip->i_update_core ||
3925                     ((ip->i_itemp != NULL) &&
3926                      (ip->i_itemp->ili_format.ilf_fields != 0))) {
3927                         error = xfs_iflush(ip, sync_mode);
3928                         /*
3929                          * If we hit an error, typically because of filesystem
3930                          * shutdown, we don't need to let vn_reclaim to know
3931                          * because we're gonna reclaim the inode anyway.
3932                          */
3933                         if (error) {
3934                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3935                                 goto reclaim;
3936                         }
3937                         xfs_iflock(ip); /* synchronize with xfs_iflush_done */
3938                 }
3939
3940                 ASSERT(ip->i_update_core == 0);
3941                 ASSERT(ip->i_itemp == NULL ||
3942                        ip->i_itemp->ili_format.ilf_fields == 0);
3943                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3944         } else if (locked) {
3945                 /*
3946                  * We are not interested in doing an iflush if we're
3947                  * in the process of shutting down the filesystem forcibly.
3948                  * So, just reclaim the inode.
3949                  */
3950                 xfs_ifunlock(ip);
3951                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3952         }
3953
3954  reclaim:
3955         xfs_ireclaim(ip);
3956         return 0;
3957 }
3958
3959 int
3960 xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
3961 {
3962         int             purged;
3963         xfs_inode_t     *ip, *n;
3964         int             done = 0;
3965
3966         while (!done) {
3967                 purged = 0;
3968                 XFS_MOUNT_ILOCK(mp);
3969                 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
3970                         if (noblock) {
3971                                 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
3972                                         continue;
3973                                 if (xfs_ipincount(ip) ||
3974                                     !xfs_iflock_nowait(ip)) {
3975                                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3976                                         continue;
3977                                 }
3978                         }
3979                         XFS_MOUNT_IUNLOCK(mp);
3980                         if (xfs_finish_reclaim(ip, noblock,
3981                                         XFS_IFLUSH_DELWRI_ELSE_ASYNC))
3982                                 delay(1);
3983                         purged = 1;
3984                         break;
3985                 }
3986
3987                 done = !purged;
3988         }
3989
3990         XFS_MOUNT_IUNLOCK(mp);
3991         return 0;
3992 }
3993
3994 /*
3995  * xfs_alloc_file_space()
3996  *      This routine allocates disk space for the given file.
3997  *
3998  *      If alloc_type == 0, this request is for an ALLOCSP type
3999  *      request which will change the file size.  In this case, no
4000  *      DMAPI event will be generated by the call.  A TRUNCATE event
4001  *      will be generated later by xfs_setattr.
4002  *
4003  *      If alloc_type != 0, this request is for a RESVSP type
4004  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
4005  *      lower block boundary byte address is less than the file's
4006  *      length.
4007  *
4008  * RETURNS:
4009  *       0 on success
4010  *      errno on error
4011  *
4012  */
4013 STATIC int
4014 xfs_alloc_file_space(
4015         xfs_inode_t             *ip,
4016         xfs_off_t               offset,
4017         xfs_off_t               len,
4018         int                     alloc_type,
4019         int                     attr_flags)
4020 {
4021         xfs_mount_t             *mp = ip->i_mount;
4022         xfs_off_t               count;
4023         xfs_filblks_t           allocated_fsb;
4024         xfs_filblks_t           allocatesize_fsb;
4025         xfs_extlen_t            extsz, temp;
4026         xfs_fileoff_t           startoffset_fsb;
4027         xfs_fsblock_t           firstfsb;
4028         int                     nimaps;
4029         int                     bmapi_flag;
4030         int                     quota_flag;
4031         int                     rt;
4032         xfs_trans_t             *tp;
4033         xfs_bmbt_irec_t         imaps[1], *imapp;
4034         xfs_bmap_free_t         free_list;
4035         uint                    qblocks, resblks, resrtextents;
4036         int                     committed;
4037         int                     error;
4038
4039         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
4040
4041         if (XFS_FORCED_SHUTDOWN(mp))
4042                 return XFS_ERROR(EIO);
4043
4044         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4045                 return error;
4046
4047         if (len <= 0)
4048                 return XFS_ERROR(EINVAL);
4049
4050         rt = XFS_IS_REALTIME_INODE(ip);
4051         extsz = xfs_get_extsz_hint(ip);
4052
4053         count = len;
4054         imapp = &imaps[0];
4055         nimaps = 1;
4056         bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
4057         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
4058         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
4059
4060         /*      Generate a DMAPI event if needed.       */
4061         if (alloc_type != 0 && offset < ip->i_size &&
4062                         (attr_flags&ATTR_DMI) == 0  &&
4063                         DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
4064                 xfs_off_t           end_dmi_offset;
4065
4066                 end_dmi_offset = offset+len;
4067                 if (end_dmi_offset > ip->i_size)
4068                         end_dmi_offset = ip->i_size;
4069                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
4070                         offset, end_dmi_offset - offset,
4071                         0, NULL);
4072                 if (error)
4073                         return error;
4074         }
4075
4076         /*
4077          * Allocate file space until done or until there is an error
4078          */
4079 retry:
4080         while (allocatesize_fsb && !error) {
4081                 xfs_fileoff_t   s, e;
4082
4083                 /*
4084                  * Determine space reservations for data/realtime.
4085                  */
4086                 if (unlikely(extsz)) {
4087                         s = startoffset_fsb;
4088                         do_div(s, extsz);
4089                         s *= extsz;
4090                         e = startoffset_fsb + allocatesize_fsb;
4091                         if ((temp = do_mod(startoffset_fsb, extsz)))
4092                                 e += temp;
4093                         if ((temp = do_mod(e, extsz)))
4094                                 e += extsz - temp;
4095                 } else {
4096                         s = 0;
4097                         e = allocatesize_fsb;
4098                 }
4099
4100                 if (unlikely(rt)) {
4101                         resrtextents = qblocks = (uint)(e - s);
4102                         resrtextents /= mp->m_sb.sb_rextsize;
4103                         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4104                         quota_flag = XFS_QMOPT_RES_RTBLKS;
4105                 } else {
4106                         resrtextents = 0;
4107                         resblks = qblocks = \
4108                                 XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
4109                         quota_flag = XFS_QMOPT_RES_REGBLKS;
4110                 }
4111
4112                 /*
4113                  * Allocate and setup the transaction.
4114                  */
4115                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4116                 error = xfs_trans_reserve(tp, resblks,
4117                                           XFS_WRITE_LOG_RES(mp), resrtextents,
4118                                           XFS_TRANS_PERM_LOG_RES,
4119                                           XFS_WRITE_LOG_COUNT);
4120                 /*
4121                  * Check for running out of space
4122                  */
4123                 if (error) {
4124                         /*
4125                          * Free the transaction structure.
4126                          */
4127                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4128                         xfs_trans_cancel(tp, 0);
4129                         break;
4130                 }
4131                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4132                 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
4133                                                       qblocks, 0, quota_flag);
4134                 if (error)
4135                         goto error1;
4136
4137                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4138                 xfs_trans_ihold(tp, ip);
4139
4140                 /*
4141                  * Issue the xfs_bmapi() call to allocate the blocks
4142                  */
4143                 XFS_BMAP_INIT(&free_list, &firstfsb);
4144                 error = XFS_BMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
4145                                   allocatesize_fsb, bmapi_flag,
4146                                   &firstfsb, 0, imapp, &nimaps,
4147                                   &free_list, NULL);
4148                 if (error) {
4149                         goto error0;
4150                 }
4151
4152                 /*
4153                  * Complete the transaction
4154                  */
4155                 error = xfs_bmap_finish(&tp, &free_list, &committed);
4156                 if (error) {
4157                         goto error0;
4158                 }
4159
4160                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
4161                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4162                 if (error) {
4163                         break;
4164                 }
4165
4166                 allocated_fsb = imapp->br_blockcount;
4167
4168                 if (nimaps == 0) {
4169                         error = XFS_ERROR(ENOSPC);
4170                         break;
4171                 }
4172
4173                 startoffset_fsb += allocated_fsb;
4174                 allocatesize_fsb -= allocated_fsb;
4175         }
4176 dmapi_enospc_check:
4177         if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 &&
4178             DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
4179                 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
4180                                 XFS_ITOV(ip), DM_RIGHT_NULL,
4181                                 XFS_ITOV(ip), DM_RIGHT_NULL,
4182                                 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
4183                 if (error == 0)
4184                         goto retry;     /* Maybe DMAPI app. has made space */
4185                 /* else fall through with error from XFS_SEND_DATA */
4186         }
4187
4188         return error;
4189
4190 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
4191         xfs_bmap_cancel(&free_list);
4192         XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
4193
4194 error1: /* Just cancel transaction */
4195         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4196         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4197         goto dmapi_enospc_check;
4198 }
4199
4200 /*
4201  * Zero file bytes between startoff and endoff inclusive.
4202  * The iolock is held exclusive and no blocks are buffered.
4203  */
4204 STATIC int
4205 xfs_zero_remaining_bytes(
4206         xfs_inode_t             *ip,
4207         xfs_off_t               startoff,
4208         xfs_off_t               endoff)
4209 {
4210         xfs_bmbt_irec_t         imap;
4211         xfs_fileoff_t           offset_fsb;
4212         xfs_off_t               lastoffset;
4213         xfs_off_t               offset;
4214         xfs_buf_t               *bp;
4215         xfs_mount_t             *mp = ip->i_mount;
4216         int                     nimap;
4217         int                     error = 0;
4218
4219         bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
4220                                 ip->i_d.di_flags & XFS_DIFLAG_REALTIME ?
4221                                 mp->m_rtdev_targp : mp->m_ddev_targp);
4222
4223         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
4224                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
4225                 nimap = 1;
4226                 error = XFS_BMAPI(mp, NULL, &ip->i_iocore, offset_fsb, 1, 0,
4227                         NULL, 0, &imap, &nimap, NULL, NULL);
4228                 if (error || nimap < 1)
4229                         break;
4230                 ASSERT(imap.br_blockcount >= 1);
4231                 ASSERT(imap.br_startoff == offset_fsb);
4232                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
4233                 if (lastoffset > endoff)
4234                         lastoffset = endoff;
4235                 if (imap.br_startblock == HOLESTARTBLOCK)
4236                         continue;
4237                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4238                 if (imap.br_state == XFS_EXT_UNWRITTEN)
4239                         continue;
4240                 XFS_BUF_UNDONE(bp);
4241                 XFS_BUF_UNWRITE(bp);
4242                 XFS_BUF_READ(bp);
4243                 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
4244                 xfsbdstrat(mp, bp);
4245                 if ((error = xfs_iowait(bp))) {
4246                         xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
4247                                           mp, bp, XFS_BUF_ADDR(bp));
4248                         break;
4249                 }
4250                 memset(XFS_BUF_PTR(bp) +
4251                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
4252                       0, lastoffset - offset + 1);
4253                 XFS_BUF_UNDONE(bp);
4254                 XFS_BUF_UNREAD(bp);
4255                 XFS_BUF_WRITE(bp);
4256                 xfsbdstrat(mp, bp);
4257                 if ((error = xfs_iowait(bp))) {
4258                         xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
4259                                           mp, bp, XFS_BUF_ADDR(bp));
4260                         break;
4261                 }
4262         }
4263         xfs_buf_free(bp);
4264         return error;
4265 }
4266
4267 /*
4268  * xfs_free_file_space()
4269  *      This routine frees disk space for the given file.
4270  *
4271  *      This routine is only called by xfs_change_file_space
4272  *      for an UNRESVSP type call.
4273  *
4274  * RETURNS:
4275  *       0 on success
4276  *      errno on error
4277  *
4278  */
4279 STATIC int
4280 xfs_free_file_space(
4281         xfs_inode_t             *ip,
4282         xfs_off_t               offset,
4283         xfs_off_t               len,
4284         int                     attr_flags)
4285 {
4286         bhv_vnode_t             *vp;
4287         int                     committed;
4288         int                     done;
4289         xfs_off_t               end_dmi_offset;
4290         xfs_fileoff_t           endoffset_fsb;
4291         int                     error;
4292         xfs_fsblock_t           firstfsb;
4293         xfs_bmap_free_t         free_list;
4294         xfs_bmbt_irec_t         imap;
4295         xfs_off_t               ioffset;
4296         xfs_extlen_t            mod=0;
4297         xfs_mount_t             *mp;
4298         int                     nimap;
4299         uint                    resblks;
4300         uint                    rounding;
4301         int                     rt;
4302         xfs_fileoff_t           startoffset_fsb;
4303         xfs_trans_t             *tp;
4304         int                     need_iolock = 1;
4305
4306         vp = XFS_ITOV(ip);
4307         mp = ip->i_mount;
4308
4309         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4310
4311         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4312                 return error;
4313
4314         error = 0;
4315         if (len <= 0)   /* if nothing being freed */
4316                 return error;
4317         rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
4318         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
4319         end_dmi_offset = offset + len;
4320         endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
4321
4322         if (offset < ip->i_size && (attr_flags & ATTR_DMI) == 0 &&
4323             DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
4324                 if (end_dmi_offset > ip->i_size)
4325                         end_dmi_offset = ip->i_size;
4326                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
4327                                 offset, end_dmi_offset - offset,
4328                                 AT_DELAY_FLAG(attr_flags), NULL);
4329                 if (error)
4330                         return error;
4331         }
4332
4333         if (attr_flags & ATTR_NOLOCK)
4334                 need_iolock = 0;
4335         if (need_iolock) {
4336                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
4337                 vn_iowait(vp);  /* wait for the completion of any pending DIOs */
4338         }
4339
4340         rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, NBPP);
4341         ioffset = offset & ~(rounding - 1);
4342
4343         if (VN_CACHED(vp) != 0) {
4344                 xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1,
4345                                 ctooff(offtoct(ioffset)), -1);
4346                 error = bhv_vop_flushinval_pages(vp, ctooff(offtoct(ioffset)),
4347                                 -1, FI_REMAPF_LOCKED);
4348                 if (error)
4349                         goto out_unlock_iolock;
4350         }
4351
4352         /*
4353          * Need to zero the stuff we're not freeing, on disk.
4354          * If its a realtime file & can't use unwritten extents then we
4355          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
4356          * will take care of it for us.
4357          */
4358         if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
4359                 nimap = 1;
4360                 error = XFS_BMAPI(mp, NULL, &ip->i_iocore, startoffset_fsb,
4361                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4362                 if (error)
4363                         goto out_unlock_iolock;
4364                 ASSERT(nimap == 0 || nimap == 1);
4365                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4366                         xfs_daddr_t     block;
4367
4368                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4369                         block = imap.br_startblock;
4370                         mod = do_div(block, mp->m_sb.sb_rextsize);
4371                         if (mod)
4372                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
4373                 }
4374                 nimap = 1;
4375                 error = XFS_BMAPI(mp, NULL, &ip->i_iocore, endoffset_fsb - 1,
4376                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4377                 if (error)
4378                         goto out_unlock_iolock;
4379                 ASSERT(nimap == 0 || nimap == 1);
4380                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4381                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4382                         mod++;
4383                         if (mod && (mod != mp->m_sb.sb_rextsize))
4384                                 endoffset_fsb -= mod;
4385                 }
4386         }
4387         if ((done = (endoffset_fsb <= startoffset_fsb)))
4388                 /*
4389                  * One contiguous piece to clear
4390                  */
4391                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
4392         else {
4393                 /*
4394                  * Some full blocks, possibly two pieces to clear
4395                  */
4396                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
4397                         error = xfs_zero_remaining_bytes(ip, offset,
4398                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
4399                 if (!error &&
4400                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
4401                         error = xfs_zero_remaining_bytes(ip,
4402                                 XFS_FSB_TO_B(mp, endoffset_fsb),
4403                                 offset + len - 1);
4404         }
4405
4406         /*
4407          * free file space until done or until there is an error
4408          */
4409         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4410         while (!error && !done) {
4411
4412                 /*
4413                  * allocate and setup the transaction. Allow this
4414                  * transaction to dip into the reserve blocks to ensure
4415                  * the freeing of the space succeeds at ENOSPC.
4416                  */
4417                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4418                 tp->t_flags |= XFS_TRANS_RESERVE;
4419                 error = xfs_trans_reserve(tp,
4420                                           resblks,
4421                                           XFS_WRITE_LOG_RES(mp),
4422                                           0,
4423                                           XFS_TRANS_PERM_LOG_RES,
4424                                           XFS_WRITE_LOG_COUNT);
4425
4426                 /*
4427                  * check for running out of space
4428                  */
4429                 if (error) {
4430                         /*
4431                          * Free the transaction structure.
4432                          */
4433                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4434                         xfs_trans_cancel(tp, 0);
4435                         break;
4436                 }
4437                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4438                 error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
4439                                 ip->i_udquot, ip->i_gdquot, resblks, 0,
4440                                 XFS_QMOPT_RES_REGBLKS);
4441                 if (error)
4442                         goto error1;
4443
4444                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4445                 xfs_trans_ihold(tp, ip);
4446
4447                 /*
4448                  * issue the bunmapi() call to free the blocks
4449                  */
4450                 XFS_BMAP_INIT(&free_list, &firstfsb);
4451                 error = XFS_BUNMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
4452                                   endoffset_fsb - startoffset_fsb,
4453                                   0, 2, &firstfsb, &free_list, NULL, &done);
4454                 if (error) {
4455                         goto error0;
4456                 }
4457
4458                 /*
4459                  * complete the transaction
4460                  */
4461                 error = xfs_bmap_finish(&tp, &free_list, &committed);
4462                 if (error) {
4463                         goto error0;
4464                 }
4465
4466                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
4467                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4468         }
4469
4470  out_unlock_iolock:
4471         if (need_iolock)
4472                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
4473         return error;
4474
4475  error0:
4476         xfs_bmap_cancel(&free_list);
4477  error1:
4478         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4479         xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
4480                     XFS_ILOCK_EXCL);
4481         return error;
4482 }
4483
4484 /*
4485  * xfs_change_file_space()
4486  *      This routine allocates or frees disk space for the given file.
4487  *      The user specified parameters are checked for alignment and size
4488  *      limitations.
4489  *
4490  * RETURNS:
4491  *       0 on success
4492  *      errno on error
4493  *
4494  */
4495 int
4496 xfs_change_file_space(
4497         bhv_desc_t      *bdp,
4498         int             cmd,
4499         xfs_flock64_t   *bf,
4500         xfs_off_t       offset,
4501         cred_t          *credp,
4502         int             attr_flags)
4503 {
4504         int             clrprealloc;
4505         int             error;
4506         xfs_fsize_t     fsize;
4507         xfs_inode_t     *ip;
4508         xfs_mount_t     *mp;
4509         int             setprealloc;
4510         xfs_off_t       startoffset;
4511         xfs_off_t       llen;
4512         xfs_trans_t     *tp;
4513         bhv_vattr_t     va;
4514         bhv_vnode_t     *vp;
4515
4516         vp = BHV_TO_VNODE(bdp);
4517         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4518
4519         ip = XFS_BHVTOI(bdp);
4520         mp = ip->i_mount;
4521
4522         /*
4523          * must be a regular file and have write permission
4524          */
4525         if (!VN_ISREG(vp))
4526                 return XFS_ERROR(EINVAL);
4527
4528         xfs_ilock(ip, XFS_ILOCK_SHARED);
4529
4530         if ((error = xfs_iaccess(ip, S_IWUSR, credp))) {
4531                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
4532                 return error;
4533         }
4534
4535         xfs_iunlock(ip, XFS_ILOCK_SHARED);
4536
4537         switch (bf->l_whence) {
4538         case 0: /*SEEK_SET*/
4539                 break;
4540         case 1: /*SEEK_CUR*/
4541                 bf->l_start += offset;
4542                 break;
4543         case 2: /*SEEK_END*/
4544                 bf->l_start += ip->i_size;
4545                 break;
4546         default:
4547                 return XFS_ERROR(EINVAL);
4548         }
4549
4550         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
4551
4552         if (   (bf->l_start < 0)
4553             || (bf->l_start > XFS_MAXIOFFSET(mp))
4554             || (bf->l_start + llen < 0)
4555             || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
4556                 return XFS_ERROR(EINVAL);
4557
4558         bf->l_whence = 0;
4559
4560         startoffset = bf->l_start;
4561         fsize = ip->i_size;
4562
4563         /*
4564          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
4565          * file space.
4566          * These calls do NOT zero the data space allocated to the file,
4567          * nor do they change the file size.
4568          *
4569          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
4570          * space.
4571          * These calls cause the new file data to be zeroed and the file
4572          * size to be changed.
4573          */
4574         setprealloc = clrprealloc = 0;
4575
4576         switch (cmd) {
4577         case XFS_IOC_RESVSP:
4578         case XFS_IOC_RESVSP64:
4579                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
4580                                                                 1, attr_flags);
4581                 if (error)
4582                         return error;
4583                 setprealloc = 1;
4584                 break;
4585
4586         case XFS_IOC_UNRESVSP:
4587         case XFS_IOC_UNRESVSP64:
4588                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
4589                                                                 attr_flags)))
4590                         return error;
4591                 break;
4592
4593         case XFS_IOC_ALLOCSP:
4594         case XFS_IOC_ALLOCSP64:
4595         case XFS_IOC_FREESP:
4596         case XFS_IOC_FREESP64:
4597                 if (startoffset > fsize) {
4598                         error = xfs_alloc_file_space(ip, fsize,
4599                                         startoffset - fsize, 0, attr_flags);
4600                         if (error)
4601                                 break;
4602                 }
4603
4604                 va.va_mask = XFS_AT_SIZE;
4605                 va.va_size = startoffset;
4606
4607                 error = xfs_setattr(bdp, &va, attr_flags, credp);
4608
4609                 if (error)
4610                         return error;
4611
4612                 clrprealloc = 1;
4613                 break;
4614
4615         default:
4616                 ASSERT(0);
4617                 return XFS_ERROR(EINVAL);
4618         }
4619
4620         /*
4621          * update the inode timestamp, mode, and prealloc flag bits
4622          */
4623         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
4624
4625         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
4626                                       0, 0, 0))) {
4627                 /* ASSERT(0); */
4628                 xfs_trans_cancel(tp, 0);
4629                 return error;
4630         }
4631
4632         xfs_ilock(ip, XFS_ILOCK_EXCL);
4633
4634         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4635         xfs_trans_ihold(tp, ip);
4636
4637         if ((attr_flags & ATTR_DMI) == 0) {
4638                 ip->i_d.di_mode &= ~S_ISUID;
4639
4640                 /*
4641                  * Note that we don't have to worry about mandatory
4642                  * file locking being disabled here because we only
4643                  * clear the S_ISGID bit if the Group execute bit is
4644                  * on, but if it was on then mandatory locking wouldn't
4645                  * have been enabled.
4646                  */
4647                 if (ip->i_d.di_mode & S_IXGRP)
4648                         ip->i_d.di_mode &= ~S_ISGID;
4649
4650                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
4651         }
4652         if (setprealloc)
4653                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
4654         else if (clrprealloc)
4655                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
4656
4657         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
4658         xfs_trans_set_sync(tp);
4659
4660         error = xfs_trans_commit(tp, 0);
4661
4662         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4663
4664         return error;
4665 }
4666
4667 bhv_vnodeops_t xfs_vnodeops = {
4668         BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
4669         .vop_open               = xfs_open,
4670         .vop_read               = xfs_read,
4671 #ifdef HAVE_SPLICE
4672         .vop_splice_read        = xfs_splice_read,
4673         .vop_splice_write       = xfs_splice_write,
4674 #endif
4675         .vop_write              = xfs_write,
4676         .vop_ioctl              = xfs_ioctl,
4677         .vop_getattr            = xfs_getattr,
4678         .vop_setattr            = xfs_setattr,
4679         .vop_access             = xfs_access,
4680         .vop_lookup             = xfs_lookup,
4681         .vop_create             = xfs_create,
4682         .vop_remove             = xfs_remove,
4683         .vop_link               = xfs_link,
4684         .vop_rename             = xfs_rename,
4685         .vop_mkdir              = xfs_mkdir,
4686         .vop_rmdir              = xfs_rmdir,
4687         .vop_readdir            = xfs_readdir,
4688         .vop_symlink            = xfs_symlink,
4689         .vop_readlink           = xfs_readlink,
4690         .vop_fsync              = xfs_fsync,
4691         .vop_inactive           = xfs_inactive,
4692         .vop_fid2               = xfs_fid2,
4693         .vop_rwlock             = xfs_rwlock,
4694         .vop_rwunlock           = xfs_rwunlock,
4695         .vop_bmap               = xfs_bmap,
4696         .vop_reclaim            = xfs_reclaim,
4697         .vop_attr_get           = xfs_attr_get,
4698         .vop_attr_set           = xfs_attr_set,
4699         .vop_attr_remove        = xfs_attr_remove,
4700         .vop_attr_list          = xfs_attr_list,
4701         .vop_link_removed       = (vop_link_removed_t)fs_noval,
4702         .vop_vnode_change       = (vop_vnode_change_t)fs_noval,
4703         .vop_tosspages          = fs_tosspages,
4704         .vop_flushinval_pages   = fs_flushinval_pages,
4705         .vop_flush_pages        = fs_flush_pages,
4706         .vop_release            = xfs_release,
4707         .vop_iflush             = xfs_inode_flush,
4708 };