2 * aops.c - NTFS kernel address space operations and page cache handling.
3 * Part of the Linux-NTFS project.
5 * Copyright (c) 2001-2005 Anton Altaparmakov
6 * Copyright (c) 2002 Richard Russon
8 * This program/include file is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as published
10 * by the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program/include file is distributed in the hope that it will be
14 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
15 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program (in the main directory of the Linux-NTFS
20 * distribution in the file COPYING); if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 #include <linux/errno.h>
26 #include <linux/pagemap.h>
27 #include <linux/swap.h>
28 #include <linux/buffer_head.h>
29 #include <linux/writeback.h>
41 * ntfs_end_buffer_async_read - async io completion for reading attributes
42 * @bh: buffer head on which io is completed
43 * @uptodate: whether @bh is now uptodate or not
45 * Asynchronous I/O completion handler for reading pages belonging to the
46 * attribute address space of an inode. The inodes can either be files or
47 * directories or they can be fake inodes describing some attribute.
49 * If NInoMstProtected(), perform the post read mst fixups when all IO on the
50 * page has been completed and mark the page uptodate or set the error bit on
51 * the page. To determine the size of the records that need fixing up, we
52 * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs
53 * record size, and index_block_size_bits, to the log(base 2) of the ntfs
56 static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
/*
 * NOTE(review): this dump is missing lines (the embedded numbering skips), so
 * several declarations, braces and statements are not visible here.  Code
 * tokens below are verbatim; only comments were added or corrected.
 */
58 static DEFINE_SPINLOCK(page_uptodate_lock);
60 struct buffer_head *tmp;
63 int page_uptodate = 1;
66 ni = NTFS_I(page->mapping->host);
68 if (likely(uptodate)) {
69 s64 file_ofs, initialized_size;
71 set_buffer_uptodate(bh);
/* Byte offset of this buffer within the attribute address space. */
73 file_ofs = ((s64)page->index << PAGE_CACHE_SHIFT) +
/* Snapshot initialized_size under the size lock. */
75 read_lock_irqsave(&ni->size_lock, flags);
76 initialized_size = ni->initialized_size;
77 read_unlock_irqrestore(&ni->size_lock, flags);
78 /* Check for the current buffer head overflowing. */
79 if (file_ofs + bh->b_size > initialized_size) {
83 if (file_ofs < initialized_size)
84 ofs = initialized_size - file_ofs;
/* Zero the part of the buffer lying beyond initialized_size. */
85 addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
86 memset(addr + bh_offset(bh) + ofs, 0, bh->b_size - ofs);
87 flush_dcache_page(page);
88 kunmap_atomic(addr, KM_BIO_SRC_IRQ);
/* I/O failed: mark the buffer not uptodate and log the bad block. */
91 clear_buffer_uptodate(bh);
92 ntfs_error(ni->vol->sb, "Buffer I/O error, logical block %llu.",
93 (unsigned long long)bh->b_blocknr);
/*
 * Walk all buffers on the page under page_uptodate_lock to determine
 * whether any async reads are still in flight or any buffer failed.
 */
96 spin_lock_irqsave(&page_uptodate_lock, flags);
97 clear_buffer_async_read(bh);
101 if (!buffer_uptodate(tmp))
103 if (buffer_async_read(tmp)) {
104 if (likely(buffer_locked(tmp)))
106 /* Async buffers must be locked. */
109 tmp = tmp->b_this_page;
111 spin_unlock_irqrestore(&page_uptodate_lock, flags);
113 * If none of the buffers had errors then we can set the page uptodate,
114 * but we first have to perform the post read mst fixups, if the
115 * attribute is mst protected, i.e. if NInoMstProtected(ni) is true.
116 * Note we ignore fixup errors as those are detected when
117 * map_mft_record() is called which gives us per record granularity
118 * rather than per page granularity.
120 if (!NInoMstProtected(ni)) {
121 if (likely(page_uptodate && !PageError(page)))
122 SetPageUptodate(page);
125 unsigned int i, recs;
/*
 * MST protected: the record size was stashed in
 * itype.index.block_size (see the kernel-doc above).
 */
128 rec_size = ni->itype.index.block_size;
129 recs = PAGE_CACHE_SIZE / rec_size;
130 /* Should have been verified before we got here... */
/* Apply the post read mst fixups to every record in the page. */
132 addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
133 for (i = 0; i < recs; i++)
134 post_read_mst_fixup((NTFS_RECORD*)(addr +
135 i * rec_size), rec_size);
136 flush_dcache_page(page);
137 kunmap_atomic(addr, KM_BIO_SRC_IRQ);
138 if (likely(page_uptodate && !PageError(page)))
139 SetPageUptodate(page);
/* Still-busy path: more async reads outstanding on this page. */
144 spin_unlock_irqrestore(&page_uptodate_lock, flags);
149 * ntfs_read_block - fill a @page of an address space with data
150 * @page: page cache page to fill with data
152 * Fill the page @page of the address space belonging to the @page->host inode.
153 * We read each buffer asynchronously and when all buffers are read in, our io
154 * completion handler ntfs_end_buffer_async_read(), if required, automatically
155 * applies the mst fixups to the page before finally marking it uptodate and
158 * We only enforce allocated_size limit because i_size is checked for in
159 * generic_file_read().
161 * Return 0 on success and -errno on error.
163 * Contains an adapted version of fs/buffer.c::block_read_full_page().
165 static int ntfs_read_block(struct page *page)
/*
 * NOTE(review): lines are missing from this dump (the embedded numbering
 * skips), so declarations such as ni, vol, rl, lcn, vcn, err, i, nr and
 * several braces are not visible.  Code tokens below are verbatim; only
 * comments were added.
 */
172 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
173 sector_t iblock, lblock, zblock;
175 unsigned int blocksize, vcn_ofs;
177 unsigned char blocksize_bits;
179 ni = NTFS_I(page->mapping->host);
182 /* $MFT/$DATA must have its complete runlist in memory at all times. */
183 BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni));
185 blocksize_bits = VFS_I(ni)->i_blkbits;
186 blocksize = 1 << blocksize_bits;
/* Attach (clean, unmapped) buffers to the page if it has none yet. */
188 if (!page_has_buffers(page))
189 create_empty_buffers(page, blocksize, 0);
190 bh = head = page_buffers(page);
/* First block of the page, and the limits derived from the sizes. */
196 iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
197 read_lock_irqsave(&ni->size_lock, flags);
/* lblock: first block beyond allocated_size; zblock: beyond initialized_size. */
198 lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
199 zblock = (ni->initialized_size + blocksize - 1) >> blocksize_bits;
200 read_unlock_irqrestore(&ni->size_lock, flags);
202 /* Loop through all the buffers in the page. */
209 if (unlikely(buffer_uptodate(bh)))
211 if (unlikely(buffer_mapped(bh))) {
216 bh->b_bdev = vol->sb->s_bdev;
217 /* Is the block within the allowed limits? */
218 if (iblock < lblock) {
219 BOOL is_retry = FALSE;
221 /* Convert iblock into corresponding vcn and offset. */
222 vcn = (VCN)iblock << blocksize_bits >>
223 vol->cluster_size_bits;
224 vcn_ofs = ((VCN)iblock << blocksize_bits) &
225 vol->cluster_size_mask;
/* Take the runlist lock; released (and possibly retaken) below. */
228 down_read(&ni->runlist.lock);
231 if (likely(rl != NULL)) {
232 /* Seek to element containing target vcn. */
233 while (rl->length && rl[1].vcn <= vcn)
235 lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
/* No runlist cached yet: force the remap path below. */
237 lcn = LCN_RL_NOT_MAPPED;
238 /* Successful remap. */
240 /* Setup buffer head to correct block. */
241 bh->b_blocknr = ((lcn << vol->cluster_size_bits)
242 + vcn_ofs) >> blocksize_bits;
243 set_buffer_mapped(bh);
244 /* Only read initialized data blocks. */
245 if (iblock < zblock) {
249 /* Fully non-initialized data block, zero it. */
252 /* It is a hole, need to zero it. */
255 /* If first try and runlist unmapped, map and retry. */
256 if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
259 * Attempt to map runlist, dropping lock for
262 up_read(&ni->runlist.lock);
263 err = ntfs_map_runlist(ni, vcn);
265 goto lock_retry_remap;
268 up_read(&ni->runlist.lock);
270 * If buffer is outside the runlist, treat it as a
271 * hole. This can happen due to concurrent truncate
274 if (err == -ENOENT || lcn == LCN_ENOENT) {
278 /* Hard error, zero out region. */
283 ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
284 "attribute type 0x%x, vcn 0x%llx, "
285 "offset 0x%x because its location on "
286 "disk could not be determined%s "
287 "(error code %i).", ni->mft_no,
288 ni->type, (unsigned long long)vcn,
289 vcn_ofs, is_retry ? " even after "
290 "retrying" : "", err);
293 * Either iblock was outside lblock limits or
294 * ntfs_rl_vcn_to_lcn() returned error. Just zero that portion
295 * of the page and set the buffer uptodate.
298 bh->b_blocknr = -1UL;
299 clear_buffer_mapped(bh);
301 kaddr = kmap_atomic(page, KM_USER0);
302 memset(kaddr + i * blocksize, 0, blocksize);
303 kunmap_atomic(kaddr, KM_USER0);
304 flush_dcache_page(page);
306 set_buffer_uptodate(bh);
307 } while (i++, iblock++, (bh = bh->b_this_page) != head);
309 /* Release the lock if we took it. */
311 up_read(&ni->runlist.lock);
313 /* Check we have at least one buffer ready for i/o. */
315 struct buffer_head *tbh;
317 /* Lock the buffers. */
318 for (i = 0; i < nr; i++) {
/* Completion handler applies mst fixups and page uptodate handling. */
321 tbh->b_end_io = ntfs_end_buffer_async_read;
322 set_buffer_async_read(tbh);
324 /* Finally, start i/o on the buffers. */
325 for (i = 0; i < nr; i++) {
327 if (likely(!buffer_uptodate(tbh)))
328 submit_bh(READ, tbh);
/* Buffer became uptodate meanwhile: complete it by hand. */
330 ntfs_end_buffer_async_read(tbh, 1);
334 /* No i/o was scheduled on any of the buffers. */
335 if (likely(!PageError(page)))
336 SetPageUptodate(page);
337 else /* Signal synchronous i/o error. */
344 * ntfs_readpage - fill a @page of a @file with data from the device
345 * @file: open file to which the page @page belongs or NULL
346 * @page: page cache page to fill with data
348 * For non-resident attributes, ntfs_readpage() fills the @page of the open
349 * file @file by calling the ntfs version of the generic block_read_full_page()
350 * function, ntfs_read_block(), which in turn creates and reads in the buffers
351 * associated with the page asynchronously.
353 * For resident attributes, OTOH, ntfs_readpage() fills @page by copying the
354 * data from the mft record (which at this stage is most likely in memory) and
355 * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as
356 * even if the mft record is not cached at this point in time, we need to wait
357 * for it to be read in before we can do the copy.
359 * Return 0 on success and -errno on error.
361 static int ntfs_readpage(struct file *file, struct page *page)
/*
 * NOTE(review): this dump is missing lines (the embedded numbering skips);
 * declarations (kaddr, mrec, attr_len, flags, err), error labels and braces
 * are not all visible.  Code tokens below are verbatim; comments only added.
 */
363 ntfs_inode *ni, *base_ni;
365 ntfs_attr_search_ctx *ctx;
372 BUG_ON(!PageLocked(page));
374 * This can potentially happen because we clear PageUptodate() during
375 * ntfs_writepage() of MstProtected() attributes.
377 if (PageUptodate(page)) {
381 ni = NTFS_I(page->mapping->host);
383 * Only $DATA attributes can be encrypted and only unnamed $DATA
384 * attributes can be compressed. Index root can have the flags set but
385 * this means to create compressed/encrypted files, not that the
386 * attribute is compressed/encrypted.
388 if (ni->type != AT_INDEX_ROOT) {
389 /* If attribute is encrypted, deny access, just like NT4. */
390 if (NInoEncrypted(ni)) {
391 BUG_ON(ni->type != AT_DATA);
395 /* Compressed data streams are handled in compress.c. */
396 if (NInoNonResident(ni) && NInoCompressed(ni)) {
397 BUG_ON(ni->type != AT_DATA);
398 BUG_ON(ni->name_len);
399 return ntfs_read_compressed_block(page);
402 /* NInoNonResident() == NInoIndexAllocPresent() */
403 if (NInoNonResident(ni)) {
404 /* Normal, non-resident data stream. */
405 return ntfs_read_block(page);
408 * Attribute is resident, implying it is not compressed or encrypted.
409 * This also means the attribute is smaller than an mft record and
410 * hence smaller than a page, so can simply zero out any pages with
411 * index above 0. Note the attribute can actually be marked compressed
412 * but if it is resident the actual data is not compressed so we are
413 * ok to ignore the compressed flag here.
415 if (unlikely(page->index > 0)) {
416 kaddr = kmap_atomic(page, KM_USER0);
417 memset(kaddr, 0, PAGE_CACHE_SIZE);
418 flush_dcache_page(page);
419 kunmap_atomic(kaddr, KM_USER0);
/* For extent inodes, work on the base inode's mft record. */
425 base_ni = ni->ext.base_ntfs_ino;
426 /* Map, pin, and lock the mft record. */
427 mrec = map_mft_record(base_ni);
433 * If a parallel write made the attribute non-resident, drop the mft
434 * record and retry the readpage.
436 if (unlikely(NInoNonResident(ni))) {
437 unmap_mft_record(base_ni);
440 ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
441 if (unlikely(!ctx)) {
445 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
446 CASE_SENSITIVE, 0, NULL, 0, ctx);
448 goto put_unm_err_out;
/* Clamp the copy length to the current initialized size. */
449 attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
450 read_lock_irqsave(&ni->size_lock, flags);
451 if (unlikely(attr_len > ni->initialized_size))
452 attr_len = ni->initialized_size;
453 read_unlock_irqrestore(&ni->size_lock, flags);
454 kaddr = kmap_atomic(page, KM_USER0);
455 /* Copy the data to the page. */
456 memcpy(kaddr, (u8*)ctx->attr +
457 le16_to_cpu(ctx->attr->data.resident.value_offset),
459 /* Zero the remainder of the page. */
460 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
461 flush_dcache_page(page);
462 kunmap_atomic(kaddr, KM_USER0);
/* Release the search context and the mft record in reverse order. */
464 ntfs_attr_put_search_ctx(ctx);
466 unmap_mft_record(base_ni);
468 SetPageUptodate(page);
477 * ntfs_write_block - write a @page to the backing store
478 * @page: page cache page to write out
479 * @wbc: writeback control structure
481 * This function is for writing pages belonging to non-resident, non-mst
482 * protected attributes to their backing store.
484 * For a page with buffers, map and write the dirty buffers asynchronously
485 * under page writeback. For a page without buffers, create buffers for the
486 * page, then proceed as above.
488 * If a page doesn't have buffers the page dirty state is definitive. If a page
489 * does have buffers, the page dirty state is just a hint, and the buffer dirty
490 * state is definitive. (A hint which has rules: dirty buffers against a clean
491 * page is illegal. Other combinations are legal and need to be handled. In
492 * particular a dirty page containing clean buffers for example.)
494 * Return 0 on success and -errno on error.
496 * Based on ntfs_read_block() and __block_write_full_page().
498 static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
/*
 * NOTE(review): this dump is missing lines (the embedded numbering skips);
 * declarations (ni, vol, rl, vcn, lcn, i_size, kaddr, err, flags), several
 * braces, labels and `do {` openers are not visible.  Code tokens below are
 * verbatim; only comments were added.
 */
502 s64 initialized_size;
504 sector_t block, dblock, iblock;
509 struct buffer_head *bh, *head;
511 unsigned int blocksize, vcn_ofs;
513 BOOL need_end_writeback;
514 unsigned char blocksize_bits;
516 vi = page->mapping->host;
520 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
521 "0x%lx.", ni->mft_no, ni->type, page->index);
/* This path is only for non-resident, non-mst-protected attributes. */
523 BUG_ON(!NInoNonResident(ni));
524 BUG_ON(NInoMstProtected(ni));
526 blocksize_bits = vi->i_blkbits;
527 blocksize = 1 << blocksize_bits;
529 if (!page_has_buffers(page)) {
530 BUG_ON(!PageUptodate(page));
531 create_empty_buffers(page, blocksize,
532 (1 << BH_Uptodate) | (1 << BH_Dirty));
534 bh = head = page_buffers(page);
/* Buffer allocation failed: defer the write rather than losing it. */
536 ntfs_warning(vol->sb, "Error allocating page buffers. "
537 "Redirtying page so we try again later.");
539 * Put the page back on mapping->dirty_pages, but leave its
540 * buffer's dirty state as-is.
542 redirty_page_for_writepage(wbc, page);
547 /* NOTE: Different naming scheme to ntfs_read_block()! */
549 /* The first block in the page. */
550 block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
552 read_lock_irqsave(&ni->size_lock, flags);
553 i_size = i_size_read(vi);
554 initialized_size = ni->initialized_size;
555 read_unlock_irqrestore(&ni->size_lock, flags);
557 /* The first out of bounds block for the data size. */
558 dblock = (i_size + blocksize - 1) >> blocksize_bits;
560 /* The last (fully or partially) initialized block. */
561 iblock = initialized_size >> blocksize_bits;
564 * Be very careful. We have no exclusion from __set_page_dirty_buffers
565 * here, and the (potentially unmapped) buffers may become dirty at
566 * any time. If a buffer becomes dirty here after we've inspected it
567 * then we just miss that fact, and the page stays dirty.
569 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
570 * handle that here by just cleaning them.
574 * Loop through all the buffers in the page, mapping all the dirty
575 * buffers to disk addresses and handling any aliases from the
576 * underlying block device's mapping.
581 BOOL is_retry = FALSE;
583 if (unlikely(block >= dblock)) {
585 * Mapped buffers outside i_size will occur, because
586 * this page can be outside i_size when there is a
587 * truncate in progress. The contents of such buffers
588 * were zeroed by ntfs_writepage().
590 * FIXME: What about the small race window where
591 * ntfs_writepage() has not done any clearing because
592 * the page was within i_size but before we get here,
593 * vmtruncate() modifies i_size?
595 clear_buffer_dirty(bh);
596 set_buffer_uptodate(bh);
600 /* Clean buffers are not written out, so no need to map them. */
601 if (!buffer_dirty(bh))
604 /* Make sure we have enough initialized size. */
605 if (unlikely((block >= iblock) &&
606 (initialized_size < i_size))) {
608 * If this page is fully outside initialized size, zero
609 * out all pages between the current initialized size
610 * and the current page. Just use ntfs_readpage() to do
611 * the zeroing transparently.
613 if (block > iblock) {
616 // - read_cache_page()
617 // Again for each page do:
618 // - wait_on_page_locked()
619 // - Check (PageUptodate(page) &&
621 // Update initialized size in the attribute and
623 // Again, for each page do:
624 // __set_page_dirty_buffers();
625 // page_cache_release()
626 // We don't need to wait on the writes.
630 * The current page straddles initialized size. Zero
631 * all non-uptodate buffers and set them uptodate (and
632 * dirty?). Note, there aren't any non-uptodate buffers
633 * if the page is uptodate.
634 * FIXME: For an uptodate page, the buffers may need to
635 * be written out because they were not initialized on
638 if (!PageUptodate(page)) {
640 // Zero any non-uptodate buffers up to i_size.
641 // Set them uptodate and dirty.
644 // Update initialized size in the attribute and in the
645 // inode (up to i_size).
647 // FIXME: This is inefficient. Try to batch the two
648 // size changes to happen in one go.
649 ntfs_error(vol->sb, "Writing beyond initialized size "
650 "is not supported yet. Sorry.");
653 // Do NOT set_buffer_new() BUT DO clear buffer range
654 // outside write request range.
655 // set_buffer_uptodate() on complete buffers as well as
656 // set_buffer_dirty().
659 /* No need to map buffers that are already mapped. */
660 if (buffer_mapped(bh))
663 /* Unmapped, dirty buffer. Need to map it. */
664 bh->b_bdev = vol->sb->s_bdev;
666 /* Convert block into corresponding vcn and offset. */
667 vcn = (VCN)block << blocksize_bits;
668 vcn_ofs = vcn & vol->cluster_size_mask;
669 vcn >>= vol->cluster_size_bits;
/* Take the runlist lock; released (and possibly retaken) below. */
672 down_read(&ni->runlist.lock);
675 if (likely(rl != NULL)) {
676 /* Seek to element containing target vcn. */
677 while (rl->length && rl[1].vcn <= vcn)
679 lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
/* No runlist cached yet: force the remap path below. */
681 lcn = LCN_RL_NOT_MAPPED;
682 /* Successful remap. */
684 /* Setup buffer head to point to correct block. */
685 bh->b_blocknr = ((lcn << vol->cluster_size_bits) +
686 vcn_ofs) >> blocksize_bits;
687 set_buffer_mapped(bh);
690 /* It is a hole, need to instantiate it. */
691 if (lcn == LCN_HOLE) {
693 unsigned long *bpos, *bend;
695 /* Check if the buffer is zero. */
696 kaddr = kmap_atomic(page, KM_USER0);
697 bpos = (unsigned long *)(kaddr + bh_offset(bh));
698 bend = (unsigned long *)((u8*)bpos + blocksize);
702 } while (likely(++bpos < bend));
703 kunmap_atomic(kaddr, KM_USER0);
706 * Buffer is zero and sparse, no need to write
710 clear_buffer_dirty(bh);
713 // TODO: Instantiate the hole.
714 // clear_buffer_new(bh);
715 // unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
716 ntfs_error(vol->sb, "Writing into sparse regions is "
717 "not supported yet. Sorry.");
721 /* If first try and runlist unmapped, map and retry. */
722 if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
725 * Attempt to map runlist, dropping lock for
728 up_read(&ni->runlist.lock);
729 err = ntfs_map_runlist(ni, vcn);
731 goto lock_retry_remap;
734 up_read(&ni->runlist.lock);
736 * If buffer is outside the runlist, truncate has cut it out
737 * of the runlist. Just clean and clear the buffer and set it
738 * uptodate so it can get discarded by the VM.
740 if (err == -ENOENT || lcn == LCN_ENOENT) {
744 clear_buffer_dirty(bh);
745 kaddr = kmap_atomic(page, KM_USER0);
746 memset(kaddr + bh_offset(bh), 0, blocksize);
747 kunmap_atomic(kaddr, KM_USER0);
748 flush_dcache_page(page);
749 set_buffer_uptodate(bh);
753 /* Failed to map the buffer, even after retrying. */
757 ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
758 "attribute type 0x%x, vcn 0x%llx, offset 0x%x "
759 "because its location on disk could not be "
760 "determined%s (error code %i).", ni->mft_no,
761 ni->type, (unsigned long long)vcn,
762 vcn_ofs, is_retry ? " even after "
763 "retrying" : "", err);
765 } while (block++, (bh = bh->b_this_page) != head);
767 /* Release the lock if we took it. */
769 up_read(&ni->runlist.lock);
771 /* For the error case, need to reset bh to the beginning. */
774 /* Just an optimization, so ->readpage() is not called later. */
775 if (unlikely(!PageUptodate(page))) {
778 if (!buffer_uptodate(bh)) {
783 } while ((bh = bh->b_this_page) != head);
785 SetPageUptodate(page);
788 /* Setup all mapped, dirty buffers for async write i/o. */
790 if (buffer_mapped(bh) && buffer_dirty(bh)) {
792 if (test_clear_buffer_dirty(bh)) {
793 BUG_ON(!buffer_uptodate(bh));
794 mark_buffer_async_write(bh);
797 } else if (unlikely(err)) {
799 * For the error case. The buffer may have been set
800 * dirty during attachment to a dirty page.
803 clear_buffer_dirty(bh);
805 } while ((bh = bh->b_this_page) != head);
808 // TODO: Remove the -EOPNOTSUPP check later on...
809 if (unlikely(err == -EOPNOTSUPP))
811 else if (err == -ENOMEM) {
812 ntfs_warning(vol->sb, "Error allocating memory. "
813 "Redirtying page so we try again "
816 * Put the page back on mapping->dirty_pages, but
817 * leave its buffer's dirty state as-is.
819 redirty_page_for_writepage(wbc, page);
825 BUG_ON(PageWriteback(page));
826 set_page_writeback(page); /* Keeps try_to_free_buffers() away. */
828 /* Submit the prepared buffers for i/o. */
829 need_end_writeback = TRUE;
831 struct buffer_head *next = bh->b_this_page;
832 if (buffer_async_write(bh)) {
833 submit_bh(WRITE, bh);
834 need_end_writeback = FALSE;
837 } while (bh != head);
840 /* If no i/o was started, need to end_page_writeback(). */
841 if (unlikely(need_end_writeback))
842 end_page_writeback(page);
849 * ntfs_write_mst_block - write a @page to the backing store
850 * @page: page cache page to write out
851 * @wbc: writeback control structure
853 * This function is for writing pages belonging to non-resident, mst protected
854 * attributes to their backing store. The only supported attributes are index
855 * allocation and $MFT/$DATA. Both directory inodes and index inodes are
856 * supported for the index allocation case.
858 * The page must remain locked for the duration of the write because we apply
859 * the mst fixups, write, and then undo the fixups, so if we were to unlock the
860 * page before undoing the fixups, any other user of the page will see the
861 * page contents as corrupt.
863 * We clear the page uptodate flag for the duration of the function to ensure
864 * exclusion for the $MFT/$DATA case against someone mapping an mft record we
865 * are about to apply the mst fixups to.
867 * Return 0 on success and -errno on error.
869 * Based on ntfs_write_block(), ntfs_mft_writepage(), and
870 * write_mft_record_nolock().
872 static int ntfs_write_mst_block(struct page *page,
873 struct writeback_control *wbc)
875 sector_t block, dblock, rec_block;
876 struct inode *vi = page->mapping->host;
877 ntfs_inode *ni = NTFS_I(vi);
878 ntfs_volume *vol = ni->vol;
880 unsigned int rec_size = ni->itype.index.block_size;
881 ntfs_inode *locked_nis[PAGE_CACHE_SIZE / rec_size];
882 struct buffer_head *bh, *head, *tbh, *rec_start_bh;
883 struct buffer_head *bhs[MAX_BUF_PER_PAGE];
885 int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2;
886 unsigned bh_size, rec_size_bits;
887 BOOL sync, is_mft, page_is_dirty, rec_is_dirty;
888 unsigned char bh_size_bits;
890 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
891 "0x%lx.", vi->i_ino, ni->type, page->index);
892 BUG_ON(!NInoNonResident(ni));
893 BUG_ON(!NInoMstProtected(ni));
894 is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino);
896 * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page
897 * in its page cache were to be marked dirty. However this should
898 * never happen with the current driver and considering we do not
899 * handle this case here we do want to BUG(), at least for now.
901 BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
902 (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
903 bh_size_bits = vi->i_blkbits;
904 bh_size = 1 << bh_size_bits;
905 max_bhs = PAGE_CACHE_SIZE / bh_size;
907 BUG_ON(max_bhs > MAX_BUF_PER_PAGE);
909 /* Were we called for sync purposes? */
910 sync = (wbc->sync_mode == WB_SYNC_ALL);
912 /* Make sure we have mapped buffers. */
913 BUG_ON(!page_has_buffers(page));
914 bh = head = page_buffers(page);
917 rec_size_bits = ni->itype.index.block_size_bits;
918 BUG_ON(!(PAGE_CACHE_SIZE >> rec_size_bits));
919 bhs_per_rec = rec_size >> bh_size_bits;
920 BUG_ON(!bhs_per_rec);
922 /* The first block in the page. */
923 rec_block = block = (sector_t)page->index <<
924 (PAGE_CACHE_SHIFT - bh_size_bits);
926 /* The first out of bounds block for the data size. */
927 dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits;
930 err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0;
931 page_is_dirty = rec_is_dirty = FALSE;
934 BOOL is_retry = FALSE;
936 if (likely(block < rec_block)) {
937 if (unlikely(block >= dblock)) {
938 clear_buffer_dirty(bh);
939 set_buffer_uptodate(bh);
943 * This block is not the first one in the record. We
944 * ignore the buffer's dirty state because we could
945 * have raced with a parallel mark_ntfs_record_dirty().
949 if (unlikely(err2)) {
951 clear_buffer_dirty(bh);
954 } else /* if (block == rec_block) */ {
955 BUG_ON(block > rec_block);
956 /* This block is the first one in the record. */
957 rec_block += bhs_per_rec;
959 if (unlikely(block >= dblock)) {
960 clear_buffer_dirty(bh);
963 if (!buffer_dirty(bh)) {
964 /* Clean records are not written out. */
965 rec_is_dirty = FALSE;
971 /* Need to map the buffer if it is not mapped already. */
972 if (unlikely(!buffer_mapped(bh))) {
975 unsigned int vcn_ofs;
977 bh->b_bdev = vol->sb->s_bdev;
978 /* Obtain the vcn and offset of the current block. */
979 vcn = (VCN)block << bh_size_bits;
980 vcn_ofs = vcn & vol->cluster_size_mask;
981 vcn >>= vol->cluster_size_bits;
984 down_read(&ni->runlist.lock);
987 if (likely(rl != NULL)) {
988 /* Seek to element containing target vcn. */
989 while (rl->length && rl[1].vcn <= vcn)
991 lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
993 lcn = LCN_RL_NOT_MAPPED;
994 /* Successful remap. */
995 if (likely(lcn >= 0)) {
996 /* Setup buffer head to correct block. */
997 bh->b_blocknr = ((lcn <<
998 vol->cluster_size_bits) +
999 vcn_ofs) >> bh_size_bits;
1000 set_buffer_mapped(bh);
1003 * Remap failed. Retry to map the runlist once
1004 * unless we are working on $MFT which always
1005 * has the whole of its runlist in memory.
1007 if (!is_mft && !is_retry &&
1008 lcn == LCN_RL_NOT_MAPPED) {
1011 * Attempt to map runlist, dropping
1012 * lock for the duration.
1014 up_read(&ni->runlist.lock);
1015 err2 = ntfs_map_runlist(ni, vcn);
1017 goto lock_retry_remap;
1018 if (err2 == -ENOMEM)
1019 page_is_dirty = TRUE;
1024 up_read(&ni->runlist.lock);
1026 /* Hard error. Abort writing this record. */
1027 if (!err || err == -ENOMEM)
1030 ntfs_error(vol->sb, "Cannot write ntfs record "
1031 "0x%llx (inode 0x%lx, "
1032 "attribute type 0x%x) because "
1033 "its location on disk could "
1034 "not be determined (error "
1038 vol->mft_record_size_bits,
1039 ni->mft_no, ni->type,
1042 * If this is not the first buffer, remove the
1043 * buffers in this record from the list of
1044 * buffers to write and clear their dirty bit
1045 * if not error -ENOMEM.
1047 if (rec_start_bh != bh) {
1048 while (bhs[--nr_bhs] != rec_start_bh)
1050 if (err2 != -ENOMEM) {
1054 } while ((rec_start_bh =
1063 BUG_ON(!buffer_uptodate(bh));
1064 BUG_ON(nr_bhs >= max_bhs);
1066 } while (block++, (bh = bh->b_this_page) != head);
1068 up_read(&ni->runlist.lock);
1069 /* If there were no dirty buffers, we are done. */
1072 /* Map the page so we can access its contents. */
1074 /* Clear the page uptodate flag whilst the mst fixups are applied. */
1075 BUG_ON(!PageUptodate(page));
1076 ClearPageUptodate(page);
1077 for (i = 0; i < nr_bhs; i++) {
1080 /* Skip buffers which are not at the beginning of records. */
1081 if (i % bhs_per_rec)
1084 ofs = bh_offset(tbh);
1087 unsigned long mft_no;
1089 /* Get the mft record number. */
1090 mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
1092 /* Check whether to write this mft record. */
1094 if (!ntfs_may_write_mft_record(vol, mft_no,
1095 (MFT_RECORD*)(kaddr + ofs), &tni)) {
1097 * The record should not be written. This
1098 * means we need to redirty the page before
1101 page_is_dirty = TRUE;
1103 * Remove the buffers in this mft record from
1104 * the list of buffers to write.
1108 } while (++i % bhs_per_rec);
1112 * The record should be written. If a locked ntfs
1113 * inode was returned, add it to the array of locked
1117 locked_nis[nr_locked_nis++] = tni;
1119 /* Apply the mst protection fixups. */
1120 err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs),
1122 if (unlikely(err2)) {
1123 if (!err || err == -ENOMEM)
1125 ntfs_error(vol->sb, "Failed to apply mst fixups "
1126 "(inode 0x%lx, attribute type 0x%x, "
1127 "page index 0x%lx, page offset 0x%x)!"
1128 " Unmount and run chkdsk.", vi->i_ino,
1129 ni->type, page->index, ofs);
1131 * Mark all the buffers in this record clean as we do
1132 * not want to write corrupt data to disk.
1135 clear_buffer_dirty(bhs[i]);
1137 } while (++i % bhs_per_rec);
1142 /* If no records are to be written out, we are done. */
1145 flush_dcache_page(page);
1146 /* Lock buffers and start synchronous write i/o on them. */
1147 for (i = 0; i < nr_bhs; i++) {
1151 if (unlikely(test_set_buffer_locked(tbh)))
1153 /* The buffer dirty state is now irrelevant, just clean it. */
1154 clear_buffer_dirty(tbh);
1155 BUG_ON(!buffer_uptodate(tbh));
1156 BUG_ON(!buffer_mapped(tbh));
1158 tbh->b_end_io = end_buffer_write_sync;
1159 submit_bh(WRITE, tbh);
1161 /* Synchronize the mft mirror now if not @sync. */
1162 if (is_mft && !sync)
1165 /* Wait on i/o completion of buffers. */
1166 for (i = 0; i < nr_bhs; i++) {
1170 wait_on_buffer(tbh);
1171 if (unlikely(!buffer_uptodate(tbh))) {
1172 ntfs_error(vol->sb, "I/O error while writing ntfs "
1173 "record buffer (inode 0x%lx, "
1174 "attribute type 0x%x, page index "
1175 "0x%lx, page offset 0x%lx)! Unmount "
1176 "and run chkdsk.", vi->i_ino, ni->type,
1177 page->index, bh_offset(tbh));
1178 if (!err || err == -ENOMEM)
1181 * Set the buffer uptodate so the page and buffer
1182 * states do not become out of sync.
1184 set_buffer_uptodate(tbh);
1187 /* If @sync, now synchronize the mft mirror. */
1188 if (is_mft && sync) {
1190 for (i = 0; i < nr_bhs; i++) {
1191 unsigned long mft_no;
1195 * Skip buffers which are not at the beginning of
1198 if (i % bhs_per_rec)
1201 /* Skip removed buffers (and hence records). */
1204 ofs = bh_offset(tbh);
1205 /* Get the mft record number. */
1206 mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
1208 if (mft_no < vol->mftmirr_size)
1209 ntfs_sync_mft_mirror(vol, mft_no,
1210 (MFT_RECORD*)(kaddr + ofs),
1216 /* Remove the mst protection fixups again. */
1217 for (i = 0; i < nr_bhs; i++) {
1218 if (!(i % bhs_per_rec)) {
1222 post_write_mst_fixup((NTFS_RECORD*)(kaddr +
1226 flush_dcache_page(page);
1228 /* Unlock any locked inodes. */
1229 while (nr_locked_nis-- > 0) {
1230 ntfs_inode *tni, *base_tni;
1232 tni = locked_nis[nr_locked_nis];
1233 /* Get the base inode. */
1234 down(&tni->extent_lock);
1235 if (tni->nr_extents >= 0)
1238 base_tni = tni->ext.base_ntfs_ino;
1241 up(&tni->extent_lock);
1242 ntfs_debug("Unlocking %s inode 0x%lx.",
1243 tni == base_tni ? "base" : "extent",
1245 up(&tni->mrec_lock);
1246 atomic_dec(&tni->count);
1247 iput(VFS_I(base_tni));
1249 SetPageUptodate(page);
1252 if (unlikely(err && err != -ENOMEM)) {
1254 * Set page error if there is only one ntfs record in the page.
1255 * Otherwise we would lose per-record granularity.
1257 if (ni->itype.index.block_size == PAGE_CACHE_SIZE)
1261 if (page_is_dirty) {
1262 ntfs_debug("Page still contains one or more dirty ntfs "
1263 "records. Redirtying the page starting at "
1264 "record 0x%lx.", page->index <<
1265 (PAGE_CACHE_SHIFT - rec_size_bits));
1266 redirty_page_for_writepage(wbc, page);
1270 * Keep the VM happy. This must be done otherwise the
1271 * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
1272 * the page is clean.
1274 BUG_ON(PageWriteback(page));
1275 set_page_writeback(page);
1277 end_page_writeback(page);
1280 ntfs_debug("Done.");
1285 * ntfs_writepage - write a @page to the backing store
1286 * @page: page cache page to write out
1287 * @wbc: writeback control structure
1289 * This is called from the VM when it wants to have a dirty ntfs page cache
1290 * page cleaned. The VM has already locked the page and marked it clean.
1292 * For non-resident attributes, ntfs_writepage() writes the @page by calling
1293 * the ntfs version of the generic block_write_full_page() function,
1294 * ntfs_write_block(), which in turn if necessary creates and writes the
1295 * buffers associated with the page asynchronously.
1297 * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying
1298 * the data to the mft record (which at this stage is most likely in memory).
1299 * The mft record is then marked dirty and written out asynchronously via the
1300 * vfs inode dirty code path for the inode the mft record belongs to or via the
1301 * vm page dirty code path for the page the mft record is in.
1303 * Based on ntfs_readpage() and fs/buffer.c::block_write_full_page().
1305 * Return 0 on success and -errno on error.
1307 static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
1310 struct inode *vi = page->mapping->host;
1311 ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
1313 ntfs_attr_search_ctx *ctx = NULL;
1314 MFT_RECORD *m = NULL;
1319 BUG_ON(!PageLocked(page));
/* Sample i_size; i_size_read() gives a consistent snapshot without i_sem. */
1320 i_size = i_size_read(vi);
1321 /* Is the page fully outside i_size? (truncate in progress) */
1322 if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
1323 PAGE_CACHE_SHIFT)) {
1325 * The page may have dirty, unmapped buffers. Make them
1326 * freeable here, so the page does not leak.
1328 block_invalidatepage(page, 0);
1330 ntfs_debug("Write outside i_size - truncated?");
1334 * Only $DATA attributes can be encrypted and only unnamed $DATA
1335 * attributes can be compressed. Index root can have the flags set but
1336 * this means to create compressed/encrypted files, not that the
1337 * attribute is compressed/encrypted.
1339 if (ni->type != AT_INDEX_ROOT) {
1340 /* If file is encrypted, deny access, just like NT4. */
1341 if (NInoEncrypted(ni)) {
1343 BUG_ON(ni->type != AT_DATA);
1344 ntfs_debug("Denying write access to encrypted "
1348 /* Compressed data streams are handled in compress.c. */
1349 if (NInoNonResident(ni) && NInoCompressed(ni)) {
1350 BUG_ON(ni->type != AT_DATA);
1351 BUG_ON(ni->name_len);
1352 // TODO: Implement and replace this with
1353 // return ntfs_write_compressed_block(page);
1355 ntfs_error(vi->i_sb, "Writing to compressed files is "
1356 "not supported yet. Sorry.");
1359 // TODO: Implement and remove this check.
1360 if (NInoNonResident(ni) && NInoSparse(ni)) {
1362 ntfs_error(vi->i_sb, "Writing to sparse files is not "
1363 "supported yet. Sorry.");
1367 /* NInoNonResident() == NInoIndexAllocPresent() */
1368 if (NInoNonResident(ni)) {
1369 /* We have to zero every time due to mmap-at-end-of-file. */
1370 if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) {
1371 /* The page straddles i_size. */
1372 unsigned int ofs = i_size & ~PAGE_CACHE_MASK;
1373 kaddr = kmap_atomic(page, KM_USER0);
1374 memset(kaddr + ofs, 0, PAGE_CACHE_SIZE - ofs);
1375 flush_dcache_page(page);
1376 kunmap_atomic(kaddr, KM_USER0);
1378 /* Handle mst protected attributes. */
1379 if (NInoMstProtected(ni))
1380 return ntfs_write_mst_block(page, wbc);
1381 /* Normal, non-resident data stream. */
1382 return ntfs_write_block(page, wbc);
1385 * Attribute is resident, implying it is not compressed, encrypted, or
1386 * mst protected. This also means the attribute is smaller than an mft
1387 * record and hence smaller than a page, so can simply return error on
1388 * any pages with index above 0. Note the attribute can actually be
1389 * marked compressed but if it is resident the actual data is not
1390 * compressed so we are ok to ignore the compressed flag here.
1392 BUG_ON(page_has_buffers(page));
1393 BUG_ON(!PageUptodate(page));
1394 if (unlikely(page->index > 0)) {
1395 ntfs_error(vi->i_sb, "BUG()! page->index (0x%lx) > 0. "
1396 "Aborting write.", page->index);
1397 BUG_ON(PageWriteback(page));
1398 set_page_writeback(page);
1400 end_page_writeback(page);
/*
 * NOTE(review): ni->ext.base_ntfs_ino is only meaningful for
 * attribute/extent inodes; the guarding check is not visible in this
 * excerpt - confirm against the full source.
 */
1406 base_ni = ni->ext.base_ntfs_ino;
1407 /* Map, pin, and lock the mft record. */
1408 m = map_mft_record(base_ni);
1416 * If a parallel write made the attribute non-resident, drop the mft
1417 * record and retry the writepage.
1419 if (unlikely(NInoNonResident(ni))) {
1420 unmap_mft_record(base_ni);
1421 goto retry_writepage;
1423 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1424 if (unlikely(!ctx)) {
1428 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1429 CASE_SENSITIVE, 0, NULL, 0, ctx);
1433 * Keep the VM happy. This must be done otherwise the radix-tree tag
1434 * PAGECACHE_TAG_DIRTY remains set even though the page is clean.
1436 BUG_ON(PageWriteback(page));
1437 set_page_writeback(page);
1440 * Here, we do not need to zero the out of bounds area every time
1441 * because the below memcpy() already takes care of the
1442 * mmap-at-end-of-file requirements. If the file is converted to a
1443 * non-resident one, then the code path use is switched to the
1444 * non-resident one where the zeroing happens on each ntfs_writepage()
1447 attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
1448 i_size = i_size_read(vi);
1449 if (unlikely(attr_len > i_size)) {
/*
 * NOTE(review): attr_len is presumably clamped to i_size just above
 * this assignment - confirm against the full source.
 */
1451 ctx->attr->data.resident.value_length = cpu_to_le32(attr_len);
1453 kaddr = kmap_atomic(page, KM_USER0);
1454 /* Copy the data from the page to the mft record. */
1455 memcpy((u8*)ctx->attr +
1456 le16_to_cpu(ctx->attr->data.resident.value_offset),
1458 flush_dcache_mft_record_page(ctx->ntfs_ino);
1459 /* Zero out of bounds area in the page cache page. */
1460 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
1461 flush_dcache_page(page);
1462 kunmap_atomic(kaddr, KM_USER0);
1464 end_page_writeback(page);
1466 /* Mark the mft record dirty, so it gets written back. */
1467 mark_mft_record_dirty(ctx->ntfs_ino);
1468 ntfs_attr_put_search_ctx(ctx);
1469 unmap_mft_record(base_ni);
/*
 * Error path: -ENOMEM is transient, so redirty the page and retry
 * later; any other error is fatal for this write.
 */
1472 if (err == -ENOMEM) {
1473 ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
1474 "page so we try again later.");
1476 * Put the page back on mapping->dirty_pages, but leave its
1477 * buffers' dirty state as-is.
1479 redirty_page_for_writepage(wbc, page);
1482 ntfs_error(vi->i_sb, "Resident attribute write failed with "
1485 NVolSetErrors(ni->vol);
1490 ntfs_attr_put_search_ctx(ctx);
1492 unmap_mft_record(base_ni);
1497 * ntfs_prepare_nonresident_write -
1500 static int ntfs_prepare_nonresident_write(struct page *page,
1501 unsigned from, unsigned to)
1505 s64 initialized_size;
1507 sector_t block, ablock, iblock;
1511 runlist_element *rl;
/* wait[] holds buffers submitted for read-in below; we wait on them at the end. */
1512 struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
1513 unsigned long flags;
1514 unsigned int vcn_ofs, block_start, block_end, blocksize;
1517 unsigned char blocksize_bits;
1519 vi = page->mapping->host;
1523 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
1524 "0x%lx, from = %u, to = %u.", ni->mft_no, ni->type,
1525 page->index, from, to);
1527 BUG_ON(!NInoNonResident(ni));
1529 blocksize_bits = vi->i_blkbits;
1530 blocksize = 1 << blocksize_bits;
1533 * create_empty_buffers() will create uptodate/dirty buffers if the
1534 * page is uptodate/dirty.
1536 if (!page_has_buffers(page))
1537 create_empty_buffers(page, blocksize, 0);
1538 bh = head = page_buffers(page);
1542 /* The first block in the page. */
1543 block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
/* Take a consistent snapshot of the three sizes under the size lock. */
1545 read_lock_irqsave(&ni->size_lock, flags);
1547 * The first out of bounds block for the allocated size. No need to
1548 * round up as allocated_size is in multiples of cluster size and the
1549 * minimum cluster size is 512 bytes, which is equal to the smallest
1552 ablock = ni->allocated_size >> blocksize_bits;
1553 i_size = i_size_read(vi);
1554 initialized_size = ni->initialized_size;
1555 read_unlock_irqrestore(&ni->size_lock, flags);
1557 /* The last (fully or partially) initialized block. */
1558 iblock = initialized_size >> blocksize_bits;
1560 /* Loop through all the buffers in the page. */
1565 block_end = block_start + blocksize;
1567 * If buffer @bh is outside the write, just mark it uptodate
1568 * if the page is uptodate and continue with the next buffer.
1570 if (block_end <= from || block_start >= to) {
1571 if (PageUptodate(page)) {
1572 if (!buffer_uptodate(bh))
1573 set_buffer_uptodate(bh);
1578 * @bh is at least partially being written to.
1579 * Make sure it is not marked as new.
1581 //if (buffer_new(bh))
1582 // clear_buffer_new(bh);
1584 if (block >= ablock) {
1585 // TODO: block is above allocated_size, need to
1586 // allocate it. Best done in one go to accommodate not
1587 // only block but all above blocks up to and including:
1588 // ((page->index << PAGE_CACHE_SHIFT) + to + blocksize
1589 // - 1) >> blocksize_bits. Obviously will need to round
1590 // up to next cluster boundary, too. This should be
1591 // done with a helper function, so it can be reused.
1592 ntfs_error(vol->sb, "Writing beyond allocated size "
1593 "is not supported yet. Sorry.");
1596 // Need to update ablock.
1597 // Need to set_buffer_new() on all block bhs that are
1601 * Now we have enough allocated size to fulfill the whole
1602 * request, i.e. block < ablock is true.
1604 if (unlikely((block >= iblock) &&
1605 (initialized_size < i_size))) {
1607 * If this page is fully outside initialized size, zero
1608 * out all pages between the current initialized size
1609 * and the current page. Just use ntfs_readpage() to do
1610 * the zeroing transparently.
1612 if (block > iblock) {
1614 // For each page do:
1615 // - read_cache_page()
1616 // Again for each page do:
1617 // - wait_on_page_locked()
1618 // - Check (PageUptodate(page) &&
1619 // !PageError(page))
1620 // Update initialized size in the attribute and
1622 // Again, for each page do:
1623 // __set_page_dirty_buffers();
1624 // page_cache_release()
1625 // We don't need to wait on the writes.
1629 * The current page straddles initialized size. Zero
1630 * all non-uptodate buffers and set them uptodate (and
1631 * dirty?). Note, there aren't any non-uptodate buffers
1632 * if the page is uptodate.
1633 * FIXME: For an uptodate page, the buffers may need to
1634 * be written out because they were not initialized on
1637 if (!PageUptodate(page)) {
1639 // Zero any non-uptodate buffers up to i_size.
1640 // Set them uptodate and dirty.
1643 // Update initialized size in the attribute and in the
1644 // inode (up to i_size).
1646 // FIXME: This is inefficient. Try to batch the two
1647 // size changes to happen in one go.
1648 ntfs_error(vol->sb, "Writing beyond initialized size "
1649 "is not supported yet. Sorry.");
1652 // Do NOT set_buffer_new() BUT DO clear buffer range
1653 // outside write request range.
1654 // set_buffer_uptodate() on complete buffers as well as
1655 // set_buffer_dirty().
1658 /* Need to map unmapped buffers. */
1659 if (!buffer_mapped(bh)) {
1660 /* Unmapped buffer. Need to map it. */
1661 bh->b_bdev = vol->sb->s_bdev;
1663 /* Convert block into corresponding vcn and offset. */
1664 vcn = (VCN)block << blocksize_bits >>
1665 vol->cluster_size_bits;
1666 vcn_ofs = ((VCN)block << blocksize_bits) &
1667 vol->cluster_size_mask;
/* The runlist is protected by ni->runlist.lock; take it shared for the lookup. */
1672 down_read(&ni->runlist.lock);
1673 rl = ni->runlist.rl;
1675 if (likely(rl != NULL)) {
1676 /* Seek to element containing target vcn. */
1677 while (rl->length && rl[1].vcn <= vcn)
1679 lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
1681 lcn = LCN_RL_NOT_MAPPED;
1682 if (unlikely(lcn < 0)) {
1684 * We extended the attribute allocation above.
1685 * If we hit an ENOENT here it means that the
1686 * allocation was insufficient which is a bug.
1688 BUG_ON(lcn == LCN_ENOENT);
1690 /* It is a hole, need to instantiate it. */
1691 if (lcn == LCN_HOLE) {
1692 // TODO: Instantiate the hole.
1693 // clear_buffer_new(bh);
1694 // unmap_underlying_metadata(bh->b_bdev,
1696 // For non-uptodate buffers, need to
1697 // zero out the region outside the
1698 // request in this bh or all bhs,
1699 // depending on what we implemented
1701 // Need to flush_dcache_page().
1702 // Or could use set_buffer_new()
1704 ntfs_error(vol->sb, "Writing into "
1705 "sparse regions is "
1706 "not supported yet. "
1710 up_read(&ni->runlist.lock);
1712 } else if (!is_retry &&
1713 lcn == LCN_RL_NOT_MAPPED) {
1716 * Attempt to map runlist, dropping
1717 * lock for the duration.
1719 up_read(&ni->runlist.lock);
1720 err = ntfs_map_runlist(ni, vcn);
1722 goto lock_retry_remap;
1726 up_read(&ni->runlist.lock);
1728 * Failed to map the buffer, even after
1732 ntfs_error(vol->sb, "Failed to write to inode "
1733 "0x%lx, attribute type 0x%x, "
1734 "vcn 0x%llx, offset 0x%x "
1735 "because its location on disk "
1736 "could not be determined%s "
1737 "(error code %lli).",
1738 ni->mft_no, ni->type,
1739 (unsigned long long)vcn,
1740 vcn_ofs, is_retry ? " even "
1741 "after retrying" : "",
1747 /* We now have a successful remap, i.e. lcn >= 0. */
1749 /* Setup buffer head to correct block. */
1750 bh->b_blocknr = ((lcn << vol->cluster_size_bits)
1751 + vcn_ofs) >> blocksize_bits;
1752 set_buffer_mapped(bh);
1754 // FIXME: Something analogous to this is needed for
1755 // each newly allocated block, i.e. BH_New.
1756 // FIXME: Might need to take this out of the
1757 // if (!buffer_mapped(bh)) {}, depending on how we
1758 // implement things during the allocated_size and
1759 // initialized_size extension code above.
1760 if (buffer_new(bh)) {
1761 clear_buffer_new(bh);
1762 unmap_underlying_metadata(bh->b_bdev,
1764 if (PageUptodate(page)) {
1765 set_buffer_uptodate(bh);
1769 * Page is _not_ uptodate, zero surrounding
1770 * region. NOTE: This is how we decide if to
1773 if (block_end > to || block_start < from) {
1776 kaddr = kmap_atomic(page, KM_USER0);
1778 memset(kaddr + to, 0,
1780 if (block_start < from)
1781 memset(kaddr + block_start, 0,
1784 flush_dcache_page(page);
1785 kunmap_atomic(kaddr, KM_USER0);
1790 /* @bh is mapped, set it uptodate if the page is uptodate. */
1791 if (PageUptodate(page)) {
1792 if (!buffer_uptodate(bh))
1793 set_buffer_uptodate(bh);
1797 * The page is not uptodate. The buffer is mapped. If it is not
1798 * uptodate, and it is only partially being written to, we need
1799 * to read the buffer in before the write, i.e. right now.
1801 if (!buffer_uptodate(bh) &&
1802 (block_start < from || block_end > to)) {
1803 ll_rw_block(READ, 1, &bh);
1806 } while (block++, block_start = block_end,
1807 (bh = bh->b_this_page) != head);
1809 /* Release the lock if we took it. */
1811 up_read(&ni->runlist.lock);
1815 /* If we issued read requests, let them complete. */
1816 while (wait_bh > wait) {
1817 wait_on_buffer(*--wait_bh);
1818 if (!buffer_uptodate(*wait_bh))
1822 ntfs_debug("Done.");
1826 * Zero out any newly allocated blocks to avoid exposing stale data.
1827 * If BH_New is set, we know that the block was newly allocated in the
1829 * FIXME: What about initialized_size increments? Have we done all the
1830 * required zeroing above? If not this error handling is broken, and
1831 * in particular the if (block_end <= from) check is completely bogus.
1837 block_end = block_start + blocksize;
1838 if (block_end <= from)
1840 if (block_start >= to)
1842 if (buffer_new(bh)) {
1845 clear_buffer_new(bh);
1846 kaddr = kmap_atomic(page, KM_USER0);
1847 memset(kaddr + block_start, 0, bh->b_size);
1848 kunmap_atomic(kaddr, KM_USER0);
1849 set_buffer_uptodate(bh);
1850 mark_buffer_dirty(bh);
1853 } while (block_start = block_end, (bh = bh->b_this_page) != head);
1855 flush_dcache_page(page);
1857 up_read(&ni->runlist.lock);
1862 * ntfs_prepare_write - prepare a page for receiving data
1864 * This is called from generic_file_write() with i_sem held on the inode
1865 * (@page->mapping->host). The @page is locked but not kmap()ped. The source
1866 * data has not yet been copied into the @page.
1868 * Need to extend the attribute/fill in holes if necessary, create blocks and
1869 * make partially overwritten blocks uptodate,
1871 * i_size is not to be modified yet.
1873 * Return 0 on success or -errno on error.
1875 * Should be using block_prepare_write() [support for sparse files] or
1876 * cont_prepare_write() [no support for sparse files]. Cannot do that due to
1877 * ntfs specifics but can look at them for implementation guidance.
1879 * Note: In the range, @from is inclusive and @to is exclusive, i.e. @from is
1880 * the first byte in the page that will be written to and @to is the first byte
1881 * after the last byte that will be written to.
1883 static int ntfs_prepare_write(struct file *file, struct page *page,
1884 unsigned from, unsigned to)
1888 struct inode *vi = page->mapping->host;
1889 ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
1890 ntfs_volume *vol = ni->vol;
1891 ntfs_attr_search_ctx *ctx = NULL;
1892 MFT_RECORD *m = NULL;
1898 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
1899 "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
1900 page->index, from, to);
1901 BUG_ON(!PageLocked(page));
1902 BUG_ON(from > PAGE_CACHE_SIZE);
1903 BUG_ON(to > PAGE_CACHE_SIZE);
1905 BUG_ON(NInoMstProtected(ni));
1907 * If a previous ntfs_truncate() failed, repeat it and abort if it
1910 if (unlikely(NInoTruncateFailed(ni))) {
1911 down_write(&vi->i_alloc_sem);
1912 err = ntfs_truncate(vi);
1913 up_write(&vi->i_alloc_sem);
1914 if (err || NInoTruncateFailed(ni)) {
1920 /* If the attribute is not resident, deal with it elsewhere. */
1921 if (NInoNonResident(ni)) {
1923 * Only unnamed $DATA attributes can be compressed, encrypted,
1926 if (ni->type == AT_DATA && !ni->name_len) {
1927 /* If file is encrypted, deny access, just like NT4. */
1928 if (NInoEncrypted(ni)) {
1929 ntfs_debug("Denying write access to encrypted "
1933 /* Compressed data streams are handled in compress.c. */
1934 if (NInoCompressed(ni)) {
1935 // TODO: Implement and replace this check with
1936 // return ntfs_write_compressed_block(page);
1937 ntfs_error(vi->i_sb, "Writing to compressed "
1938 "files is not supported yet. "
1942 // TODO: Implement and remove this check.
1943 if (NInoSparse(ni)) {
1944 ntfs_error(vi->i_sb, "Writing to sparse files "
1945 "is not supported yet. Sorry.");
1949 /* Normal data stream. */
1950 return ntfs_prepare_nonresident_write(page, from, to);
1953 * Attribute is resident, implying it is not compressed, encrypted, or
1956 BUG_ON(page_has_buffers(page));
/* Byte offset in the file of the end of this write. */
1957 new_size = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
1958 /* If we do not need to resize the attribute allocation we are done. */
1959 if (new_size <= i_size_read(vi))
1961 /* Map, pin, and lock the (base) mft record. */
/*
 * NOTE(review): ni->ext.base_ntfs_ino is only meaningful for
 * attribute/extent inodes; the guarding check is not visible in this
 * excerpt - confirm against the full source.
 */
1965 base_ni = ni->ext.base_ntfs_ino;
1966 m = map_mft_record(base_ni);
1973 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1974 if (unlikely(!ctx)) {
1978 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1979 CASE_SENSITIVE, 0, NULL, 0, ctx);
1980 if (unlikely(err)) {
1987 /* The total length of the attribute value. */
1988 attr_len = le32_to_cpu(a->data.resident.value_length);
1989 /* Fix an eventual previous failure of ntfs_commit_write(). */
1990 i_size = i_size_read(vi);
1991 if (unlikely(attr_len > i_size)) {
1993 a->data.resident.value_length = cpu_to_le32(attr_len);
1995 /* If we do not need to resize the attribute allocation we are done. */
1996 if (new_size <= attr_len)
1998 /* Check if new size is allowed in $AttrDef. */
1999 err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
2000 if (unlikely(err)) {
2001 if (err == -ERANGE) {
2002 ntfs_error(vol->sb, "Write would cause the inode "
2003 "0x%lx to exceed the maximum size for "
2004 "its attribute type (0x%x). Aborting "
2005 "write.", vi->i_ino,
2006 le32_to_cpu(ni->type));
2008 ntfs_error(vol->sb, "Inode 0x%lx has unknown "
2009 "attribute type 0x%x. Aborting "
2010 "write.", vi->i_ino,
2011 le32_to_cpu(ni->type));
2017 * Extend the attribute record to be able to store the new attribute
2020 if (new_size >= vol->mft_record_size || ntfs_attr_record_resize(m, a,
2021 le16_to_cpu(a->data.resident.value_offset) +
2023 /* Not enough space in the mft record. */
2024 ntfs_error(vol->sb, "Not enough space in the mft record for "
2025 "the resized attribute value. This is not "
2026 "supported yet. Aborting write.");
2031 * We have enough space in the mft record to fit the write. This
2032 * implies the attribute is smaller than the mft record and hence the
2033 * attribute must be in a single page and hence page->index must be 0.
2035 BUG_ON(page->index);
2037 * If the beginning of the write is past the old size, enlarge the
2038 * attribute value up to the beginning of the write and fill it with
2041 if (from > attr_len) {
2042 memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
2043 attr_len, 0, from - attr_len);
2044 a->data.resident.value_length = cpu_to_le32(from);
2045 /* Zero the corresponding area in the page as well. */
2046 if (PageUptodate(page)) {
2047 kaddr = kmap_atomic(page, KM_USER0);
2048 memset(kaddr + attr_len, 0, from - attr_len);
2049 kunmap_atomic(kaddr, KM_USER0);
2050 flush_dcache_page(page);
2053 flush_dcache_mft_record_page(ctx->ntfs_ino);
2054 mark_mft_record_dirty(ctx->ntfs_ino);
2056 ntfs_attr_put_search_ctx(ctx);
2057 unmap_mft_record(base_ni);
2059 * Because resident attributes are handled by memcpy() to/from the
2060 * corresponding MFT record, and because this form of i/o is byte
2061 * aligned rather than block aligned, there is no need to bring the
2062 * page uptodate here as in the non-resident case where we need to
2063 * bring the buffers straddled by the write uptodate before
2064 * generic_file_write() does the copying from userspace.
2066 * We thus defer the uptodate bringing of the page region outside the
2067 * region written to to ntfs_commit_write(), which makes the code
2068 * simpler and saves one atomic kmap which is good.
2071 ntfs_debug("Done.");
2075 ntfs_warning(vi->i_sb, "Error allocating memory required to "
2076 "prepare the write.");
2078 ntfs_error(vi->i_sb, "Resident attribute prepare write failed "
2079 "with error %i.", err);
2085 ntfs_attr_put_search_ctx(ctx);
2087 unmap_mft_record(base_ni);
2092 * ntfs_commit_nonresident_write -
2095 static int ntfs_commit_nonresident_write(struct page *page,
2096 unsigned from, unsigned to)
2098 s64 pos = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
2099 struct inode *vi = page->mapping->host;
2100 struct buffer_head *bh, *head;
2101 unsigned int block_start, block_end, blocksize;
2104 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
2105 "0x%lx, from = %u, to = %u.", vi->i_ino,
2106 NTFS_I(vi)->type, page->index, from, to);
2107 blocksize = 1 << vi->i_blkbits;
2109 // FIXME: We need a whole slew of special cases in here for compressed
2110 // files for example...
2111 // For now, we know ntfs_prepare_write() would have failed so we can't
2112 // get here in any of the cases which we have to special case, so we
2113 // are just a ripped off, unrolled generic_commit_write().
2115 bh = head = page_buffers(page);
/* Walk all buffers; those overlapping [from, to) become uptodate and dirty. */
2119 block_end = block_start + blocksize;
2120 if (block_end <= from || block_start >= to) {
2121 if (!buffer_uptodate(bh))
2124 set_buffer_uptodate(bh);
2125 mark_buffer_dirty(bh);
2127 } while (block_start = block_end, (bh = bh->b_this_page) != head);
2129 * If this is a partial write which happened to make all buffers
2130 * uptodate then we can optimize away a bogus ->readpage() for the next
2131 * read(). Here we 'discover' whether the page went uptodate as a
2132 * result of this (potentially partial) write.
2135 SetPageUptodate(page);
2137 * Not convinced about this at all. See disparity comment above. For
2138 * now we know ntfs_prepare_write() would have failed in the write
2139 * exceeds i_size case, so this will never trigger which is fine.
2141 if (pos > i_size_read(vi)) {
2142 ntfs_error(vi->i_sb, "Writing beyond the existing file size is "
2143 "not supported yet. Sorry.");
2145 // vi->i_size = pos;
2146 // mark_inode_dirty(vi);
2148 ntfs_debug("Done.");
2153 * ntfs_commit_write - commit the received data
2155 * This is called from generic_file_write() with i_sem held on the inode
2156 * (@page->mapping->host). The @page is locked but not kmap()ped. The source
2157 * data has already been copied into the @page. ntfs_prepare_write() has been
2158 * called before the data copied and it returned success so we can take the
2159 * results of various BUG checks and some error handling for granted.
2161 * Need to mark modified blocks dirty so they get written out later when
2162 * ntfs_writepage() is invoked by the VM.
2164 * Return 0 on success or -errno on error.
2166 * Should be using generic_commit_write(). This marks buffers uptodate and
2167 * dirty, sets the page uptodate if all buffers in the page are uptodate, and
2168 * updates i_size if the end of io is beyond i_size. In that case, it also
2169 * marks the inode dirty.
2171 * Cannot use generic_commit_write() due to ntfs specialities but can look at
2172 * it for implementation guidance.
2174 * If things have gone as outlined in ntfs_prepare_write(), then we do not
2175 * need to do any page content modifications here at all, except in the write
2176 * to resident attribute case, where we need to do the uptodate bringing here
2177 * which we combine with the copying into the mft record which means we save
2180 static int ntfs_commit_write(struct file *file, struct page *page,
2181 unsigned from, unsigned to)
2183 struct inode *vi = page->mapping->host;
2184 ntfs_inode *base_ni, *ni = NTFS_I(vi);
2185 char *kaddr, *kattr;
2186 ntfs_attr_search_ctx *ctx;
2192 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
2193 "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
2194 page->index, from, to);
2195 /* If the attribute is not resident, deal with it elsewhere. */
2196 if (NInoNonResident(ni)) {
2197 /* Only unnamed $DATA attributes can be compressed/encrypted. */
2198 if (ni->type == AT_DATA && !ni->name_len) {
2199 /* Encrypted files need separate handling. */
2200 if (NInoEncrypted(ni)) {
2201 // We never get here at present!
2204 /* Compressed data streams are handled in compress.c. */
2205 if (NInoCompressed(ni)) {
2206 // TODO: Implement this!
2207 // return ntfs_write_compressed_block(page);
2208 // We never get here at present!
2212 /* Normal data stream. */
2213 return ntfs_commit_nonresident_write(page, from, to);
2216 * Attribute is resident, implying it is not compressed, encrypted, or
/*
 * NOTE(review): ni->ext.base_ntfs_ino is only meaningful for
 * attribute/extent inodes; the guarding check is not visible in this
 * excerpt - confirm against the full source.
 */
2222 base_ni = ni->ext.base_ntfs_ino;
2223 /* Map, pin, and lock the mft record. */
2224 m = map_mft_record(base_ni);
2231 ctx = ntfs_attr_get_search_ctx(base_ni, m);
2232 if (unlikely(!ctx)) {
2236 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
2237 CASE_SENSITIVE, 0, NULL, 0, ctx);
2238 if (unlikely(err)) {
2244 /* The total length of the attribute value. */
2245 attr_len = le32_to_cpu(a->data.resident.value_length);
2246 BUG_ON(from > attr_len);
2247 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
2248 kaddr = kmap_atomic(page, KM_USER0);
2249 /* Copy the received data from the page to the mft record. */
2250 memcpy(kattr + from, kaddr + from, to - from);
2251 /* Update the attribute length if necessary. */
2252 if (to > attr_len) {
/*
 * NOTE(review): attr_len is presumably set to 'to' here before being
 * written back - confirm against the full source.
 */
2254 a->data.resident.value_length = cpu_to_le32(attr_len);
2257 * If the page is not uptodate, bring the out of bounds area(s)
2258 * uptodate by copying data from the mft record to the page.
2260 if (!PageUptodate(page)) {
/* Region [0, from) was not written by the caller; fill it from the attribute. */
2262 memcpy(kaddr, kattr, from);
2264 memcpy(kaddr + to, kattr + to, attr_len - to);
2265 /* Zero the region outside the end of the attribute value. */
2266 if (attr_len < PAGE_CACHE_SIZE)
2267 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
2269 * The probability of not having done any of the above is
2270 * extremely small, so we just flush unconditionally.
2272 flush_dcache_page(page);
2273 SetPageUptodate(page);
2275 kunmap_atomic(kaddr, KM_USER0);
2276 /* Update i_size if necessary. */
2277 if (i_size_read(vi) < attr_len) {
2278 unsigned long flags;
2280 write_lock_irqsave(&ni->size_lock, flags);
2281 ni->allocated_size = ni->initialized_size = attr_len;
2282 i_size_write(vi, attr_len);
2283 write_unlock_irqrestore(&ni->size_lock, flags);
2285 /* Mark the mft record dirty, so it gets written back. */
2286 flush_dcache_mft_record_page(ctx->ntfs_ino);
2287 mark_mft_record_dirty(ctx->ntfs_ino);
2288 ntfs_attr_put_search_ctx(ctx);
2289 unmap_mft_record(base_ni);
2290 ntfs_debug("Done.");
/*
 * Error path: -ENOMEM is transient, so redirty an uptodate page for a
 * later retry; a non-uptodate page means the copied data is gone.
 */
2293 if (err == -ENOMEM) {
2294 ntfs_warning(vi->i_sb, "Error allocating memory required to "
2295 "commit the write.");
2296 if (PageUptodate(page)) {
2297 ntfs_warning(vi->i_sb, "Page is uptodate, setting "
2298 "dirty so the write will be retried "
2299 "later on by the VM.");
2301 * Put the page on mapping->dirty_pages, but leave its
2302 * buffers' dirty state as-is.
2304 __set_page_dirty_nobuffers(page);
2307 ntfs_error(vi->i_sb, "Page is not uptodate. Written "
2308 "data has been lost.");
2310 ntfs_error(vi->i_sb, "Resident attribute commit write failed "
2311 "with error %i.", err);
2312 NVolSetErrors(ni->vol);
2316 ntfs_attr_put_search_ctx(ctx);
2318 unmap_mft_record(base_ni);
2322 #endif /* NTFS_RW */
2325 * ntfs_aops - general address space operations for inodes and attributes
2327 struct address_space_operations ntfs_aops = {
2328 .readpage = ntfs_readpage, /* Fill page with data. */
2329 .sync_page = block_sync_page, /* Currently, just unplugs the
2330 disk request queue. */
/*
 * NOTE(review): the write methods below are expected to be guarded by
 * a matching #ifdef NTFS_RW (see the #endif) - confirm in full source.
 */
2332 .writepage = ntfs_writepage, /* Write dirty page to disk. */
2333 .prepare_write = ntfs_prepare_write, /* Prepare page and buffers
2334 ready to receive data. */
2335 .commit_write = ntfs_commit_write, /* Commit received data. */
2336 #endif /* NTFS_RW */
2340 * ntfs_mst_aops - general address space operations for mst protected inodes
2343 struct address_space_operations ntfs_mst_aops = {
2344 .readpage = ntfs_readpage, /* Fill page with data. */
2345 .sync_page = block_sync_page, /* Currently, just unplugs the
2346 disk request queue. */
/*
 * NOTE(review): the write methods below are expected to be guarded by
 * a matching #ifdef NTFS_RW (see the #endif) - confirm in full source.
 */
2348 .writepage = ntfs_writepage, /* Write dirty page to disk. */
2349 .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty
2350 without touching the buffers
2351 belonging to the page. */
2352 #endif /* NTFS_RW */
2358 * mark_ntfs_record_dirty - mark an ntfs record dirty
2359 * @page: page containing the ntfs record to mark dirty
2360 * @ofs: byte offset within @page at which the ntfs record begins
2362 * Set the buffers and the page in which the ntfs record is located dirty.
2364 * The latter also marks the vfs inode the ntfs record belongs to dirty
2365 * (I_DIRTY_PAGES only).
2367 * If the page does not have buffers, we create them and set them uptodate.
2368 * The page may not be locked which is why we need to handle the buffers under
2369 * the mapping->private_lock. Once the buffers are marked dirty we no longer
2370 * need the lock since try_to_free_buffers() does not free dirty buffers.
2372 void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
2373 struct address_space *mapping = page->mapping;
2374 ntfs_inode *ni = NTFS_I(mapping->host);
2375 struct buffer_head *bh, *head, *buffers_to_free = NULL;
2376 unsigned int end, bh_size, bh_ofs;
2378 BUG_ON(!PageUptodate(page));
/* End offset (exclusive) of the ntfs record within the page. */
2379 end = ofs + ni->itype.index.block_size;
2380 bh_size = 1 << VFS_I(ni)->i_blkbits;
2381 spin_lock(&mapping->private_lock);
2382 if (unlikely(!page_has_buffers(page))) {
/* Cannot allocate while holding the spinlock; drop it for the allocation. */
2383 spin_unlock(&mapping->private_lock);
2384 bh = head = alloc_page_buffers(page, bh_size, 1);
2385 spin_lock(&mapping->private_lock);
/* Re-check under the lock: another task may have attached buffers meanwhile. */
2386 if (likely(!page_has_buffers(page))) {
2387 struct buffer_head *tail;
2390 set_buffer_uptodate(bh);
2392 bh = bh->b_this_page;
2394 tail->b_this_page = head;
2395 attach_page_buffers(page, head);
/* Lost the race: defer freeing our unused buffer ring until the lock is dropped. */
2397 buffers_to_free = bh;
2399 bh = head = page_buffers(page);
/* Dirty only the buffers that overlap [ofs, end). */
2401 bh_ofs = bh_offset(bh);
2402 if (bh_ofs + bh_size <= ofs)
2404 if (unlikely(bh_ofs >= end))
2406 set_buffer_dirty(bh);
2407 } while ((bh = bh->b_this_page) != head);
2408 spin_unlock(&mapping->private_lock);
2409 __set_page_dirty_nobuffers(page);
/* Free the buffer heads allocated above if we lost the attach race. */
2410 if (unlikely(buffers_to_free)) {
2412 bh = buffers_to_free->b_this_page;
2413 free_buffer_head(buffers_to_free);
2414 buffers_to_free = bh;
2415 } while (buffers_to_free);