X-Git-Url: http://pilppa.org/gitweb/gitweb.cgi?a=blobdiff_plain;f=mm%2Ffilemap.c;h=5dfc093ceb3d680322158f508bed3b0b47a8fbb7;hb=d56e03cd275486eb8141116a7af2df7457cb0115;hp=ec469235985d6e2cd89d39bf8bd40405f133e142;hpb=63c422afe3739b68bec0b5c42807d1450c951caf;p=linux-2.6-omap-h63xx.git diff --git a/mm/filemap.c b/mm/filemap.c index ec469235985..5dfc093ceb3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -75,8 +75,8 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->mmap_sem * ->lock_page (access_process_vm) * - * ->mmap_sem - * ->i_mutex (msync) + * ->i_mutex (generic_file_buffered_write) + * ->mmap_sem (fault_in_pages_readable->do_page_fault) * * ->i_mutex * ->i_alloc_sem (various) @@ -327,7 +327,7 @@ EXPORT_SYMBOL(sync_page_range); * @pos: beginning offset in pages to write * @count: number of bytes to write * - * Note: Holding i_mutex across sync_page_range_nolock is not a good idea + * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea * as it forces O_SYNC writers to different parts of the same file * to be serialised right until io completion. */ @@ -467,25 +467,15 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, } #ifdef CONFIG_NUMA -struct page *page_cache_alloc(struct address_space *x) -{ - if (cpuset_do_page_mem_spread()) { - int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, mapping_gfp_mask(x), 0); - } - return alloc_pages(mapping_gfp_mask(x), 0); -} -EXPORT_SYMBOL(page_cache_alloc); - -struct page *page_cache_alloc_cold(struct address_space *x) +struct page *__page_cache_alloc(gfp_t gfp) { if (cpuset_do_page_mem_spread()) { int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0); + return alloc_pages_node(n, gfp, 0); } - return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0); + return alloc_pages(gfp, 0); } -EXPORT_SYMBOL(page_cache_alloc_cold); +EXPORT_SYMBOL(__page_cache_alloc); #endif static int __sleep_on_page_lock(void *word) @@ -615,26 +605,6 @@ struct page * find_get_page(struct address_space *mapping, unsigned long offset) } EXPORT_SYMBOL(find_get_page); -/** - * find_trylock_page - find and lock a page - * @mapping: the address_space to search - * @offset: the page index - * - * Same as find_get_page(), but trylock it instead of incrementing the count. - */ -struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) -{ - struct page *page; - - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); - if (page && TestSetPageLocked(page)) - page = NULL; - read_unlock_irq(&mapping->tree_lock); - return page; -} -EXPORT_SYMBOL(find_trylock_page); - /** * find_lock_page - locate, pin and lock a pagecache page * @mapping: the address_space to search @@ -814,7 +784,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, * @mapping: target address_space * @index: the page index * - * Same as grab_cache_page, but do not wait if the page is unavailable. + * Same as grab_cache_page(), but do not wait if the page is unavailable. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should * be safe to call while holding the lock for another page. @@ -826,7 +796,6 @@ struct page * grab_cache_page_nowait(struct address_space *mapping, unsigned long index) { struct page *page = find_get_page(mapping, index); - gfp_t gfp_mask; if (page) { if (!TestSetPageLocked(page)) @@ -834,9 +803,8 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index) page_cache_release(page); return NULL; } - gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS; - page = alloc_pages(gfp_mask, 0); - if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) { + page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); + if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { page_cache_release(page); page = NULL; } @@ -1139,11 +1107,11 @@ success: } /** - * __generic_file_aio_read - generic filesystem read routine + * generic_file_aio_read - generic filesystem read routine * @iocb: kernel I/O control block * @iov: io vector request * @nr_segs: number of segments in the iovec - * @ppos: current file position + * @pos: current file position * * This is the "read()" routine for all filesystems * that can use the page cache directly. @@ -1193,13 +1161,13 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, if (pos < size) { retval = generic_file_direct_IO(READ, iocb, iov, pos, nr_segs); - if (retval > 0 && !is_sync_kiocb(iocb)) - retval = -EIOCBQUEUED; if (retval > 0) *ppos = pos + retval; } - file_accessed(filp); - goto out; + if (likely(retval != 0)) { + file_accessed(filp); + goto out; + } } retval = 0; @@ -1455,7 +1423,6 @@ no_cached_page: * effect. */ error = page_cache_read(file, pgoff); - grab_swap_token(); /* * The page we want has now been added to the page cache. @@ -1882,11 +1849,10 @@ repeat: * if suid or (sgid and xgrp) * remove privs */ -int remove_suid(struct dentry *dentry) +int should_remove_suid(struct dentry *dentry) { mode_t mode = dentry->d_inode->i_mode; int kill = 0; - int result = 0; /* suid always must be killed */ if (unlikely(mode & S_ISUID)) @@ -1899,13 +1865,29 @@ int remove_suid(struct dentry *dentry) if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) kill |= ATTR_KILL_SGID; - if (unlikely(kill && !capable(CAP_FSETID))) { - struct iattr newattrs; + if (unlikely(kill && !capable(CAP_FSETID))) + return kill; - newattrs.ia_valid = ATTR_FORCE | kill; - result = notify_change(dentry, &newattrs); - } - return result; + return 0; +} +EXPORT_SYMBOL(should_remove_suid); + +int __remove_suid(struct dentry *dentry, int kill) +{ + struct iattr newattrs; + + newattrs.ia_valid = ATTR_FORCE | kill; + return notify_change(dentry, &newattrs); +} + +int remove_suid(struct dentry *dentry) +{ + int kill = should_remove_suid(dentry); + + if (unlikely(kill)) + return __remove_suid(dentry, kill); + + return 0; } EXPORT_SYMBOL(remove_suid); @@ -2043,15 +2025,14 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, * Sync the fs metadata but not the minor inode changes and * of course not the data as we did direct DMA for the IO. * i_mutex is held, which protects generic_osync_inode() from - * livelocking. + * livelocking. AIO O_DIRECT ops attempt to sync metadata here. */ - if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + if ((written >= 0 || written == -EIOCBQUEUED) && + ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); if (err < 0) written = err; } - if (written == count && !is_sync_kiocb(iocb)) - written = -EIOCBQUEUED; return written; } EXPORT_SYMBOL(generic_file_direct_write); @@ -2098,21 +2079,27 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, /* Limit the size of the copy to the caller's write size */ bytes = min(bytes, count); - /* - * Limit the size of the copy to that of the current segment, - * because fault_in_pages_readable() doesn't know how to walk - * segments. - */ - bytes = min(bytes, cur_iov->iov_len - iov_base); - - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. + /* We only need to worry about prefaulting when writes are from + * user-space. NFSd uses vfs_writev with several non-aligned + * segments in the vector, and limiting to one segment a time is + * a noticeable performance for re-write */ - fault_in_pages_readable(buf, bytes); + if (!segment_eq(get_fs(), KERNEL_DS)) { + /* + * Limit the size of the copy to that of the current + * segment, because fault_in_pages_readable() doesn't + * know how to walk segments. + */ + bytes = min(bytes, cur_iov->iov_len - iov_base); + /* + * Bring in the user page that we will copy from + * _first_. Otherwise there's a nasty deadlock on + * copying from the same page as we're writing to, + * without it being marked up-to-date. + */ + fault_in_pages_readable(buf, bytes); + } page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); if (!page) { status = -ENOMEM; @@ -2220,7 +2207,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) { struct file *file = iocb->ki_filp; - const struct address_space * mapping = file->f_mapping; + struct address_space * mapping = file->f_mapping; size_t ocount; /* original count */ size_t count; /* after file limit checks */ struct inode *inode = mapping->host; @@ -2265,7 +2252,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; - err = remove_suid(file->f_dentry); + err = remove_suid(file->f_path.dentry); if (err) goto out; @@ -2273,8 +2260,11 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (unlikely(file->f_flags & O_DIRECT)) { - written = generic_file_direct_write(iocb, iov, - &nr_segs, pos, ppos, count, ocount); + loff_t endbyte; + ssize_t written_buffered; + + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, + ppos, count, ocount); if (written < 0 || written == count) goto out; /* @@ -2283,10 +2273,46 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, */ pos += written; count -= written; - } + written_buffered = generic_file_buffered_write(iocb, iov, + nr_segs, pos, ppos, count, + written); + /* + * If generic_file_buffered_write() retuned a synchronous error + * then we want to return the number of bytes which were + * direct-written, or the error code if that was zero. Note + * that this differs from normal direct-io semantics, which + * will return -EFOO even if some bytes were written. + */ + if (written_buffered < 0) { + err = written_buffered; + goto out; + } - written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, ppos, count, written); + /* + * We need to ensure that the page cache pages are written to + * disk and invalidated to preserve the expected O_DIRECT + * semantics. + */ + endbyte = pos + written_buffered - written - 1; + err = do_sync_file_range(file, pos, endbyte, + SYNC_FILE_RANGE_WAIT_BEFORE| + SYNC_FILE_RANGE_WRITE| + SYNC_FILE_RANGE_WAIT_AFTER); + if (err == 0) { + written = written_buffered; + invalidate_mapping_pages(mapping, + pos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); + } else { + /* + * We don't know how much we wrote, so just return + * the number of bytes which were direct-written + */ + } + } else { + written = generic_file_buffered_write(iocb, iov, nr_segs, + pos, ppos, count, written); + } out: current->backing_dev_info = NULL; return written ? written : err; @@ -2353,7 +2379,8 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; ssize_t retval; - size_t write_len = 0; + size_t write_len; + pgoff_t end = 0; /* silence gcc */ /* * If it's a write, unmap all mmappings of the file up-front. This @@ -2362,23 +2389,46 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, */ if (rw == WRITE) { write_len = iov_length(iov, nr_segs); + end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT; if (mapping_mapped(mapping)) unmap_mapping_range(mapping, offset, write_len, 0); } retval = filemap_write_and_wait(mapping); - if (retval == 0) { - retval = mapping->a_ops->direct_IO(rw, iocb, iov, - offset, nr_segs); - if (rw == WRITE && mapping->nrpages) { - pgoff_t end = (offset + write_len - 1) - >> PAGE_CACHE_SHIFT; - int err = invalidate_inode_pages2_range(mapping, + if (retval) + goto out; + + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). + */ + if (rw == WRITE && mapping->nrpages) { + retval = invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end); - if (err) - retval = err; - } + if (retval) + goto out; } + + retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); + if (retval) + goto out; + + /* + * Finally, try again to invalidate clean pages which might have been + * faulted in by get_user_pages() if the source of the write was an + * mmap()ed region of the file we're writing. That's a pretty crazy + * thing to do, so we don't support it 100%. If this invalidation + * fails and we have -EIOCBQUEUED we ignore the failure. + */ + if (rw == WRITE && mapping->nrpages) { + int err = invalidate_inode_pages2_range(mapping, + offset >> PAGE_CACHE_SHIFT, end); + if (err && retval >= 0) + retval = err; + } +out: return retval; }