2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu>
5 This program can be distributed under the terms of the GNU GPL.
11 #include <linux/init.h>
12 #include <linux/module.h>
13 #include <linux/poll.h>
14 #include <linux/uio.h>
15 #include <linux/miscdevice.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/slab.h>
20 MODULE_ALIAS_MISCDEV(FUSE_MINOR);
22 static kmem_cache_t *fuse_req_cachep;
/*
 * Look up the fuse_conn attached to an open device file.
 * file->private_data is read while holding fuse_lock, and the visible check
 * singles out a connection that is set but no longer mounted.
 * NOTE(review): the numbered listing has gaps here (24 -> 27, 29 -> 31,
 * 31 -> 35); the declaration of fc, the body of the if and the return
 * statement are not shown, so the value returned for an unmounted
 * connection has to be confirmed against the full source.
 */
24 static inline struct fuse_conn *fuse_get_conn(struct file *file)
27 spin_lock(&fuse_lock);
28 fc = file->private_data;
29 if (fc && !fc->mounted)
31 spin_unlock(&fuse_lock);
/*
 * Put a request into its initial state: the whole structure is zeroed,
 * its list head and wait queue are initialised, and the reference count
 * is set to 1.
 */
35 static inline void fuse_request_init(struct fuse_req *req)
37 memset(req, 0, sizeof(*req));
38 INIT_LIST_HEAD(&req->list);
39 init_waitqueue_head(&req->waitq);
40 atomic_set(&req->count, 1);
/*
 * Allocate a request from the fuse_req_cachep slab cache (SLAB_KERNEL
 * allocation) and initialise it with fuse_request_init().
 * NOTE(review): original lines 46 and 48 are absent from this listing;
 * presumably they hold a NULL check and the return -- confirm.
 */
43 struct fuse_req *fuse_request_alloc(void)
45 struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, SLAB_KERNEL);
47 fuse_request_init(req);
/* Return a request to the fuse_req_cachep slab cache. */
51 void fuse_request_free(struct fuse_req *req)
53 kmem_cache_free(fuse_req_cachep, req);
/*
 * Block signals for the current task, saving the previous mask in *oldset.
 * The mask is built as the inverse of sigmask(SIGKILL), so SIGKILL is the
 * one signal left out of the blocked set.
 * NOTE(review): the declaration of mask (original lines 57-59) is not
 * shown in this listing.
 */
56 static inline void block_sigs(sigset_t *oldset)
60 siginitsetinv(&mask, sigmask(SIGKILL));
61 sigprocmask(SIG_BLOCK, &mask, oldset);
/* Reinstate the signal mask previously saved in *oldset. */
64 static inline void restore_sigs(sigset_t *oldset)
66 sigprocmask(SIG_SETMASK, oldset, NULL);
/*
 * Re-initialise a request for reuse.  The caller must hold the only
 * reference (BUG otherwise).  fuse_request_init() zeroes the structure,
 * so the preallocated flag is saved beforehand and restored afterwards.
 */
69 void fuse_reset_request(struct fuse_req *req)
71 int preallocated = req->preallocated;
72 BUG_ON(atomic_read(&req->count) != 1);
73 fuse_request_init(req);
74 req->preallocated = preallocated;
/* Take an additional reference on the request. */
77 static void __fuse_get_request(struct fuse_req *req)
79 atomic_inc(&req->count);
/*
 * Drop a reference that is known not to be the last one.  The BUG_ON
 * enforces that at least two references exist, so this decrement never
 * brings the count to zero and never triggers a putback.
 */
82 /* Must be called with > 1 refcount */
83 static void __fuse_put_request(struct fuse_req *req)
85 BUG_ON(atomic_read(&req->count) < 2);
86 atomic_dec(&req->count);
/*
 * Take the first request off fc->unused_list (under fuse_lock; BUG if the
 * list is empty), re-initialise it, mark it preallocated and fill in the
 * caller's fsuid, fsgid and pid in the input header.
 * NOTE(review): the declaration of req and the return statement fall in
 * gaps of this listing (89 -> 93, 102 -> 106).
 */
89 static struct fuse_req *do_get_request(struct fuse_conn *fc)
93 spin_lock(&fuse_lock);
94 BUG_ON(list_empty(&fc->unused_list));
95 req = list_entry(fc->unused_list.next, struct fuse_req, list);
96 list_del_init(&req->list);
97 spin_unlock(&fuse_lock);
/* fuse_request_init() zeroes the request, so the flag is set afterwards */
98 fuse_request_init(req);
99 req->preallocated = 1;
100 req->in.h.uid = current->fsuid;
101 req->in.h.gid = current->fsgid;
102 req->in.h.pid = current->pid;
/*
 * Obtain a request, waiting on fc->outstanding_sem for a free slot.
 * Returns NULL when the wait was interrupted, otherwise the request
 * handed out by do_get_request().
 * NOTE(review): original lines 108-112 are missing from this listing;
 * presumably they declare intr/oldset and call block_sigs() -- confirm.
 */
106 /* This can return NULL, but only in case it's interrupted by a SIGKILL */
107 struct fuse_req *fuse_get_request(struct fuse_conn *fc)
113 intr = down_interruptible(&fc->outstanding_sem);
114 restore_sigs(&oldset);
115 return intr ? NULL : do_get_request(fc);
/*
 * Dispose of a request whose last reference has gone, under fuse_lock.
 * A preallocated request goes back on fc->unused_list; fuse_request_free()
 * is the other disposal path.  The outstanding accounting is then
 * adjusted: pending debt is reduced first, and up() on outstanding_sem is
 * the alternative.
 * NOTE(review): original lines 123 and 129 (presumably the `else` lines
 * pairing the two branches) are missing from this listing -- confirm.
 */
118 static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
120 spin_lock(&fuse_lock);
121 if (req->preallocated)
122 list_add(&req->list, &fc->unused_list);
124 fuse_request_free(req);
126 /* If we are in debt decrease that first */
127 if (fc->outstanding_debt)
128 fc->outstanding_debt--;
130 up(&fc->outstanding_sem);
131 spin_unlock(&fuse_lock);
/*
 * Drop one reference on the request; when it was the last one, hand the
 * request to fuse_putback_request().
 */
134 void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
136 if (atomic_dec_and_test(&req->count))
137 fuse_putback_request(fc, req);
/*
 * Release a background request.  The visible part unlinks the request
 * from the list it is on via bg_entry, under fuse_lock.
 * NOTE(review): original lines 141-145 are missing from this listing;
 * presumably they drop the object references held by the request --
 * confirm against the full source.
 */
140 void fuse_release_background(struct fuse_req *req)
146 spin_lock(&fuse_lock);
147 list_del(&req->bg_entry);
148 spin_unlock(&fuse_lock);
152 * This function is called when a request is finished. Either a reply
153 * has arrived or it was interrupted (and not yet sent) or some error
154 * occurred during communication with userspace, or the device file was
155 * closed. It decreases the reference count for the request. In case
156 * of a background request the references to the stored objects are
157 * released. The requester thread is woken up (if still waiting), and
158 * finally the request is either freed or put on the unused_list
160 * Called with fuse_lock, unlocks it
/*
 * Finish a request.  One reference is dropped (putback records whether it
 * was the last) and fuse_lock is released.  Background requests are
 * released while holding fc->sbput_sem for reading, then any waiter on
 * req->waitq is woken.  Two opcodes get extra handling: a FUSE_INIT reply
 * is version-checked and opens up the remaining outstanding slots, and a
 * FUSE_RELEASE with no inode is recycled into a FORGET for the same nodeid.
 * NOTE(review): this listing has gaps (e.g. 162 -> 166, 175 -> 178,
 * 178 -> 181, 194 -> 198); the declarations, the handling of a major
 * version mismatch and the condition guarding the final putback are not
 * visible here.
 */
162 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
166 putback = atomic_dec_and_test(&req->count);
167 spin_unlock(&fuse_lock);
168 if (req->background) {
169 down_read(&fc->sbput_sem);
171 fuse_release_background(req);
172 up_read(&fc->sbput_sem);
174 wake_up(&req->waitq);
175 if (req->in.h.opcode == FUSE_INIT) {
178 if (req->misc.init_in_out.major != FUSE_KERNEL_VERSION)
181 fc->minor = req->misc.init_in_out.minor;
183 /* After INIT reply is received other requests can go
184 out. So do (FUSE_MAX_OUTSTANDING - 1) number of
185 up()s on outstanding_sem. The last up() is done in
186 fuse_putback_request() */
187 for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
188 up(&fc->outstanding_sem);
189 } else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) {
190 /* Special case for failed iget in CREATE */
191 u64 nodeid = req->in.h.nodeid;
/* fuse_reset_request() insists on a refcount of exactly 1 */
192 __fuse_get_request(req);
193 fuse_reset_request(req);
194 fuse_send_forget(fc, req, nodeid, 1);
198 fuse_putback_request(fc, req);
202 * Unfortunately request interruption not only solves the deadlock
203 * problem, it causes problems too. These stem from the fact that an
204 * interrupted request continues to be processed in userspace,
205 * while all the locks and object references (inode and file) held
206 * during the operation are released.
208 * To release the locks is exactly why there's a need to interrupt the
209 * request, so there's not a lot that can be done about this, except
210 * introduce additional locking in userspace.
212 * More important is to keep inode and file references until userspace
213 * has replied, otherwise FORGET and RELEASE could be sent while the
214 * inode/file is still used by the filesystem.
216 * For this reason the concept of "background" request is introduced.
217 * An interrupted request is backgrounded if it has been already sent
218 * to userspace. Backgrounding involves getting an extra reference to
219 * inode(s) or file used in the request, and adding the request to
220 * fc->background list. When a reply is received for a background
221 * request, the object references are released, and the request is
222 * removed from the list. If the filesystem is unmounted while there
223 * are still background requests, the list is walked and references
224 * are released as if a reply was received.
226 * There's one more use for a background request. The RELEASE message is
227 * always sent as background, since it doesn't return an error or
/*
 * Turn a request into a background request: it is linked onto
 * fc->background through bg_entry, and references are taken on its
 * inode and inode2 with igrab().
 * NOTE(review): original lines 231-232, 234, 236 and 238-240 are missing
 * from this listing; whether the igrab() calls are conditional and what
 * else is pinned cannot be seen here -- confirm.
 */
230 static void background_request(struct fuse_conn *fc, struct fuse_req *req)
233 list_add(&req->bg_entry, &fc->background);
235 req->inode = igrab(req->inode);
237 req->inode2 = igrab(req->inode2);
/*
 * Wait for the reply to a request.  fuse_lock is dropped around an
 * interruptible wait for req->finished and retaken afterwards.  On the
 * interrupted path the request is marked with -EINTR and interrupted = 1,
 * and the function then waits uninterruptibly until the request is no
 * longer locked.  Finally an unsent request still on a list is unlinked
 * and its queue reference dropped, while a request that was sent but has
 * not finished is converted to a background request.
 * NOTE(review): this listing has gaps (247 -> 249, 251 -> 255, 256 -> 258,
 * 261 -> 263); the block_sigs() call, the conditions selecting the
 * interrupted path and the end of the inner comment are not visible.
 */
242 /* Called with fuse_lock held. Releases, and then reacquires it. */
243 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
247 spin_unlock(&fuse_lock);
249 wait_event_interruptible(req->waitq, req->finished);
250 restore_sigs(&oldset);
251 spin_lock(&fuse_lock);
255 req->out.h.error = -EINTR;
256 req->interrupted = 1;
258 /* This is uninterruptible sleep, because data is
259 being copied to/from the buffers of req. During
260 locked state, there mustn't be any filesystem
261 operation (e.g. page fault), since that could lead
263 spin_unlock(&fuse_lock);
264 wait_event(req->waitq, !req->locked);
265 spin_lock(&fuse_lock);
267 if (!req->sent && !list_empty(&req->list)) {
268 list_del(&req->list);
269 __fuse_put_request(req);
270 } else if (!req->finished && req->sent)
271 background_request(fc, req);
/*
 * Add up the size fields of the first numargs entries of args.
 * NOTE(review): the declarations of nbytes and i and the return statement
 * are in gaps of this listing (274 -> 279, 280 -> 285).
 */
274 static unsigned len_args(unsigned numargs, struct fuse_arg *args)
279 for (i = 0; i < numargs; i++)
280 nbytes += args[i].size;
/*
 * Stamp a request with a unique id and its total length (input header
 * plus all input arguments) and append it to fc->pending.  Requests that
 * are not preallocated still consume an outstanding slot, without
 * blocking: if the semaphore cannot be taken, the debt counter is raised
 * instead.
 * NOTE(review): original lines 286-287, 289-290 and 305-307 are missing
 * from this listing; how fc->reqctr is advanced (the "zero is special"
 * handling) and what follows the list insertion are not visible.
 */
285 static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
288 /* zero is special */
291 req->in.h.unique = fc->reqctr;
292 req->in.h.len = sizeof(struct fuse_in_header) +
293 len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
294 if (!req->preallocated) {
295 /* If request is not preallocated (either FORGET or
296 RELEASE), then still decrease outstanding_sem, so
297 user can't open infinite number of files while not
298 processing the RELEASE requests. However for
299 efficiency do it without blocking, so if down()
300 would block, just increase the debt instead */
301 if (down_trylock(&fc->outstanding_sem))
302 fc->outstanding_debt++;
304 list_add_tail(&req->list, &fc->pending);
309 * This can only be interrupted by a SIGKILL
/*
 * Send a request and wait for its answer, all under fuse_lock (which
 * request_wait_answer() drops and retakes internally).  Two failure cases
 * set the reply error directly: -ENOTCONN, and -ECONNREFUSED when
 * fc->conn_error is set.  Otherwise the request is queued, an extra
 * reference is taken and the caller waits in request_wait_answer().
 * NOTE(review): original lines 312-313, 315, 319, 324 and 326 are missing
 * from this listing; the condition for -ENOTCONN and the `else` structure
 * around the queueing path are not visible.
 */
311 void request_send(struct fuse_conn *fc, struct fuse_req *req)
314 spin_lock(&fuse_lock);
316 req->out.h.error = -ENOTCONN;
317 else if (fc->conn_error)
318 req->out.h.error = -ECONNREFUSED;
320 queue_request(fc, req);
321 /* acquire extra reference, since request is still needed
322 after request_end() */
323 __fuse_get_request(req);
325 request_wait_answer(fc, req);
327 spin_unlock(&fuse_lock);
/*
 * Queue a request without waiting for the reply.  The visible statements
 * show two outcomes: the request is queued under fuse_lock and the lock
 * released, or it is failed with -ENOTCONN and finished through
 * request_end(), which releases fuse_lock itself.
 * NOTE(review): original lines 333 and 336 are missing from this listing;
 * the condition choosing between the two outcomes is not visible.
 */
330 static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
332 spin_lock(&fuse_lock);
334 queue_request(fc, req);
335 spin_unlock(&fuse_lock);
337 req->out.h.error = -ENOTCONN;
338 request_end(fc, req);
/*
 * Send a request through request_send_nowait().
 * NOTE(review): original line 344 is missing from this listing;
 * presumably it marks the request as not expecting a reply -- confirm.
 */
342 void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
345 request_send_nowait(fc, req);
/*
 * Send a request as a background request: it is first registered with
 * background_request() under fuse_lock, then queued without waiting.
 * NOTE(review): original lines 349-350 are missing from this listing.
 */
348 void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
351 spin_lock(&fuse_lock);
352 background_request(fc, req);
353 spin_unlock(&fuse_lock);
354 request_send_nowait(fc, req);
/*
 * Build and send the FUSE_INIT request.  The fuse_init_in_out structure
 * embedded in the request carries the kernel's major and minor protocol
 * version, and the same structure is used as the single output argument,
 * so the reply overwrites it in place.  The request is sent as a
 * background request.
 * NOTE(review): original line 366 is missing from this listing;
 * presumably it sets req->in.numargs -- confirm.
 */
357 void fuse_send_init(struct fuse_conn *fc)
359 /* This is called from fuse_read_super() so there's guaranteed
360 to be a request available */
361 struct fuse_req *req = do_get_request(fc);
362 struct fuse_init_in_out *arg = &req->misc.init_in_out;
363 arg->major = FUSE_KERNEL_VERSION;
364 arg->minor = FUSE_KERNEL_MINOR_VERSION;
365 req->in.h.opcode = FUSE_INIT;
367 req->in.args[0].size = sizeof(*arg);
368 req->in.args[0].value = arg;
369 req->out.numargs = 1;
370 req->out.args[0].size = sizeof(*arg);
371 req->out.args[0].value = arg;
372 request_send_background(fc, req);
376 * Lock the request. Up to the next unlock_request() there mustn't be
377 * anything that could cause a page-fault. If the request was already
378 * interrupted bail out.
/*
 * Lock a request for copying.  req->interrupted is examined while holding
 * fuse_lock.
 * NOTE(review): original lines 381-383, 386-388 and 390-392 are missing
 * from this listing; the error returned for an interrupted request, the
 * setting of the locked flag, any handling of a NULL req and the return
 * are not visible.
 */
380 static inline int lock_request(struct fuse_req *req)
384 spin_lock(&fuse_lock);
385 if (req->interrupted)
389 spin_unlock(&fuse_lock);
395 * Unlock request. If it was interrupted while being locked, the
396 * requester thread is currently waiting for it to be unlocked, so
/*
 * Unlock a request after copying.  Under fuse_lock, a request that has
 * been interrupted gets its wait queue woken.
 * NOTE(review): original lines 400-401 and 403 are missing from this
 * listing; presumably they guard against a NULL req and clear the locked
 * flag -- confirm.
 */
399 static inline void unlock_request(struct fuse_req *req)
402 spin_lock(&fuse_lock);
404 if (req->interrupted)
405 wake_up(&req->waitq);
406 spin_unlock(&fuse_lock);
/*
 * Cursor used while copying a request to or from the userspace iovec.
 * Visible members: req (the request being copied, may be NULL -- see
 * fuse_dev_writev), iov/nr_segs (remaining iovec segments) and seglen
 * (bytes left in the current segment).
 * NOTE(review): original lines 411 and 416-421 are missing from this
 * listing; other code in this file also uses write, addr, pg, mapaddr,
 * buf and len members.
 */
410 struct fuse_copy_state {
412 struct fuse_req *req;
413 const struct iovec *iov;
414 unsigned long nr_segs;
415 unsigned long seglen;
/*
 * Initialise a copy state: the structure is zeroed and the segment count
 * recorded.
 * NOTE(review): original lines 428-430 are missing from this listing;
 * presumably they store write, req and iov -- confirm.
 */
423 static void fuse_copy_init(struct fuse_copy_state *cs, int write,
424 struct fuse_req *req, const struct iovec *iov,
425 unsigned long nr_segs)
427 memset(cs, 0, sizeof(*cs));
431 cs->nr_segs = nr_segs;
/*
 * Release the userspace page currently held by the copy state: the
 * KM_USER0 atomic mapping is undone, and the page's dcache is flushed and
 * the page marked dirty.
 * NOTE(review): original lines 436-437, 439 and 442-446 are missing from
 * this listing; the conditions guarding these calls and the final page
 * release are not visible.
 */
434 /* Unmap and put previous page of userspace buffer */
435 static inline void fuse_copy_finish(struct fuse_copy_state *cs)
438 kunmap_atomic(cs->mapaddr, KM_USER0);
440 flush_dcache_page(cs->pg);
441 set_page_dirty_lock(cs->pg);
449 * Get another pageful of userspace buffer, and map it to kernel
450 * address space, and lock request
/*
 * Advance the copy state to the next page of the userspace buffer.
 * The request is unlocked and the previous page released, then one page
 * at cs->addr is pinned with get_user_pages() while holding the current
 * task's mmap_sem for reading.  The page is mapped with KM_USER0, and
 * cs->buf / cs->len are set to the usable part of that page, limited by
 * what is left in the current iovec segment.  Returns the result of
 * lock_request(), so the request is locked again on success.
 * NOTE(review): this listing has gaps (458 -> 460, 462 -> 466, 467 -> 469,
 * 469 -> 473, 477 -> 480); the test that decides when a new iovec segment
 * is loaded, the iovec advance, the remaining get_user_pages() arguments
 * and its error handling are not visible.
 */
452 static int fuse_copy_fill(struct fuse_copy_state *cs)
454 unsigned long offset;
457 unlock_request(cs->req);
458 fuse_copy_finish(cs);
460 BUG_ON(!cs->nr_segs);
461 cs->seglen = cs->iov[0].iov_len;
462 cs->addr = (unsigned long) cs->iov[0].iov_base;
466 down_read(&current->mm->mmap_sem);
467 err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
469 up_read(&current->mm->mmap_sem);
/* offset of the user address within its page */
473 offset = cs->addr % PAGE_SIZE;
474 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
475 cs->buf = cs->mapaddr + offset;
476 cs->len = min(PAGE_SIZE - offset, cs->seglen);
477 cs->seglen -= cs->len;
480 return lock_request(cs->req);
/*
 * Copy at most min(*size, cs->len) bytes between the mapped userspace
 * page (cs->buf) and the kernel buffer *val.  One memcpy writes into
 * cs->buf, the other reads from it.
 * NOTE(review): original lines 485-486, 488-489, 491 and 493-499 are
 * missing from this listing; the direction test, the handling of a NULL
 * val, the cursor updates and the return value are not visible.
 */
483 /* Do as much copy to/from userspace buffer as we can */
484 static inline int fuse_copy_do(struct fuse_copy_state *cs, void **val,
487 unsigned ncpy = min(*size, cs->len);
490 memcpy(cs->buf, *val, ncpy);
492 memcpy(*val, cs->buf, ncpy);
502 * Copy a page in the request to/from the userspace buffer. Must be
/*
 * Copy count bytes between one page of the request and the userspace
 * buffer, starting at offset within the page.  When zeroing is requested
 * and less than a full page will be copied, the page is cleared first.
 * The userspace buffer is refilled with fuse_copy_fill() whenever it is
 * exhausted.  With a page present the data is copied through a KM_USER1
 * mapping; otherwise fuse_copy_do() is called with a NULL value pointer.
 * The page's dcache is flushed when a page is present and cs->write is
 * clear.
 * NOTE(review): original lines 507, 512-514, 516-517, 522, 524 and
 * 527-528 are missing from this listing; the loop construct, the
 * page/no-page branch structure and the return are not visible.
 */
505 static inline int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
506 unsigned offset, unsigned count, int zeroing)
508 if (page && zeroing && count < PAGE_SIZE) {
509 void *mapaddr = kmap_atomic(page, KM_USER1);
510 memset(mapaddr, 0, PAGE_SIZE);
511 kunmap_atomic(mapaddr, KM_USER1);
515 if (!cs->len && (err = fuse_copy_fill(cs)))
518 void *mapaddr = kmap_atomic(page, KM_USER1);
519 void *buf = mapaddr + offset;
520 offset += fuse_copy_do(cs, &buf, &count);
521 kunmap_atomic(mapaddr, KM_USER1);
523 offset += fuse_copy_do(cs, NULL, &count);
525 if (page && !cs->write)
526 flush_dcache_page(page);
/*
 * Copy nbytes of data between the request's page array and the userspace
 * buffer.  The first page starts at req->page_offset, so its chunk is
 * capped at PAGE_SIZE - offset; later chunks are capped at PAGE_SIZE.
 * The loop continues while bytes remain or zeroing is requested, up to
 * req->num_pages pages.
 * NOTE(review): original lines 532-534, 538, 542-545 and 547-550 are
 * missing from this listing; the error check, the nbytes/offset updates
 * and the return are not visible.
 */
530 /* Copy pages in the request to/from userspace buffer */
531 static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
535 struct fuse_req *req = cs->req;
536 unsigned offset = req->page_offset;
537 unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
539 for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
540 struct page *page = req->pages[i];
541 int err = fuse_copy_page(cs, page, offset, count, zeroing);
546 count = min(nbytes, (unsigned) PAGE_SIZE);
/*
 * Copy size bytes between the kernel buffer val and the userspace buffer,
 * refilling the userspace page with fuse_copy_fill() whenever cs->len
 * reaches zero.
 * NOTE(review): original lines 554-556, 558 and 560-562 are missing from
 * this listing; the loop construct and the return are not visible.
 */
552 /* Copy a single argument in the request to/from userspace buffer */
553 static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
557 if (!cs->len && (err = fuse_copy_fill(cs)))
559 fuse_copy_do(cs, &val, &size);
/*
 * Copy all arguments of a request, stopping at the first error.  When
 * argpages is set, the last argument is transferred through the request's
 * pages with fuse_copy_pages(); fuse_copy_one() is the copy used for
 * in-memory argument values.
 * NOTE(review): original lines 567-571, 576 and 578-580 are missing from
 * this listing; the remaining parameter, the `else` pairing the two copy
 * calls and the return are not visible.
 */
564 /* Copy request arguments to/from userspace buffer */
565 static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
566 unsigned argpages, struct fuse_arg *args,
572 for (i = 0; !err && i < numargs; i++) {
573 struct fuse_arg *arg = &args[i];
574 if (i == numargs - 1 && argpages)
575 err = fuse_copy_pages(cs, arg->size, zeroing);
577 err = fuse_copy_one(cs, arg->value, arg->size);
/*
 * Sleep until fc->pending is non-empty or the connection is no longer
 * mounted.  The task is added to fc->waitq as an exclusive waiter, sleeps
 * in TASK_INTERRUPTIBLE state, and checks for a pending signal on each
 * iteration.  fuse_lock is dropped and retaken around the sleep, so the
 * visible locking implies the caller holds fuse_lock.
 * NOTE(review): original lines 591-592, 594 and 596 are missing from this
 * listing; presumably they hold the break on a pending signal and the
 * schedule() call -- confirm.
 */
582 /* Wait until a request is available on the pending list */
583 static void request_wait(struct fuse_conn *fc)
585 DECLARE_WAITQUEUE(wait, current);
587 add_wait_queue_exclusive(&fc->waitq, &wait);
588 while (fc->mounted && list_empty(&fc->pending)) {
589 set_current_state(TASK_INTERRUPTIBLE);
590 if (signal_pending(current))
593 spin_unlock(&fuse_lock);
595 spin_lock(&fuse_lock);
597 set_current_state(TASK_RUNNING);
598 remove_wait_queue(&fc->waitq, &wait);
602 * Read a single request into the userspace filesystem's buffer. This
603 * function waits until a request is available, then removes it from
604 * the pending list and copies request data to userspace buffer. If
605 * no reply is needed (FORGET) or request has been interrupted or
606 * there was an error during the copying then it's finished by calling
607 * request_end(). Otherwise add it to the processing list, and set
/*
 * readv() handler for the FUSE device: hand one pending request to the
 * userspace filesystem.  Under fuse_lock the first request is taken off
 * fc->pending.  If the supplied iovec is too small for the request, the
 * request is failed with -EIO (-E2BIG for FUSE_SETXATTR).  Otherwise the
 * header and arguments are copied out with fuse_lock dropped, and the
 * lock is retaken to decide the request's fate: it is either finished
 * through request_end() or moved to fc->processing to await a reply.
 * NOTE(review): this listing has many gaps (e.g. 611 -> 614, 622 -> 631,
 * 635 -> 639, 645 -> 648, 657 -> 660, 662 -> 666, 670 -> 675); the
 * restart label, the waiting step, the error codes returned, the
 * conditions between the request_end() calls and the return values are
 * not visible here.
 */
610 static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
611 unsigned long nr_segs, loff_t *off)
614 struct fuse_conn *fc;
615 struct fuse_req *req;
617 struct fuse_copy_state cs;
621 spin_lock(&fuse_lock);
622 fc = file->private_data;
631 if (list_empty(&fc->pending))
634 req = list_entry(fc->pending.next, struct fuse_req, list);
635 list_del_init(&req->list);
639 /* If request is too large, reply with an error and restart the read */
640 if (iov_length(iov, nr_segs) < reqsize) {
641 req->out.h.error = -EIO;
642 /* SETXATTR is special, since it may contain too large data */
643 if (in->h.opcode == FUSE_SETXATTR)
644 req->out.h.error = -E2BIG;
645 request_end(fc, req);
/* copying may fault, so it is done without fuse_lock held */
648 spin_unlock(&fuse_lock);
649 fuse_copy_init(&cs, 1, req, iov, nr_segs);
650 err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
652 err = fuse_copy_args(&cs, in->numargs, in->argpages,
653 (struct fuse_arg *) in->args, 0);
654 fuse_copy_finish(&cs);
655 spin_lock(&fuse_lock);
657 if (!err && req->interrupted)
660 if (!req->interrupted)
661 req->out.h.error = -EIO;
662 request_end(fc, req);
666 request_end(fc, req);
669 list_add_tail(&req->list, &fc->processing);
670 spin_unlock(&fuse_lock);
675 spin_unlock(&fuse_lock);
/*
 * read() handler for the FUSE device: wraps the single user buffer in a
 * one-element iovec and delegates to fuse_dev_readv().
 * NOTE(review): the declaration of iov and the assignment of iov_base
 * (original lines 681-682, 684) are missing from this listing.
 */
679 static ssize_t fuse_dev_read(struct file *file, char __user *buf,
680 size_t nbytes, loff_t *off)
683 iov.iov_len = nbytes;
685 return fuse_dev_readv(file, &iov, 1, off);
/*
 * Walk fc->processing looking for the request whose input header carries
 * the given unique id.
 * NOTE(review): original lines 697-699 are missing from this listing;
 * the return on a match and the not-found result are not visible.
 */
688 /* Look up request on processing list by unique ID */
689 static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
691 struct list_head *entry;
693 list_for_each(entry, &fc->processing) {
694 struct fuse_req *req;
695 req = list_entry(entry, struct fuse_req, list);
696 if (req->in.h.unique == unique)
/*
 * Validate the size of a reply and copy its arguments into the request.
 * A header-only path yields -EINVAL unless nbytes equals the size of the
 * output header.  Otherwise the expected size is the header plus all
 * output arguments: a reply longer than expected, or shorter while
 * out->argvar is clear, is singled out by the first check; a shorter
 * reply with argvar set has the last argument's size reduced by the
 * shortfall.  The arguments are then copied with fuse_copy_args().
 * NOTE(review): original lines 703-704, 706-707, 713, 718 and 722 are
 * missing from this listing; the condition selecting the header-only
 * path, the error codes of the rejecting branches and the final argument
 * of fuse_copy_args() are not visible.
 */
702 static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
705 unsigned reqsize = sizeof(struct fuse_out_header);
708 return nbytes != reqsize ? -EINVAL : 0;
710 reqsize += len_args(out->numargs, out->args);
712 if (reqsize < nbytes || (reqsize > nbytes && !out->argvar))
714 else if (reqsize > nbytes) {
715 struct fuse_arg *lastarg = &out->args[out->numargs-1];
716 unsigned diffsize = reqsize - nbytes;
717 if (diffsize > lastarg->size)
719 lastarg->size -= diffsize;
721 return fuse_copy_args(cs, out->numargs, out->argpages, out->args,
726 * Write a single reply to a request. First the header is copied from
727 * the write buffer. The request is then searched on the processing
728 * list by the unique ID found in the header. If found, then remove
729 * it from the list and copy the rest of the buffer to the request.
730 * The request is finished by calling request_end()
/*
 * writev() handler for the FUSE device: accept one reply from the
 * userspace filesystem.  The copy state starts with no request attached
 * (NULL), and the reply header is copied in and sanity-checked (non-zero
 * unique id, error within (-1000, 0]).  Under fuse_lock the matching
 * request is looked up on the processing list and unlinked.  A request
 * already marked interrupted is finished immediately; otherwise the
 * output arguments are copied with fuse_lock dropped, the lock is retaken,
 * and the request is finished through request_end(), with -EIO recorded
 * when the copy did not succeed and the request was not interrupted.
 * Returns the error, or nbytes on success.
 * NOTE(review): this listing has many gaps (e.g. 733 -> 736, 740 -> 744,
 * 745 -> 748, 752 -> 756, 757 -> 762, 765 -> 771, 776 -> 779); the
 * handling of a missing connection or request, the rest of the header
 * check, the attachment of req to the copy state and the error labels are
 * not visible here.
 */
732 static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
733 unsigned long nr_segs, loff_t *off)
736 unsigned nbytes = iov_length(iov, nr_segs);
737 struct fuse_req *req;
738 struct fuse_out_header oh;
739 struct fuse_copy_state cs;
740 struct fuse_conn *fc = fuse_get_conn(file);
744 fuse_copy_init(&cs, 0, NULL, iov, nr_segs);
745 if (nbytes < sizeof(struct fuse_out_header))
748 err = fuse_copy_one(&cs, &oh, sizeof(oh));
752 if (!oh.unique || oh.error <= -1000 || oh.error > 0 ||
756 spin_lock(&fuse_lock);
757 req = request_find(fc, oh.unique);
762 list_del_init(&req->list);
763 if (req->interrupted) {
/* request_end() releases fuse_lock */
764 request_end(fc, req);
765 fuse_copy_finish(&cs);
771 spin_unlock(&fuse_lock);
773 err = copy_out_args(&cs, &req->out, nbytes);
774 fuse_copy_finish(&cs);
776 spin_lock(&fuse_lock);
779 if (req->interrupted)
781 } else if (!req->interrupted)
782 req->out.h.error = -EIO;
783 request_end(fc, req);
785 return err ? err : nbytes;
788 spin_unlock(&fuse_lock);
790 fuse_copy_finish(&cs);
/*
 * write() handler for the FUSE device: wraps the single user buffer in a
 * one-element iovec and delegates to fuse_dev_writev().
 * NOTE(review): the declaration of iov (original lines 796-797) is
 * missing from this listing.
 */
794 static ssize_t fuse_dev_write(struct file *file, const char __user *buf,
795 size_t nbytes, loff_t *off)
798 iov.iov_len = nbytes;
799 iov.iov_base = (char __user *) buf;
800 return fuse_dev_writev(file, &iov, 1, off);
/*
 * poll() handler for the FUSE device.  The mask starts out as writable
 * (POLLOUT | POLLWRNORM); readability (POLLIN | POLLRDNORM) is added when
 * fc->pending is non-empty, checked under fuse_lock after registering on
 * fc->waitq.
 * NOTE(review): original lines 807-810, 812 and 817-818 are missing from
 * this listing; the handling of a missing connection and the return are
 * not visible.
 */
803 static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
805 struct fuse_conn *fc = fuse_get_conn(file);
806 unsigned mask = POLLOUT | POLLWRNORM;
811 poll_wait(file, &fc->waitq, wait);
813 spin_lock(&fuse_lock);
814 if (!list_empty(&fc->pending))
815 mask |= POLLIN | POLLRDNORM;
816 spin_unlock(&fuse_lock);
/*
 * Fail every request on the given list with -ECONNABORTED.  Each request
 * is unlinked and finished through request_end(); because request_end()
 * releases fuse_lock, the lock is retaken before the next iteration, so
 * the function is entered and left with fuse_lock held.
 */
821 /* Abort all requests on the given list (pending or processing) */
822 static void end_requests(struct fuse_conn *fc, struct list_head *head)
824 while (!list_empty(head)) {
825 struct fuse_req *req;
826 req = list_entry(head->next, struct fuse_req, list);
827 list_del_init(&req->list);
828 req->out.h.error = -ECONNABORTED;
829 request_end(fc, req);
830 spin_lock(&fuse_lock);
/*
 * release() handler for the FUSE device.  Under fuse_lock, all pending
 * and processing requests of the connection stored in file->private_data
 * are aborted and the connection is released with fuse_release_conn().
 * NOTE(review): original lines 840-841, 845 and 847-848 are missing from
 * this listing; the NULL check on fc, any state cleared on the connection
 * and the return value are not visible.
 */
834 static int fuse_dev_release(struct inode *inode, struct file *file)
836 struct fuse_conn *fc;
838 spin_lock(&fuse_lock);
839 fc = file->private_data;
842 end_requests(fc, &fc->pending);
843 end_requests(fc, &fc->processing);
844 fuse_release_conn(fc);
846 spin_unlock(&fuse_lock);
/*
 * File operations of the FUSE device: the read/readv, write/writev, poll
 * and release handlers defined in this file.
 * NOTE(review): original line 852 is missing from this listing, so one
 * member of the initializer is not visible.
 */
850 struct file_operations fuse_dev_operations = {
851 .owner = THIS_MODULE,
853 .read = fuse_dev_read,
854 .readv = fuse_dev_readv,
855 .write = fuse_dev_write,
856 .writev = fuse_dev_writev,
857 .poll = fuse_dev_poll,
858 .release = fuse_dev_release,
/*
 * Misc device descriptor for the FUSE device, wired to
 * fuse_dev_operations.
 * NOTE(review): original lines 862-863 are missing from this listing;
 * presumably they set the minor number and the device name -- confirm.
 */
861 static struct miscdevice fuse_miscdevice = {
864 .fops = &fuse_dev_operations,
/*
 * Module-init part for the device: creates the "fuse_request" slab cache
 * sized for struct fuse_req, then registers the misc device.  The
 * out_cache_clean path destroys the cache again.
 * NOTE(review): original lines 868-869, 872, 874-875, 877 and 879-882 are
 * missing from this listing; the remaining kmem_cache_create() arguments,
 * the error codes and the success return are not visible.
 */
867 int __init fuse_dev_init(void)
870 fuse_req_cachep = kmem_cache_create("fuse_request",
871 sizeof(struct fuse_req),
873 if (!fuse_req_cachep)
876 err = misc_register(&fuse_miscdevice);
878 goto out_cache_clean;
883 kmem_cache_destroy(fuse_req_cachep);
/*
 * Undo fuse_dev_init(): deregister the misc device, then destroy the
 * request slab cache.
 */
888 void fuse_dev_cleanup(void)
890 misc_deregister(&fuse_miscdevice);
891 kmem_cache_destroy(fuse_req_cachep);