From: Miklos Szeredi This adds lots of documentation for the userspace - kernel interface. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton --- 25-akpm/Documentation/filesystems/fuse.txt | 130 +++++++++++++++++++++++++++++ 25-akpm/fs/fuse/dev.c | 83 +++++++++++++++--- 2 files changed, 201 insertions(+), 12 deletions(-) diff -puN /dev/null Documentation/filesystems/fuse.txt --- /dev/null Thu Apr 11 07:25:15 2002 +++ 25-akpm/Documentation/filesystems/fuse.txt Thu Mar 31 15:08:15 2005 @@ -0,0 +1,130 @@ +The following diagram shows how a filesystem operation (in this +example unlink) is performed in FUSE. + +NOTE: everything in this description is greatly simplified + + | "rm /mnt/fuse/file" | FUSE filesystem daemon + | | + | | >sys_read() + | | >fuse_dev_read() + | | >request_wait() + | | [sleep on fc->waitq] + | | + | >sys_unlink() | + | >fuse_unlink() | + | [get request from | + | fc->unused_list] | + | >request_send() | + | [queue req on fc->pending] | + | [wake up fc->waitq] | [woken up] + | >request_wait_answer() | + | [sleep on req->waitq] | + | | <request_wait() + | | [remove req from fc->pending] + | | [copy req to read buffer] + | | [add req to fc->processing] + | | <fuse_dev_read() + | | <sys_read() + | | + | | [perform unlink] + | | + | | >sys_write() + | | >fuse_dev_write() + | | [look up req in fc->processing] + | | [remove from fc->processing] + | | [copy write buffer to req] + | [woken up] | [wake up req->waitq] + | | <fuse_dev_write() + | | <sys_write() + | <request_wait_answer() | + | <request_send() | + | [add request to | + | fc->unused_list] | + | <fuse_unlink() | + | <sys_unlink() | + +There are a couple of ways in which to deadlock a FUSE filesystem. +Since we are talking about unprivileged userspace programs, +something must be done about these. + +Scenario 1 - Simple deadlock +----------------------------- + + | "rm /mnt/fuse/file" | FUSE filesystem daemon + | | + | >sys_unlink("/mnt/fuse/file") | + | [acquire inode semaphore | + | for "file"] | + | >fuse_unlink() | + | [sleep on req->waitq] | + | | <sys_read() + | | >sys_unlink("/mnt/fuse/file") + | | [acquire inode semaphore + | | for "file"] + | | *DEADLOCK* + +The solution for this is to allow requests to be interrupted while +they are in userspace: + + | [interrupted by signal] | + | <fuse_unlink() | + | [release semaphore] | [semaphore acquired] + | <sys_unlink() | + | | >fuse_unlink() + | | [queue req on fc->pending] + | | [wake up fc->waitq] + | | [sleep on req->waitq] + +If the filesystem daemon was single threaded, this will stop here, +since there's no other thread to dequeue and execute the request.
+In this case the solution is to kill the FUSE daemon as well. If +there are multiple serving threads, you just have to kill them as +long as any remain. + +Moral: a filesystem which deadlocks, can soon find itself dead. + +Scenario 2 - Tricky deadlock +---------------------------- + +This one needs a carefully crafted filesystem. It's a variation on +the above, only the call back to the filesystem is not explicit, +but is caused by a pagefault. + + | Kamikaze filesystem thread 1 | Kamikaze filesystem thread 2 + | | + | [fd = open("/mnt/fuse/file")] | [request served normally] + | [mmap fd to 'addr'] | + | [close fd] | [FLUSH triggers 'magic' flag] + | [read a byte from addr] | + | >do_page_fault() | + | [find or create page] | + | [lock page] | + | >fuse_readpage() | + | [queue READ request] | + | [sleep on req->waitq] | + | | [read request to buffer] + | | [create reply header before addr] + | | >sys_write(addr - headerlength) + | | >fuse_dev_write() + | | [look up req in fc->processing] + | | [remove from fc->processing] + | | [copy write buffer to req] + | | >do_page_fault() + | | [find or create page] + | | [lock page] + | | * DEADLOCK * + +Solution is again to let the request be interrupted (not +elaborated further). + +An additional problem is that while the write buffer is being +copied to the request, the request must not be interrupted. This +is because the destination address of the copy may not be valid +after the request is interrupted. + +This is solved by doing the copy atomically, and allowing +interruption while the page(s) belonging to the write buffer are +faulted with get_user_pages(). The 'req->locked' flag indicates +when the copy is taking place, and interruption is delayed until +this flag is unset.
+ diff -puN fs/fuse/dev.c~fuse-device-functions-comments-and-documentation fs/fuse/dev.c --- 25/fs/fuse/dev.c~fuse-device-functions-comments-and-documentation Thu Mar 31 15:08:15 2005 +++ 25-akpm/fs/fuse/dev.c Thu Mar 31 15:08:15 2005 @@ -108,6 +108,11 @@ struct fuse_req *fuse_get_request(struct return do_get_request(fc); } +/* + * Non-interruptible version of the above function is for operations + * which can't legally return -ERESTART{SYS,NOINTR}. This can still + * return NULL, but only in case the signal is SIGKILL. + */ struct fuse_req *fuse_get_request_nonint(struct fuse_conn *fc) { int intr; @@ -127,6 +132,7 @@ static void fuse_putback_request(struct else fuse_request_free(req); + /* If we are in debt decrease that first */ if (fc->outstanding_debt) fc->outstanding_debt--; else @@ -140,7 +146,17 @@ void fuse_put_request(struct fuse_conn * fuse_putback_request(fc, req); } -/* Called with fuse_lock, unlocks it */ +/* + * This function is called when a request is finished. Either a reply + * has arrived or it was interrupted (and not yet sent) or some error + * occurred during communication with userspace, or the device file was + * closed. It decreases the reference count for the request. In case + * of a background request the references to the stored objects are + * released. The requester thread is woken up (if still waiting), and + * finally the request is either freed or put on the unused_list + * + * Called with fuse_lock, unlocks it + */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) { int putback; @@ -226,7 +242,7 @@ static void request_wait_answer(struct f if (req->locked) { /* This is uninterruptible sleep, because data is being copied to/from the buffers of req. During - locked state, there musn't be any filesystem + locked state, there mustn't be any filesystem operation (e.g.
page fault), since that could lead to deadlock */ spin_unlock(&fuse_lock); @@ -261,7 +277,12 @@ static void queue_request(struct fuse_co req->in.h.len = sizeof(struct fuse_in_header) + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); if (!req->preallocated) { - /* decrease outstanding_sem, but without blocking... */ + /* If request is not preallocated (either FORGET or + RELEASE), then still decrease outstanding_sem, so + the user can't open an infinite number of files while not + processing the RELEASE requests. However for + efficiency do it without blocking, so if down() + would block, just increase the debt instead */ if (down_trylock(&fc->outstanding_sem)) fc->outstanding_debt++; } @@ -294,6 +315,11 @@ void request_send(struct fuse_conn *fc, request_send_wait(fc, req, 1); } +/* + * Non-interruptible version of the above function is for operations + * which can't legally return -ERESTART{SYS,NOINTR}. This can still + * be interrupted but only with SIGKILL. + */ void request_send_nonint(struct fuse_conn *fc, struct fuse_req *req) { request_send_wait(fc, req, 0); @@ -342,6 +368,11 @@ void fuse_send_init(struct fuse_conn *fc request_send_background(fc, req); } +/* + * Lock the request. Up to the next unlock_request() there mustn't be + * anything that could cause a page-fault. If the request was already + * interrupted bail out. + */ static inline int lock_request(struct fuse_req *req) { int err = 0; @@ -356,6 +387,11 @@ static inline int lock_request(struct fu return err; } +/* + * Unlock request. If it was interrupted while it was locked, the + * requester thread is currently waiting for it to be unlocked, so + * wake it up. + */ static inline void unlock_request(struct fuse_req *req) { if (req) { @@ -367,15 +403,6 @@ static inline void unlock_request(struct } } -/* Why all this complex one-page-at-a-time copying needed instead of - just copy_to/from_user()? The reason is that blocking on a page - fault must be avoided while the request is locked.
This is because - if servicing that pagefault happens to be done by this filesystem, - an unbreakable deadlock can occur. So the code is careful to allow - request interruption during get_user_pages(), and only lock the - request while doing kmapped copying, which cannot block. - */ - struct fuse_copy_state { int write; struct fuse_req *req; @@ -409,6 +436,7 @@ static unsigned fuse_copy_init(struct fu return nbytes; } +/* Unmap and put previous page of userspace buffer */ static inline void fuse_copy_finish(struct fuse_copy_state *cs) { if (cs->mapaddr) { @@ -422,6 +450,10 @@ static inline void fuse_copy_finish(stru } } +/* + * Get another pageful of userspace buffer, and map it to kernel + * address space, and lock request + */ static int fuse_copy_fill(struct fuse_copy_state *cs) { unsigned long offset; @@ -453,6 +485,7 @@ static int fuse_copy_fill(struct fuse_co return lock_request(cs->req); } +/* Do as much copying to/from userspace buffer as we can */ static inline int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) { @@ -470,6 +503,10 @@ static inline int fuse_copy_do(struct fu return ncpy; } +/* + * Copy a page in the request to/from the userspace buffer.
Must be + * done atomically + */ static inline int fuse_copy_page(struct fuse_copy_state *cs, struct page *page, unsigned offset, unsigned count, int zeroing) { @@ -495,6 +532,7 @@ static inline int fuse_copy_page(struct return 0; } +/* Copy pages in the request to/from userspace buffer */ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, int zeroing) { @@ -516,6 +554,7 @@ static int fuse_copy_pages(struct fuse_c return 0; } +/* Copy a single argument in the request to/from userspace buffer */ static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) { while (size) { @@ -527,6 +566,7 @@ static int fuse_copy_one(struct fuse_cop return 0; } +/* Copy request arguments to/from userspace buffer */ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, unsigned argpages, struct fuse_arg *args, int zeroing) @@ -544,6 +584,7 @@ static int fuse_copy_args(struct fuse_co return err; } +/* Wait until a request is available on the pending list */ static void request_wait(struct fuse_conn *fc) { DECLARE_WAITQUEUE(wait, current); @@ -562,6 +603,15 @@ static void request_wait(struct fuse_con remove_wait_queue(&fc->waitq, &wait); } +/* + * Read a single request into the userspace filesystem's buffer. This + * function waits until a request is available, then removes it from + * the pending list and copies request data to userspace buffer. If + * no reply is needed (FORGET) or request has been interrupted or + * there was an error during the copying then it's finished by calling + * request_end(). Otherwise add it to the processing list, and set + * the 'sent' flag. 
+ */ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *off) { @@ -635,6 +685,7 @@ static ssize_t fuse_dev_read(struct file return fuse_dev_readv(file, &iov, 1, off); } +/* Look up request on processing list by unique ID */ static struct fuse_req *request_find(struct fuse_conn *fc, unsigned unique) { struct list_head *entry; @@ -671,6 +722,13 @@ static int copy_out_args(struct fuse_cop out->page_zeroing); } +/* + * Write a single reply to a request. First the header is copied from + * the write buffer. The request is then searched on the processing + * list by the unique ID found in the header. If found, then remove + * it from the list and copy the rest of the buffer to the request. + * The request is finished by calling request_end() + */ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *off) { @@ -760,6 +818,7 @@ static unsigned fuse_dev_poll(struct fil return mask; } +/* Abort all requests on the given list (pending or processing) */ static void end_requests(struct fuse_conn *fc, struct list_head *head) { while (!list_empty(head)) { _