X-Git-Url: http://pilppa.org/gitweb/gitweb.cgi?a=blobdiff_plain;f=net%2Fsunrpc%2Fsvcsock.c;h=d95a0c894d4fcc8eb3c35e7df28610aa5e77c31f;hb=f6150c3cab6e788afacb07470be3c6b4a722f1ed;hp=e6bb1b0563ec50af63c61facd1eaba156afcb796;hpb=d7c9f1ed972b4a468dd24a2457721704dfe9ca70;p=linux-2.6-omap-h63xx.git diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index e6bb1b0563e..d95a0c894d4 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -5,7 +5,7 @@ * * The server scheduling algorithm does not always distribute the load * evenly when servicing a single client. May need to modify the - * svc_sock_enqueue procedure... + * svc_xprt_enqueue procedure... * * TCP support is largely untested and may be a little slow. The problem * is that we currently do two separate recvfrom's, one for the 4-byte @@ -56,22 +56,23 @@ * BKL protects svc_serv->sv_nrthread. * svc_sock->sk_lock protects the svc_sock->sk_deferred list * and the ->sk_info_authunix cache. - * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply. + * svc_sock->sk_xprt.xpt_flags.XPT_BUSY prevents a svc_sock being + * enqueued multiply. * * Some flags can be set to certain values at any time * providing that certain rules are followed: * - * SK_CONN, SK_DATA, can be set or cleared at any time. - * after a set, svc_sock_enqueue must be called. + * XPT_CONN, XPT_DATA, can be set or cleared at any time. + * after a set, svc_xprt_enqueue must be called. * after a clear, the socket must be read/accepted * if this succeeds, it must be set again. - * SK_CLOSE can set at any time. It is never cleared. - * sk_inuse contains a bias of '1' until SK_DEAD is set. - * so when sk_inuse hits zero, we know the socket is dead + * XPT_CLOSE can set at any time. It is never cleared. + * xpt_ref contains a bias of '1' until XPT_DEAD is set. + * so when xprt_ref hits zero, we know the transport is dead * and no-one is using it. - * SK_DEAD can only be set while SK_BUSY is held which ensures + * XPT_DEAD can only be set while XPT_BUSY is held which ensures * no other thread will be using the socket or will try to - * set SK_DEAD. + * set XPT_DEAD. * */ @@ -80,11 +81,11 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, int *errp, int flags); -static void svc_delete_socket(struct svc_sock *svsk); +static void svc_delete_xprt(struct svc_xprt *xprt); static void svc_udp_data_ready(struct sock *, int); static int svc_udp_recvfrom(struct svc_rqst *); static int svc_udp_sendto(struct svc_rqst *); -static void svc_close_socket(struct svc_sock *svsk); +static void svc_close_xprt(struct svc_xprt *xprt); static void svc_sock_detach(struct svc_xprt *); static void svc_sock_free(struct svc_xprt *); @@ -211,22 +212,21 @@ static void svc_release_skb(struct svc_rqst *rqstp) * processes, wake 'em up. * */ -static void -svc_sock_enqueue(struct svc_sock *svsk) +void svc_xprt_enqueue(struct svc_xprt *xprt) { - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = xprt->xpt_server; struct svc_pool *pool; struct svc_rqst *rqstp; int cpu; - if (!(svsk->sk_flags & - ( (1<xpt_flags & + ((1<sk_flags)) + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) return; cpu = get_cpu(); - pool = svc_pool_for_cpu(svsk->sk_server, cpu); + pool = svc_pool_for_cpu(xprt->xpt_server, cpu); put_cpu(); spin_lock_bh(&pool->sp_lock); @@ -234,41 +234,43 @@ svc_sock_enqueue(struct svc_sock *svsk) if (!list_empty(&pool->sp_threads) && !list_empty(&pool->sp_sockets)) printk(KERN_ERR - "svc_sock_enqueue: threads and sockets both waiting??\n"); + "svc_xprt_enqueue: " + "threads and transports both waiting??\n"); - if (test_bit(SK_DEAD, &svsk->sk_flags)) { + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) { /* Don't enqueue dead sockets */ - dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk); + dprintk("svc: transport %p is dead, not enqueued\n", xprt); goto out_unlock; } /* Mark socket as busy. It will remain in this state until the * server has processed all pending data and put the socket back - * on the idle list. We update SK_BUSY atomically because + * on the idle list. We update XPT_BUSY atomically because * it also guards against trying to enqueue the svc_sock twice. */ - if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) { + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) { /* Don't enqueue socket while already enqueued */ - dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); + dprintk("svc: transport %p busy, not enqueued\n", xprt); goto out_unlock; } - BUG_ON(svsk->sk_pool != NULL); - svsk->sk_pool = pool; + BUG_ON(xprt->xpt_pool != NULL); + xprt->xpt_pool = pool; /* Handle pending connection */ - if (test_bit(SK_CONN, &svsk->sk_flags)) + if (test_bit(XPT_CONN, &xprt->xpt_flags)) goto process; /* Handle close in-progress */ - if (test_bit(SK_CLOSE, &svsk->sk_flags)) + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) goto process; /* Check if we have space to reply to a request */ - if (!svsk->sk_xprt.xpt_ops->xpo_has_wspace(&svsk->sk_xprt)) { + if (!xprt->xpt_ops->xpo_has_wspace(xprt)) { /* Don't enqueue while not enough space for reply */ - dprintk("svc: no write space, socket %p not enqueued\n", svsk); - svsk->sk_pool = NULL; - clear_bit(SK_BUSY, &svsk->sk_flags); + dprintk("svc: no write space, transport %p not enqueued\n", + xprt); + xprt->xpt_pool = NULL; + clear_bit(XPT_BUSY, &xprt->xpt_flags); goto out_unlock; } @@ -277,28 +279,29 @@ svc_sock_enqueue(struct svc_sock *svsk) rqstp = list_entry(pool->sp_threads.next, struct svc_rqst, rq_list); - dprintk("svc: socket %p served by daemon %p\n", - svsk->sk_sk, rqstp); + dprintk("svc: transport %p served by daemon %p\n", + xprt, rqstp); svc_thread_dequeue(pool, rqstp); - if (rqstp->rq_sock) + if (rqstp->rq_xprt) printk(KERN_ERR - "svc_sock_enqueue: server %p, rq_sock=%p!\n", - rqstp, rqstp->rq_sock); - rqstp->rq_sock = svsk; - atomic_inc(&svsk->sk_inuse); + "svc_xprt_enqueue: server %p, rq_xprt=%p!\n", + rqstp, rqstp->rq_xprt); + rqstp->rq_xprt = xprt; + svc_xprt_get(xprt); rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); - BUG_ON(svsk->sk_pool != pool); + atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + BUG_ON(xprt->xpt_pool != pool); wake_up(&rqstp->rq_wait); } else { - dprintk("svc: socket %p put into queue\n", svsk->sk_sk); - list_add_tail(&svsk->sk_ready, &pool->sp_sockets); - BUG_ON(svsk->sk_pool != pool); + dprintk("svc: transport %p put into queue\n", xprt); + list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); + BUG_ON(xprt->xpt_pool != pool); } out_unlock: spin_unlock_bh(&pool->sp_lock); } +EXPORT_SYMBOL_GPL(svc_xprt_enqueue); /* * Dequeue the first socket. Must be called with the pool->sp_lock held. @@ -312,11 +315,11 @@ svc_sock_dequeue(struct svc_pool *pool) return NULL; svsk = list_entry(pool->sp_sockets.next, - struct svc_sock, sk_ready); - list_del_init(&svsk->sk_ready); + struct svc_sock, sk_xprt.xpt_ready); + list_del_init(&svsk->sk_xprt.xpt_ready); dprintk("svc: socket %p dequeued, inuse=%d\n", - svsk->sk_sk, atomic_read(&svsk->sk_inuse)); + svsk->sk_sk, atomic_read(&svsk->sk_xprt.xpt_ref.refcount)); return svsk; } @@ -324,15 +327,15 @@ svc_sock_dequeue(struct svc_pool *pool) /* * Having read something from a socket, check whether it * needs to be re-enqueued. - * Note: SK_DATA only gets cleared when a read-attempt finds + * Note: XPT_DATA only gets cleared when a read-attempt finds * no (or insufficient) data. */ static inline void svc_sock_received(struct svc_sock *svsk) { - svsk->sk_pool = NULL; - clear_bit(SK_BUSY, &svsk->sk_flags); - svc_sock_enqueue(svsk); + svsk->sk_xprt.xpt_pool = NULL; + clear_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } @@ -351,24 +354,11 @@ void svc_reserve(struct svc_rqst *rqstp, int space) space += rqstp->rq_res.head[0].iov_len; if (space < rqstp->rq_reserved) { - struct svc_sock *svsk = rqstp->rq_sock; - atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved); + struct svc_xprt *xprt = rqstp->rq_xprt; + atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved); rqstp->rq_reserved = space; - svc_sock_enqueue(svsk); - } -} - -/* - * Release a socket after use. - */ -static inline void -svc_sock_put(struct svc_sock *svsk) -{ - if (atomic_dec_and_test(&svsk->sk_inuse)) { - BUG_ON(!test_bit(SK_DEAD, &svsk->sk_flags)); - module_put(svsk->sk_xprt.xpt_class->xcl_owner); - svsk->sk_xprt.xpt_ops->xpo_free(&svsk->sk_xprt); + svc_xprt_enqueue(xprt); } } @@ -398,7 +388,7 @@ svc_sock_release(struct svc_rqst *rqstp) svc_reserve(rqstp, 0); rqstp->rq_sock = NULL; - svc_sock_put(svsk); + svc_xprt_put(&svsk->sk_xprt); } /* @@ -584,7 +574,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose) if (!serv) return 0; spin_lock_bh(&serv->sv_lock); - list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) { + list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) { int onelen = one_sock_name(buf+len, svsk); if (toclose && strcmp(toclose, buf+len) == 0) closesk = svsk; @@ -596,7 +586,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose) /* Should unregister with portmap, but you cannot * unregister just one protocol... */ - svc_close_socket(closesk); + svc_close_xprt(&closesk->sk_xprt); else if (toclose) return -ENOENT; return len; @@ -693,9 +683,10 @@ svc_udp_data_ready(struct sock *sk, int count) if (svsk) { dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", - svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags)); - set_bit(SK_DATA, &svsk->sk_flags); - svc_sock_enqueue(svsk); + svsk, sk, count, + test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible(sk->sk_sleep); @@ -711,8 +702,8 @@ svc_write_space(struct sock *sk) if (svsk) { dprintk("svc: socket %p(inet %p), write_space busy=%d\n", - svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags)); - svc_sock_enqueue(svsk); + svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) { @@ -746,7 +737,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) { struct svc_sock *svsk = rqstp->rq_sock; - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = svsk->sk_xprt.xpt_server; struct sk_buff *skb; union { struct cmsghdr hdr; @@ -761,7 +752,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) .msg_flags = MSG_DONTWAIT, }; - if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) + if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) /* udp sockets need large rcvbuf as all pending * requests are still in that buffer. sndbuf must * also be large enough that there is enough space @@ -779,7 +770,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) return svc_deferred_recv(rqstp); } - clear_bit(SK_DATA, &svsk->sk_flags); + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); skb = NULL; err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, 0, 0, MSG_PEEK | MSG_DONTWAIT); @@ -790,7 +781,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) if (err != -EAGAIN) { /* possibly an icmp error */ dprintk("svc: recvfrom returned error %d\n", -err); - set_bit(SK_DATA, &svsk->sk_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); } svc_sock_received(svsk); return -EAGAIN; @@ -802,7 +793,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) need that much accuracy */ } svsk->sk_sk->sk_stamp = skb->tstamp; - set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ /* * Maybe more packets - kick another thread ASAP. @@ -884,7 +875,7 @@ static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp) static int svc_udp_has_wspace(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = xprt->xpt_server; unsigned long required; /* @@ -892,7 +883,7 @@ static int svc_udp_has_wspace(struct svc_xprt *xprt) * sock space. */ set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - required = atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg; + required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg; if (required*2 > sock_wspace(svsk->sk_sk)) return 0; clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); @@ -931,13 +922,12 @@ static struct svc_xprt_class svc_udp_class = { .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP, }; -static void -svc_udp_init(struct svc_sock *svsk) +static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) { int one = 1; mm_segment_t oldfs; - svc_xprt_init(&svc_udp_class, &svsk->sk_xprt); + svc_xprt_init(&svc_udp_class, &svsk->sk_xprt, serv); svsk->sk_sk->sk_data_ready = svc_udp_data_ready; svsk->sk_sk->sk_write_space = svc_write_space; @@ -946,11 +936,11 @@ svc_udp_init(struct svc_sock *svsk) * svc_udp_recvfrom will re-adjust if necessary */ svc_sock_setbufsize(svsk->sk_sock, - 3 * svsk->sk_server->sv_max_mesg, - 3 * svsk->sk_server->sv_max_mesg); + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); - set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */ - set_bit(SK_CHNGBUF, &svsk->sk_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* might have come in before data_ready set up */ + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); oldfs = get_fs(); set_fs(KERNEL_DS); @@ -984,8 +974,8 @@ svc_tcp_listen_data_ready(struct sock *sk, int count_unused) */ if (sk->sk_state == TCP_LISTEN) { if (svsk) { - set_bit(SK_CONN, &svsk->sk_flags); - svc_sock_enqueue(svsk); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } else printk("svc: socket %p: no user data\n", sk); } @@ -1008,8 +998,8 @@ svc_tcp_state_change(struct sock *sk) if (!svsk) printk("svc: socket %p: no user data\n", sk); else { - set_bit(SK_CLOSE, &svsk->sk_flags); - svc_sock_enqueue(svsk); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible_all(sk->sk_sleep); @@ -1023,8 +1013,8 @@ svc_tcp_data_ready(struct sock *sk, int count) dprintk("svc: socket %p TCP data ready (svsk %p)\n", sk, sk->sk_user_data); if (svsk) { - set_bit(SK_DATA, &svsk->sk_flags); - svc_sock_enqueue(svsk); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible(sk->sk_sleep); @@ -1052,7 +1042,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); struct sockaddr_storage addr; struct sockaddr *sin = (struct sockaddr *) &addr; - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = svsk->sk_xprt.xpt_server; struct socket *sock = svsk->sk_sock; struct socket *newsock; struct svc_sock *newsvsk; @@ -1063,7 +1053,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) if (!sock) return NULL; - clear_bit(SK_CONN, &svsk->sk_flags); + clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); err = kernel_accept(sock, &newsock, O_NONBLOCK); if (err < 0) { if (err == -ENOMEM) @@ -1074,8 +1064,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) serv->sv_name, -err); return NULL; } - - set_bit(SK_CONN, &svsk->sk_flags); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); err = kernel_getpeername(newsock, sin, &slen); if (err < 0) { @@ -1127,50 +1116,6 @@ failed: return NULL; } -/* - * Make sure that we don't have too many active connections. If we - * have, something must be dropped. - * - * There's no point in trying to do random drop here for DoS - * prevention. The NFS clients does 1 reconnect in 15 seconds. An - * attacker can easily beat that. - * - * The only somewhat efficient mechanism would be if drop old - * connections from the same IP first. But right now we don't even - * record the client IP in svc_sock. - */ -static void svc_check_conn_limits(struct svc_serv *serv) -{ - if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { - struct svc_sock *svsk = NULL; - spin_lock_bh(&serv->sv_lock); - if (!list_empty(&serv->sv_tempsocks)) { - if (net_ratelimit()) { - /* Try to help the admin */ - printk(KERN_NOTICE "%s: too many open TCP " - "sockets, consider increasing the " - "number of nfsd threads\n", - serv->sv_name); - } - /* - * Always select the oldest socket. It's not fair, - * but so is life - */ - svsk = list_entry(serv->sv_tempsocks.prev, - struct svc_sock, - sk_list); - set_bit(SK_CLOSE, &svsk->sk_flags); - atomic_inc(&svsk->sk_inuse); - } - spin_unlock_bh(&serv->sv_lock); - - if (svsk) { - svc_sock_enqueue(svsk); - svc_sock_put(svsk); - } - } -} - /* * Receive data from a TCP socket. */ @@ -1178,22 +1123,22 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) { struct svc_sock *svsk = rqstp->rq_sock; - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = svsk->sk_xprt.xpt_server; int len; struct kvec *vec; int pnum, vlen; dprintk("svc: tcp_recv %p data %d conn %d close %d\n", - svsk, test_bit(SK_DATA, &svsk->sk_flags), - test_bit(SK_CONN, &svsk->sk_flags), - test_bit(SK_CLOSE, &svsk->sk_flags)); + svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags), + test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags), + test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { svc_sock_received(svsk); return svc_deferred_recv(rqstp); } - if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) + if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) /* sndbuf needs to have room for one request * per thread, otherwise we can stall even when the * network isn't a bottleneck. @@ -1210,7 +1155,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) (serv->sv_nrthreads+3) * serv->sv_max_mesg, 3 * serv->sv_max_mesg); - clear_bit(SK_DATA, &svsk->sk_flags); + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* Receive data. If we haven't got the record length yet, get * the next four bytes. Otherwise try to gobble up as much as @@ -1269,7 +1214,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) return -EAGAIN; /* record not complete */ } len = svsk->sk_reclen; - set_bit(SK_DATA, &svsk->sk_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); vec = rqstp->rq_vec; vec[0] = rqstp->rq_arg.head[0]; @@ -1312,7 +1257,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) return len; err_delete: - set_bit(SK_CLOSE, &svsk->sk_flags); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); return -EAGAIN; error: @@ -1321,7 +1266,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) svc_sock_received(svsk); } else { printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", - svsk->sk_server->sv_name, -len); + svsk->sk_xprt.xpt_server->sv_name, -len); goto err_delete; } @@ -1345,17 +1290,17 @@ svc_tcp_sendto(struct svc_rqst *rqstp) reclen = htonl(0x80000000|((xbufp->len ) - 4)); memcpy(xbufp->head[0].iov_base, &reclen, 4); - if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags)) + if (test_bit(XPT_DEAD, &rqstp->rq_sock->sk_xprt.xpt_flags)) return -ENOTCONN; sent = svc_sendto(rqstp, &rqstp->rq_res); if (sent != xbufp->len) { printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", - rqstp->rq_sock->sk_server->sv_name, + rqstp->rq_sock->sk_xprt.xpt_server->sv_name, (sent<0)?"got error":"sent only", sent, xbufp->len); - set_bit(SK_CLOSE, &rqstp->rq_sock->sk_flags); - svc_sock_enqueue(rqstp->rq_sock); + set_bit(XPT_CLOSE, &rqstp->rq_sock->sk_xprt.xpt_flags); + svc_xprt_enqueue(rqstp->rq_xprt); sent = -EAGAIN; } return sent; @@ -1375,7 +1320,7 @@ static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp) static int svc_tcp_has_wspace(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = svsk->sk_xprt.xpt_server; int required; int wspace; @@ -1384,7 +1329,7 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt) * sock space. */ set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - required = atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg; + required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg; wspace = sk_stream_wspace(svsk->sk_sk); if (wspace < sk_stream_min_wspace(svsk->sk_sk)) @@ -1434,19 +1379,18 @@ void svc_cleanup_xprt_sock(void) svc_unreg_xprt_class(&svc_udp_class); } -static void -svc_tcp_init(struct svc_sock *svsk) +static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) { struct sock *sk = svsk->sk_sk; struct tcp_sock *tp = tcp_sk(sk); - svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt); + svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv); if (sk->sk_state == TCP_LISTEN) { dprintk("setting up TCP socket for listening\n"); - set_bit(SK_LISTENER, &svsk->sk_flags); + set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); sk->sk_data_ready = svc_tcp_listen_data_ready; - set_bit(SK_CONN, &svsk->sk_flags); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); } else { dprintk("setting up TCP socket for reading\n"); sk->sk_state_change = svc_tcp_state_change; @@ -1463,13 +1407,13 @@ svc_tcp_init(struct svc_sock *svsk) * svc_tcp_recvfrom will re-adjust if necessary */ svc_sock_setbufsize(svsk->sk_sock, - 3 * svsk->sk_server->sv_max_mesg, - 3 * svsk->sk_server->sv_max_mesg); + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); - set_bit(SK_CHNGBUF, &svsk->sk_flags); - set_bit(SK_DATA, &svsk->sk_flags); + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); if (sk->sk_state != TCP_ESTABLISHED) - set_bit(SK_CLOSE, &svsk->sk_flags); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); } } @@ -1485,17 +1429,61 @@ svc_sock_update_bufs(struct svc_serv *serv) spin_lock_bh(&serv->sv_lock); list_for_each(le, &serv->sv_permsocks) { struct svc_sock *svsk = - list_entry(le, struct svc_sock, sk_list); - set_bit(SK_CHNGBUF, &svsk->sk_flags); + list_entry(le, struct svc_sock, sk_xprt.xpt_list); + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); } list_for_each(le, &serv->sv_tempsocks) { struct svc_sock *svsk = - list_entry(le, struct svc_sock, sk_list); - set_bit(SK_CHNGBUF, &svsk->sk_flags); + list_entry(le, struct svc_sock, sk_xprt.xpt_list); + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); } spin_unlock_bh(&serv->sv_lock); } +/* + * Make sure that we don't have too many active connections. If we + * have, something must be dropped. + * + * There's no point in trying to do random drop here for DoS + * prevention. The NFS clients does 1 reconnect in 15 seconds. An + * attacker can easily beat that. + * + * The only somewhat efficient mechanism would be if drop old + * connections from the same IP first. But right now we don't even + * record the client IP in svc_sock. + */ +static void svc_check_conn_limits(struct svc_serv *serv) +{ + if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { + struct svc_sock *svsk = NULL; + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_tempsocks)) { + if (net_ratelimit()) { + /* Try to help the admin */ + printk(KERN_NOTICE "%s: too many open TCP " + "sockets, consider increasing the " + "number of nfsd threads\n", + serv->sv_name); + } + /* + * Always select the oldest socket. It's not fair, + * but so is life + */ + svsk = list_entry(serv->sv_tempsocks.prev, + struct svc_sock, + sk_xprt.xpt_list); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + svc_xprt_get(&svsk->sk_xprt); + } + spin_unlock_bh(&serv->sv_lock); + + if (svsk) { + svc_xprt_enqueue(&svsk->sk_xprt); + svc_xprt_put(&svsk->sk_xprt); + } + } +} + /* * Receive the next request on any socket. This code is carefully * organised not to touch any cachelines in the shared svc_serv @@ -1556,9 +1544,9 @@ svc_recv(struct svc_rqst *rqstp, long timeout) spin_lock_bh(&pool->sp_lock); if ((svsk = svc_sock_dequeue(pool)) != NULL) { rqstp->rq_sock = svsk; - atomic_inc(&svsk->sk_inuse); + svc_xprt_get(&svsk->sk_xprt); rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); + atomic_add(rqstp->rq_reserved, &svsk->sk_xprt.xpt_reserved); } else { /* No data pending. Go to sleep */ svc_thread_enqueue(pool, rqstp); @@ -1588,10 +1576,10 @@ svc_recv(struct svc_rqst *rqstp, long timeout) spin_unlock_bh(&pool->sp_lock); len = 0; - if (test_bit(SK_CLOSE, &svsk->sk_flags)) { - dprintk("svc_recv: found SK_CLOSE\n"); - svc_delete_socket(svsk); - } else if (test_bit(SK_LISTENER, &svsk->sk_flags)) { + if (test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)) { + dprintk("svc_recv: found XPT_CLOSE\n"); + svc_delete_xprt(&svsk->sk_xprt); + } else if (test_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags)) { struct svc_xprt *newxpt; newxpt = svsk->sk_xprt.xpt_ops->xpo_accept(&svsk->sk_xprt); if (newxpt) { @@ -1600,12 +1588,13 @@ svc_recv(struct svc_rqst *rqstp, long timeout) * listener holds a reference too */ __module_get(newxpt->xpt_class->xcl_owner); - svc_check_conn_limits(svsk->sk_server); + svc_check_conn_limits(svsk->sk_xprt.xpt_server); } svc_sock_received(svsk); } else { dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", - rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse)); + rqstp, pool->sp_id, svsk, + atomic_read(&svsk->sk_xprt.xpt_ref.refcount)); len = svsk->sk_xprt.xpt_ops->xpo_recvfrom(rqstp); dprintk("svc: got len=%d\n", len); } @@ -1617,7 +1606,7 @@ svc_recv(struct svc_rqst *rqstp, long timeout) return -EAGAIN; } svsk->sk_lastrecv = get_seconds(); - clear_bit(SK_OLD, &svsk->sk_flags); + clear_bit(XPT_OLD, &svsk->sk_xprt.xpt_flags); rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); rqstp->rq_chandle.defer = svc_defer; @@ -1664,7 +1653,7 @@ svc_send(struct svc_rqst *rqstp) /* Grab svsk->sk_mutex to serialize outgoing data. */ mutex_lock(&svsk->sk_mutex); - if (test_bit(SK_DEAD, &svsk->sk_flags)) + if (test_bit(XPT_DEAD, &svsk->sk_xprt.xpt_flags)) len = -ENOTCONN; else len = svsk->sk_xprt.xpt_ops->xpo_sendto(rqstp); @@ -1698,31 +1687,32 @@ svc_age_temp_sockets(unsigned long closure) } list_for_each_safe(le, next, &serv->sv_tempsocks) { - svsk = list_entry(le, struct svc_sock, sk_list); + svsk = list_entry(le, struct svc_sock, sk_xprt.xpt_list); - if (!test_and_set_bit(SK_OLD, &svsk->sk_flags)) + if (!test_and_set_bit(XPT_OLD, &svsk->sk_xprt.xpt_flags)) continue; - if (atomic_read(&svsk->sk_inuse) > 1 || test_bit(SK_BUSY, &svsk->sk_flags)) + if (atomic_read(&svsk->sk_xprt.xpt_ref.refcount) > 1 + || test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)) continue; - atomic_inc(&svsk->sk_inuse); + svc_xprt_get(&svsk->sk_xprt); list_move(le, &to_be_aged); - set_bit(SK_CLOSE, &svsk->sk_flags); - set_bit(SK_DETACHED, &svsk->sk_flags); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_DETACHED, &svsk->sk_xprt.xpt_flags); } spin_unlock_bh(&serv->sv_lock); while (!list_empty(&to_be_aged)) { le = to_be_aged.next; - /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */ + /* fiddling the sk_xprt.xpt_list node is safe 'cos we're XPT_DETACHED */ list_del_init(le); - svsk = list_entry(le, struct svc_sock, sk_list); + svsk = list_entry(le, struct svc_sock, sk_xprt.xpt_list); dprintk("queuing svsk %p for closing, %lu seconds old\n", svsk, get_seconds() - svsk->sk_lastrecv); /* a thread will dequeue and close it soon */ - svc_sock_enqueue(svsk); - svc_sock_put(svsk); + svc_xprt_enqueue(&svsk->sk_xprt); + svc_xprt_put(&svsk->sk_xprt); } mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); @@ -1759,31 +1749,28 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, return NULL; } - set_bit(SK_BUSY, &svsk->sk_flags); + set_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags); inet->sk_user_data = svsk; svsk->sk_sock = sock; svsk->sk_sk = inet; svsk->sk_ostate = inet->sk_state_change; svsk->sk_odata = inet->sk_data_ready; svsk->sk_owspace = inet->sk_write_space; - svsk->sk_server = serv; - atomic_set(&svsk->sk_inuse, 1); svsk->sk_lastrecv = get_seconds(); spin_lock_init(&svsk->sk_lock); INIT_LIST_HEAD(&svsk->sk_deferred); - INIT_LIST_HEAD(&svsk->sk_ready); mutex_init(&svsk->sk_mutex); /* Initialize the socket */ if (sock->type == SOCK_DGRAM) - svc_udp_init(svsk); + svc_udp_init(svsk, serv); else - svc_tcp_init(svsk); + svc_tcp_init(svsk, serv); spin_lock_bh(&serv->sv_lock); if (is_temporary) { - set_bit(SK_TEMP, &svsk->sk_flags); - list_add(&svsk->sk_list, &serv->sv_tempsocks); + set_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); + list_add(&svsk->sk_xprt.xpt_list, &serv->sv_tempsocks); serv->sv_tmpcnt++; if (serv->sv_temptimer.function == NULL) { /* setup timer to age temp sockets */ @@ -1793,8 +1780,8 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, jiffies + svc_conn_age_period * HZ); } } else { - clear_bit(SK_TEMP, &svsk->sk_flags); - list_add(&svsk->sk_list, &serv->sv_permsocks); + clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); + list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks); } spin_unlock_bh(&serv->sv_lock); @@ -1926,66 +1913,63 @@ static void svc_sock_free(struct svc_xprt *xprt) } /* - * Remove a dead socket + * Remove a dead transport */ -static void -svc_delete_socket(struct svc_sock *svsk) +static void svc_delete_xprt(struct svc_xprt *xprt) { - struct svc_serv *serv; - struct sock *sk; - - dprintk("svc: svc_delete_socket(%p)\n", svsk); + struct svc_serv *serv = xprt->xpt_server; - serv = svsk->sk_server; - sk = svsk->sk_sk; - - svsk->sk_xprt.xpt_ops->xpo_detach(&svsk->sk_xprt); + dprintk("svc: svc_delete_xprt(%p)\n", xprt); + xprt->xpt_ops->xpo_detach(xprt); spin_lock_bh(&serv->sv_lock); - - if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags)) - list_del_init(&svsk->sk_list); + if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags)) + list_del_init(&xprt->xpt_list); /* - * We used to delete the svc_sock from whichever list - * it's sk_ready node was on, but we don't actually + * We used to delete the transport from whichever list + * it's sk_xprt.xpt_ready node was on, but we don't actually * need to. This is because the only time we're called * while still attached to a queue, the queue itself * is about to be destroyed (in svc_destroy). */ - if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) { - BUG_ON(atomic_read(&svsk->sk_inuse)<2); - atomic_dec(&svsk->sk_inuse); - if (test_bit(SK_TEMP, &svsk->sk_flags)) + if (!test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) { + BUG_ON(atomic_read(&xprt->xpt_ref.refcount) < 2); + if (test_bit(XPT_TEMP, &xprt->xpt_flags)) serv->sv_tmpcnt--; + svc_xprt_put(xprt); } - spin_unlock_bh(&serv->sv_lock); } -static void svc_close_socket(struct svc_sock *svsk) +static void svc_close_xprt(struct svc_xprt *xprt) { - set_bit(SK_CLOSE, &svsk->sk_flags); - if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) + set_bit(XPT_CLOSE, &xprt->xpt_flags); + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) /* someone else will have to effect the close */ return; - atomic_inc(&svsk->sk_inuse); - svc_delete_socket(svsk); - clear_bit(SK_BUSY, &svsk->sk_flags); - svc_sock_put(svsk); + svc_xprt_get(xprt); + svc_delete_xprt(xprt); + clear_bit(XPT_BUSY, &xprt->xpt_flags); + svc_xprt_put(xprt); } -void svc_force_close_socket(struct svc_sock *svsk) +void svc_close_all(struct list_head *xprt_list) { - set_bit(SK_CLOSE, &svsk->sk_flags); - if (test_bit(SK_BUSY, &svsk->sk_flags)) { - /* Waiting to be processed, but no threads left, - * So just remove it from the waiting list - */ - list_del_init(&svsk->sk_ready); - clear_bit(SK_BUSY, &svsk->sk_flags); + struct svc_xprt *xprt; + struct svc_xprt *tmp; + + list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); + if (test_bit(XPT_BUSY, &xprt->xpt_flags)) { + /* Waiting to be processed, but no threads left, + * So just remove it from the waiting list + */ + list_del_init(&xprt->xpt_ready); + clear_bit(XPT_BUSY, &xprt->xpt_flags); + } + svc_close_xprt(xprt); } - svc_close_socket(svsk); } /* @@ -1998,7 +1982,7 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) struct svc_sock *svsk; if (too_many) { - svc_sock_put(dr->svsk); + svc_xprt_put(&dr->svsk->sk_xprt); kfree(dr); return; } @@ -2008,9 +1992,9 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) spin_lock(&svsk->sk_lock); list_add(&dr->handle.recent, &svsk->sk_deferred); spin_unlock(&svsk->sk_lock); - set_bit(SK_DEFERRED, &svsk->sk_flags); - svc_sock_enqueue(svsk); - svc_sock_put(svsk); + set_bit(XPT_DEFERRED, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); + svc_xprt_put(&svsk->sk_xprt); } static struct cache_deferred_req * @@ -2040,7 +2024,7 @@ svc_defer(struct cache_req *req) dr->argslen = rqstp->rq_arg.len >> 2; memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2); } - atomic_inc(&rqstp->rq_sock->sk_inuse); + svc_xprt_get(rqstp->rq_xprt); dr->svsk = rqstp->rq_sock; dr->handle.revisit = svc_revisit; @@ -2071,16 +2055,16 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) { struct svc_deferred_req *dr = NULL; - if (!test_bit(SK_DEFERRED, &svsk->sk_flags)) + if (!test_bit(XPT_DEFERRED, &svsk->sk_xprt.xpt_flags)) return NULL; spin_lock(&svsk->sk_lock); - clear_bit(SK_DEFERRED, &svsk->sk_flags); + clear_bit(XPT_DEFERRED, &svsk->sk_xprt.xpt_flags); if (!list_empty(&svsk->sk_deferred)) { dr = list_entry(svsk->sk_deferred.next, struct svc_deferred_req, handle.recent); list_del_init(&dr->handle.recent); - set_bit(SK_DEFERRED, &svsk->sk_flags); + set_bit(XPT_DEFERRED, &svsk->sk_xprt.xpt_flags); } spin_unlock(&svsk->sk_lock); return dr;