#include <linux/module.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
-#include <linux/pcounter.h>
#include <linux/skbuff.h> /* struct sk_buff */
#include <linux/mm.h>
#include <linux/security.h>
#define SOCK_DEBUG(sk, msg...) do { if ((sk) && sock_flag((sk), SOCK_DBG)) \
printk(KERN_DEBUG msg); } while (0)
#else
-#define SOCK_DEBUG(sk, msg...) do { } while (0)
+/*
+ * Stub for the #else branch: an empty function instead of an empty macro,
+ * so the compiler still validates @msg and its arguments as a printf-style
+ * format (format string is parameter 2, variadic arguments start at 3).
+ * Does nothing at run time.
+ */
+static inline void __attribute__ ((format (printf, 2, 3)))
+SOCK_DEBUG(struct sock *sk, const char *msg, ...)
+{
+}
#endif
/* This is the per-socket lock. The spinlock provides a synchronization
atomic_t skc_refcnt;
unsigned int skc_hash;
struct proto *skc_prot;
+#ifdef CONFIG_NET_NS
struct net *skc_net;
+#endif
};
/**
* @sk_no_check: %SO_NO_CHECK setting, wether or not checkup packets
* @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
* @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
+ * @sk_gso_max_size: Maximum GSO segment size to build
* @sk_lingertime: %SO_LINGER l_linger setting
* @sk_backlog: always used with the per-socket spinlock held
* @sk_callback_lock: used with the callbacks in the end of this struct
* @sk_err: last error
* @sk_err_soft: errors that don't cause failure but are the cause of a
* persistent failure not just 'timed out'
- * @sk_drops: raw drops counter
+ * @sk_drops: raw/udp drops counter
* @sk_ack_backlog: current listen backlog
* @sk_max_ack_backlog: listen backlog set in listen()
* @sk_priority: %SO_PRIORITY setting
* @sk_sndmsg_off: cached offset for sendmsg
* @sk_send_head: front of stuff to transmit
* @sk_security: used by security modules
+ * @sk_mark: generic packet mark
* @sk_write_pending: a write to stream socket waits to start
* @sk_state_change: callback to indicate change in the state of the sock
* @sk_data_ready: callback to indicate there is data to be processed
gfp_t sk_allocation;
int sk_route_caps;
int sk_gso_type;
+ unsigned int sk_gso_max_size;
int sk_rcvlowat;
unsigned long sk_flags;
unsigned long sk_lingertime;
__u32 sk_sndmsg_off;
int sk_write_pending;
void *sk_security;
+ __u32 sk_mark;
+ /* XXX 4 bytes hole on 64 bit */
void (*sk_state_change)(struct sock *sk);
void (*sk_data_ready)(struct sock *sk, int bytes);
void (*sk_write_space)(struct sock *sk);
*/
static inline int sk_stream_min_wspace(struct sock *sk)
{
- return sk->sk_wmem_queued / 2;
+ return sk->sk_wmem_queued >> 1;
}
static inline int sk_stream_wspace(struct sock *sk)
return sk->sk_wmem_queued < sk->sk_sndbuf;
}
-extern void sk_stream_rfree(struct sk_buff *skb);
-
-static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk)
-{
- skb->sk = sk;
- skb->destructor = sk_stream_rfree;
- atomic_add(skb->truesize, &sk->sk_rmem_alloc);
- sk->sk_forward_alloc -= skb->truesize;
-}
-
-static inline void sk_stream_free_skb(struct sock *sk, struct sk_buff *skb)
-{
- skb_truesize_check(skb);
- sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
- sk->sk_wmem_queued -= skb->truesize;
- sk->sk_forward_alloc += skb->truesize;
- __kfree_skb(skb);
-}
-
/* The per-socket spinlock must be held here. */
static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
struct request_sock_ops;
struct timewait_sock_ops;
+struct inet_hashinfo;
+struct raw_hashinfo;
/* Networking protocol blocks we attach to sockets.
* socket layer -> transport layer interface
int (*ioctl)(struct sock *sk, int cmd,
unsigned long arg);
int (*init)(struct sock *sk);
- int (*destroy)(struct sock *sk);
+ void (*destroy)(struct sock *sk);
void (*shutdown)(struct sock *sk, int how);
int (*setsockopt)(struct sock *sk, int level,
int optname, char __user *optval,
int (*get_port)(struct sock *sk, unsigned short snum);
/* Keeping track of sockets in use */
- struct pcounter inuse;
+#ifdef CONFIG_PROC_FS
+ unsigned int inuse_idx;
+#endif
/* Memory pressure */
- void (*enter_memory_pressure)(void);
+ void (*enter_memory_pressure)(struct sock *sk);
atomic_t *memory_allocated; /* Current allocated memory. */
atomic_t *sockets_allocated; /* Current number of sockets. */
/*
* Pressure flag: try to collapse.
* Technical note: it is used by multiple contexts non atomically.
- * All the sk_stream_mem_schedule() is of this nature: accounting
+ * All the __sk_mem_schedule() is of this nature: accounting
* is strict, actions are advisory and have some latency.
*/
int *memory_pressure;
struct request_sock_ops *rsk_prot;
struct timewait_sock_ops *twsk_prot;
+ union {
+ struct inet_hashinfo *hashinfo;
+ struct hlist_head *udp_hash;
+ struct raw_hashinfo *raw_hash;
+ } h;
+
struct module *owner;
char name[32];
#endif
};
-#define DEFINE_PROTO_INUSE(NAME) DEFINE_PCOUNTER(NAME)
-#define REF_PROTO_INUSE(NAME) PCOUNTER_MEMBER_INITIALIZER(NAME, .inuse)
-
extern int proto_register(struct proto *prot, int alloc_slab);
extern void proto_unregister(struct proto *prot);
#define sk_refcnt_debug_release(sk) do { } while (0)
#endif /* SOCK_REFCNT_DEBUG */
-/* Called with local bh disabled */
-static __inline__ void sock_prot_inc_use(struct proto *prot)
-{
- pcounter_add(&prot->inuse, 1);
-}
-static __inline__ void sock_prot_dec_use(struct proto *prot)
+#ifdef CONFIG_PROC_FS
+/* Called with local bh disabled */
+extern void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc);
+extern int sock_prot_inuse_get(struct net *net, struct proto *proto);
+#else
+/* No-op stub used when CONFIG_PROC_FS is not defined. */
+static inline void sock_prot_inuse_add(struct net *net, struct proto *prot,
+		int inc)
{
- pcounter_add(&prot->inuse, -1);
}
+#endif
-static __inline__ int sock_prot_inuse(struct proto *proto)
-{
- return pcounter_getval(&proto->inuse);
-}
/* With per-bucket locks this operation is not-atomic, so that
* this version is not worse.
return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
}
-extern void __sk_stream_mem_reclaim(struct sock *sk);
-extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);
+/*
+ * Functions for memory accounting
+ */
+extern int __sk_mem_schedule(struct sock *sk, int size, int kind);
+extern void __sk_mem_reclaim(struct sock *sk);
-#define SK_STREAM_MEM_QUANTUM ((int)PAGE_SIZE)
+#define SK_MEM_QUANTUM ((int)PAGE_SIZE)
+#define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
+#define SK_MEM_SEND 0
+#define SK_MEM_RECV 1
-static inline int sk_stream_pages(int amt)
+static inline int sk_mem_pages(int amt)
{
- return DIV_ROUND_UP(amt, SK_STREAM_MEM_QUANTUM);
+ return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
}
-static inline void sk_stream_mem_reclaim(struct sock *sk)
+static inline int sk_has_account(struct sock *sk)
{
- if (sk->sk_forward_alloc >= SK_STREAM_MEM_QUANTUM)
- __sk_stream_mem_reclaim(sk);
+ /* return true if protocol supports memory accounting */
+ return !!sk->sk_prot->memory_allocated;
}
-static inline int sk_stream_rmem_schedule(struct sock *sk, struct sk_buff *skb)
+static inline int sk_wmem_schedule(struct sock *sk, int size)
{
- return (int)skb->truesize <= sk->sk_forward_alloc ||
- sk_stream_mem_schedule(sk, skb->truesize, 1);
+ if (!sk_has_account(sk))
+ return 1;
+ return size <= sk->sk_forward_alloc ||
+ __sk_mem_schedule(sk, size, SK_MEM_SEND);
}
-static inline int sk_stream_wmem_schedule(struct sock *sk, int size)
+static inline int sk_rmem_schedule(struct sock *sk, int size)
{
+ if (!sk_has_account(sk))
+ return 1;
return size <= sk->sk_forward_alloc ||
- sk_stream_mem_schedule(sk, size, 0);
+ __sk_mem_schedule(sk, size, SK_MEM_RECV);
+}
+
+/*
+ * Call __sk_mem_reclaim() once sk_forward_alloc has reached at least
+ * SK_MEM_QUANTUM. Skipped entirely when sk_has_account() is false.
+ */
+static inline void sk_mem_reclaim(struct sock *sk)
+{
+	if (sk_has_account(sk) && sk->sk_forward_alloc >= SK_MEM_QUANTUM)
+		__sk_mem_reclaim(sk);
+}
+
+/*
+ * Call __sk_mem_reclaim() only when sk_forward_alloc strictly exceeds
+ * SK_MEM_QUANTUM. Skipped entirely when sk_has_account() is false.
+ */
+static inline void sk_mem_reclaim_partial(struct sock *sk)
+{
+	if (sk_has_account(sk) && sk->sk_forward_alloc > SK_MEM_QUANTUM)
+		__sk_mem_reclaim(sk);
+}
+
+/*
+ * Subtract @size from sk_forward_alloc.
+ * Leaves the socket untouched when sk_has_account() is false.
+ */
+static inline void sk_mem_charge(struct sock *sk, int size)
+{
+	if (sk_has_account(sk))
+		sk->sk_forward_alloc -= size;
+}
+
+/*
+ * Add @size back to sk_forward_alloc.
+ * Leaves the socket untouched when sk_has_account() is false.
+ */
+static inline void sk_mem_uncharge(struct sock *sk, int size)
+{
+	if (sk_has_account(sk))
+		sk->sk_forward_alloc += size;
+}
+
+/*
+ * Release @skb and undo its write-queue accounting on @sk: set
+ * SOCK_QUEUE_SHRUNK, subtract skb->truesize from sk_wmem_queued, pass the
+ * same amount to sk_mem_uncharge(), then free the buffer with __kfree_skb().
+ * skb->truesize is read before the buffer is freed.
+ */
+static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
+{
+ skb_truesize_check(skb);
+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+ sk->sk_wmem_queued -= skb->truesize;
+ sk_mem_uncharge(sk, skb->truesize);
+ __kfree_skb(skb);
}
/* Used by processes to "lock" a socket state, so that
gfp_t priority,
struct proto *prot);
extern void sk_free(struct sock *sk);
+extern void sk_release_kernel(struct sock *sk);
extern struct sock *sk_clone(const struct sock *sk,
const gfp_t priority);
/* Initialise core socket variables */
extern void sock_init_data(struct socket *sock, struct sock *sk);
-/**
- * sk_filter - run a packet through a socket filter
- * @sk: sock associated with &sk_buff
- * @skb: buffer to filter
- * @needlock: set to 1 if the sock is not locked by caller.
- *
- * Run the filter code and then cut skb->data to correct size returned by
- * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller
- * than pkt_len we keep whole skb->data. This is the socket level
- * wrapper to sk_run_filter. It returns 0 if the packet should
- * be accepted or -EPERM if the packet should be tossed.
- *
- */
-
-static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
-{
- int err;
- struct sk_filter *filter;
-
- err = security_sock_rcv_skb(sk, skb);
- if (err)
- return err;
-
- rcu_read_lock_bh();
- filter = rcu_dereference(sk->sk_filter);
- if (filter) {
- unsigned int pkt_len = sk_run_filter(skb, filter->insns,
- filter->len);
- err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
- }
- rcu_read_unlock_bh();
-
- return err;
-}
-
/**
* sk_filter_release: Release a socket filter
* @sk: socket
extern int sk_receive_skb(struct sock *sk, struct sk_buff *skb,
const int nested);
+/* Store @sock (which may be NULL) in sk->sk_socket. */
+static inline void sk_set_socket(struct sock *sk, struct socket *sock)
+{
+ sk->sk_socket = sock;
+}
+
/* Detach socket from process context.
* Announce socket dead, detach it from wait queue and inode.
* Note that parent inode held reference count on this struct sock,
{
write_lock_bh(&sk->sk_callback_lock);
sock_set_flag(sk, SOCK_DEAD);
- sk->sk_socket = NULL;
+ sk_set_socket(sk, NULL);
sk->sk_sleep = NULL;
write_unlock_bh(&sk->sk_callback_lock);
}
write_lock_bh(&sk->sk_callback_lock);
sk->sk_sleep = &parent->wait;
parent->sk = sk;
- sk->sk_socket = parent;
+ sk_set_socket(sk, parent);
security_sock_graft(sk, parent);
write_unlock_bh(&sk->sk_callback_lock);
}
extern void sk_setup_caps(struct sock *sk, struct dst_entry *dst);
-static inline void sk_charge_skb(struct sock *sk, struct sk_buff *skb)
-{
- sk->sk_wmem_queued += skb->truesize;
- sk->sk_forward_alloc -= skb->truesize;
-}
-
static inline int skb_copy_to_page(struct sock *sk, char __user *from,
struct sk_buff *skb, struct page *page,
int off, int copy)
skb->data_len += copy;
skb->truesize += copy;
sk->sk_wmem_queued += copy;
- sk->sk_forward_alloc -= copy;
+ sk_mem_charge(sk, copy);
return 0;
}
skb->sk = sk;
skb->destructor = sock_rfree;
atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, skb->truesize);
}
extern void sk_reset_timer(struct sock *sk, struct timer_list* timer,
static inline void sk_stream_moderate_sndbuf(struct sock *sk)
{
if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) {
- sk->sk_sndbuf = min(sk->sk_sndbuf, sk->sk_wmem_queued / 2);
+ sk->sk_sndbuf = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);
sk->sk_sndbuf = max(sk->sk_sndbuf, SOCK_MIN_SNDBUF);
}
}
page = alloc_pages(sk->sk_allocation, 0);
if (!page) {
- sk->sk_prot->enter_memory_pressure();
+ sk->sk_prot->enter_memory_pressure(sk);
sk_stream_moderate_sndbuf(sk);
}
return page;
*/
static inline int sock_writeable(const struct sock *sk)
{
- return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf / 2);
+ return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1);
}
static inline gfp_t gfp_any(void)
}
#endif
+/*
+ * Return the network namespace of @sk: sk->sk_net when CONFIG_NET_NS is
+ * defined, otherwise the address of init_net.
+ */
+static inline
+struct net *sock_net(const struct sock *sk)
+{
+#ifdef CONFIG_NET_NS
+ return sk->sk_net;
+#else
+ return &init_net;
+#endif
+}
+
+/*
+ * Store @net in sk->sk_net when CONFIG_NET_NS is defined; otherwise this
+ * is a no-op.
+ */
+static inline
+void sock_net_set(struct sock *sk, struct net *net)
+{
+#ifdef CONFIG_NET_NS
+ sk->sk_net = net;
+#endif
+}
+
+/*
+ * Kernel sockets, e.g. rtnl or icmp_socket, are part of a namespace.
+ * They must not hold a reference to that namespace, so that the
+ * namespace can still be stopped.
+ * Sockets passed through sk_change_net must be released with
+ * sk_release_kernel.
+ */
+static inline void sk_change_net(struct sock *sk, struct net *net)
+{
+	struct net *old_net = sock_net(sk);
+	struct net *new_net;
+
+	/* put_net() on the current namespace first, then attach @net. */
+	put_net(old_net);
+	new_net = hold_net(net);
+	sock_net_set(sk, new_net);
+}
+
extern void sock_enable_timestamp(struct sock *sk);
extern int sock_get_timestamp(struct sock *, struct timeval __user *);
extern int sock_get_timestampns(struct sock *, struct timespec __user *);
#define LIMIT_NETDEBUG(fmt, args...) \
do { if (net_msg_warn && net_ratelimit()) printk(fmt,##args); } while(0)
-/*
- * Macros for sleeping on a socket. Use them like this:
- *
- * SOCK_SLEEP_PRE(sk)
- * if (condition)
- * schedule();
- * SOCK_SLEEP_POST(sk)
- *
- * N.B. These are now obsolete and were, afaik, only ever used in DECnet
- * and when the last use of them in DECnet has gone, I'm intending to
- * remove them.
- */
-
-#define SOCK_SLEEP_PRE(sk) { struct task_struct *tsk = current; \
- DECLARE_WAITQUEUE(wait, tsk); \
- tsk->state = TASK_INTERRUPTIBLE; \
- add_wait_queue((sk)->sk_sleep, &wait); \
- release_sock(sk);
-
-#define SOCK_SLEEP_POST(sk) tsk->state = TASK_RUNNING; \
- remove_wait_queue((sk)->sk_sleep, &wait); \
- lock_sock(sk); \
- }
-
extern __u32 sysctl_wmem_max;
extern __u32 sysctl_rmem_max;