Socket system call implementation

socket()

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	int retval;
	struct socket *sock;
	int flags;

	/* SOCK_NONBLOCK/SOCK_CLOEXEC are carried in the high bits of type */
	flags = type & ~SOCK_TYPE_MASK;
	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	type &= SOCK_TYPE_MASK;

	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

	retval = sock_create(family, type, protocol, &sock);
	retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
	if (retval < 0)
		goto out_release;
	/* It may be already another descriptor 8) Not kernel problem. */
	return retval;
}

int sock_create(int family, int type, int protocol, struct socket **res)
{
	return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}

int __sock_create(struct net *net, int family, int type, int protocol,
		  struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	sock = sock_alloc();	/* allocate the inode and the socket and tie them together */
	sock->type = type;

	rcu_read_lock();
	pf = rcu_dereference(net_families[family]);
	if (!try_module_get(pf->owner))
		goto out_release;
	rcu_read_unlock();

	err = pf->create(net, sock, protocol, kern);	/* inet_family_ops: inet_create */
	/*
	 * Now to bump the refcnt of the [loadable] module that owns this
	 * socket at sock_release time we decrement its refcnt.
	 */
	if (!try_module_get(sock->ops->owner))
		goto out_module_busy;
	module_put(pf->owner);
	*res = sock;
	return 0;
}

static int inet_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	sock->state = SS_UNCONNECTED;

	/* Look for the requested type/protocol pair. */
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
	list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { /* filled from inetsw_array by inet_init->inet_register_protosw */
		err = 0;
		/* Check the non-wild match. */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
		} else {
			/* Check for the two wild cases. */
			if (IPPROTO_IP == protocol) {
				protocol = answer->protocol;
				break;
			}
			if (IPPROTO_IP == answer->protocol)
				break;
		}
		err = -EPROTONOSUPPORT;
	}

	sock->ops = answer->ops;		/* inet_stream_ops */
	answer_prot = answer->prot;		/* tcp_prot */
	answer_flags = answer->flags;		/* INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK */
	rcu_read_unlock();

	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); /* allocate a tcp_sock, sized according to the proto */
	inet = inet_sk(sk);
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
	inet->nodefrag = 0;

	sock_init_data(sock, sk);	/* initialize sk and link it to the socket */
	sk->sk_destruct = inet_sock_destruct;
	sk->sk_protocol = protocol;
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
	// ...
	if (sk->sk_prot->init) {
		err = sk->sk_prot->init(sk);	/* tcp_v4_init_sock->tcp_init_sock */
	}
	return 0;
}

/* allocate an fd and a file in the process file table, wired to socket_file_ops */
static int sock_map_fd(struct socket *sock, int flags)
{
	struct file *newfile;
	int fd = get_unused_fd_flags(flags);	/* grab a free fd from the process file table */
	if (unlikely(fd < 0))
		return fd;

	newfile = sock_alloc_file(sock, flags, NULL); /* allocate a dentry, then allocate and init the file with socket_file_ops */
	if (likely(!IS_ERR(newfile))) {
		fd_install(fd, newfile);	/* publish newfile at fdt->fd[fd] in the process file table */
		return fd;
	}

	put_unused_fd(fd);
	return PTR_ERR(newfile);
}

After creation, the socket state (sock->state) is SS_UNCONNECTED and the TCP state (sk->sk_state) is TCP_CLOSE.
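A minimal userspace sketch (an assumed example, not from the kernel tree) that exercises this path: the SOCK_NONBLOCK/SOCK_CLOEXEC bits ride in the high bits of the type argument and end up as O_NONBLOCK/O_CLOEXEC on the new file via sock_map_fd().

/* Minimal sketch: exercises the flags handling shown above. */
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* at this point sock->state is SS_UNCONNECTED and sk_state is TCP_CLOSE */
	close(fd);
	return 0;
}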

bind()

SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
	sock = sockfd_lookup_light(fd, &err, &fput_needed); /* look up the fd in the process file table and get the socket from it */
	if (sock) {
		err = move_addr_to_kernel(umyaddr, addrlen, &address);
		if (err >= 0) {
			err = security_socket_bind(sock,
						   (struct sockaddr *)&address,
						   addrlen);
			if (!err)
				err = sock->ops->bind(sock,	/* inet_bind */
						      (struct sockaddr *)
						      &address, addrlen);
		}
		fput_light(sock->file, fput_needed);	/* drop the reference if one was taken */
	}
	return err;
}

int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; /* record the bound address */
	sk->sk_prot->get_port(sk, snum);	/* inet_csk_get_port */
	...
}

int inet_csk_get_port(struct sock *sk, unsigned short port)
{
	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;

	if (port) {
have_port:
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == port) /* found a tb already bound to this port */
				goto tb_found;
		goto tb_not_found;	/* nobody else holds this port */
	}
	/* no explicit port requested: search for a usable local port */
	...
tb_not_found:	/* no other socket is bound to this port: create the tb */
	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
				     net, head, port);
tb_found:
	if (!hlist_empty(&tb->owners)) {	/* other sks already hash to this tb */
		if (sk->sk_reuse == SK_FORCE_REUSE)	/* this sk may always be reused, regardless of the others */
			goto success;
		if (((tb->fastreuse > 0 && reuse) ||	/* both the tb and this sk allow address reuse */
		     (tb->fastreuseport > 0 &&
		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && /* tb and sk both enabled reuseport under the same uid */
		    smallest_size == -1)
			goto success;
		if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { /* inet_csk_bind_conflict resolves the conflict */
			...
		}
		if (!reuse)
			tb->fastreuse = 0;
		if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
			tb->fastreuseport = 0;
	} else {	/* freshly created tb */
		tb->fastreuse = reuse;	/* SO_REUSEADDR */
		if (sk->sk_reuseport) {
			tb->fastreuseport = 1;
			tb->fastuid = uid;
		} else {
			tb->fastreuseport = 0;
		}
	}
success:
	if (!inet_csk(sk)->icsk_bind_hash)
		inet_bind_hash(sk, tb, port);	/* link sk and tb to each other */
	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
	spin_unlock_bh(&head->lock);
	return 0;
}
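To see the fastreuseport branch from userspace, here is a hedged sketch: two sockets created by the same uid set SO_REUSEPORT before bind(), so the second bind() lands on tb_found and passes the fastreuseport check. bound_socket() and port 8080 are illustrative, not kernel API.

#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int bound_socket(unsigned short port)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int one = 1;
	struct sockaddr_in addr;

	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(port);
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("bind");
		close(fd);
		return -1;
	}
	return fd;
}

int main(void)
{
	int a = bound_socket(8080);	/* creates the tb, sets fastreuseport */
	int b = bound_socket(8080);	/* hits tb_found, passes the fastreuseport check */
	printf("a=%d b=%d\n", a, b);
	return 0;
}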

listen()

SYSCALL_DEFINE2(listen, int, fd, int, backlog)
{
	struct socket *sock;
	int err, fput_needed;
	int somaxconn;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (sock) {
		somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn; /* defaults to 128; raise it on busy servers */
		if ((unsigned int)backlog > somaxconn)
			backlog = somaxconn;
		err = security_socket_listen(sock, backlog);
		if (!err)
			err = sock->ops->listen(sock, backlog); /* inet_listen: adds the sk to the listening hash */
		fput_light(sock->file, fput_needed);
	}
	return err;
}

int inet_listen(struct socket *sock, int backlog)
{
	lock_sock(sk);
	if (old_state != TCP_LISTEN) {	/* TCP_CLOSE */
		if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) &&
		    (sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
		    !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
			fastopen_queue_tune(sk, backlog);	/* set icsk_accept_queue.fastopenq.max_qlen */
			tcp_fastopen_init_key_once(true);
		}
		err = inet_csk_listen_start(sk, backlog);
	}
	sk->sk_max_ack_backlog = backlog;
	release_sock(sk);
	return 0;
}

int inet_csk_listen_start(struct sock *sk, int backlog)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);
	int err = -EADDRINUSE;

	reqsk_queue_alloc(&icsk->icsk_accept_queue);	/* allocate and initialize the accept queue */
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	inet_csk_delack_init(sk);

	sk_state_store(sk, TCP_LISTEN);	/* enter the listen state */
	if (!sk->sk_prot->get_port(sk, inet->inet_num)) {	/* inet_csk_get_port */
		/*
		 * get_port runs a second time to catch the case where two
		 * threads bind to the same port and a socket that cannot
		 * share it called listen() first.
		 */
		inet->inet_sport = htons(inet->inet_num);
		sk_dst_reset(sk);
		err = sk->sk_prot->hash(sk); /* inet_hash: add to listening_hash; with reuseport, also to sk->sk_reuseport_cb */
		if (likely(!err))
			return 0;
	}
	sk->sk_state = TCP_CLOSE;
	return err;
}

int inet_hash(struct sock *sk)
{
	int err = 0;
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal);
		local_bh_enable();
	}
	return err;
}
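Since listen() silently clamps backlog to net.core.somaxconn, here is a minimal sketch that shows the effective cap; the port number is just an example.

/* Illustrative sketch: a large backlog only matters if the sysctl is
 * raised too (e.g. sysctl -w net.core.somaxconn=4096). */
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr;
	char buf[32];
	FILE *f = fopen("/proc/sys/net/core/somaxconn", "r");

	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("somaxconn = %s", buf);	/* effective cap on backlog */
		fclose(f);
	}

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8080);	/* illustrative port */
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;
	listen(fd, 4096);	/* sk_max_ack_backlog becomes min(4096, somaxconn) */
	return 0;
}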

accept()

sys_accept->sys_accept4->inet_accept->inet_csk_accept->inet_csk_wait_for_connect

sys_accept4

SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
		int __user *, upeer_addrlen, int, flags)
{
	...
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	newsock = sock_alloc();
	newsock->type = sock->type;
	newsock->ops = sock->ops;	/* inet_stream_ops */
	__module_get(newsock->ops->owner);
	newfd = get_unused_fd_flags(flags);
	if (unlikely(newfd < 0)) {	/* no free fd: fail with EMFILE */
		err = newfd;
		sock_release(newsock);
		goto out_put;
	}
	newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); /* tcp_prot */
	err = sock->ops->accept(sock, newsock, sock->file->f_flags);	/* inet_accept */
	if (upeer_sockaddr) {
		newsock->ops->getname(newsock, (struct sockaddr *)&address, /* inet_getname */
				      &len, 2);
		err = move_addr_to_user(&address,	/* copy the accepted peer's address back to user space */
					len, upeer_sockaddr, upeer_addrlen);
	}
	/* File flags are not inherited via accept() unlike another OSes. */
	fd_install(newfd, newfile);	/* install into the process fdt */
	err = newfd;
	fput_light(sock->file, fput_needed);
	return err;
}

int inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk1 = sock->sk;
	int err = -EINVAL;
	struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);	/* inet_csk_accept */

	lock_sock(sk2);
	sock_rps_record_flow(sk2);
	sock_graft(sk2, newsock);	/* link the sock and the socket */
	newsock->state = SS_CONNECTED;
	err = 0;
	release_sock(sk2);	/* drain the backlog via sk_backlog_rcv, release ownership */
do_err:
	return err;
}

struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	struct request_sock *req;
	struct sock *newsk;
	int error;

	lock_sock(sk);	/* lock the listener */
	error = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out_err;

	/* Find already established connection */
	if (reqsk_queue_empty(queue)) {
		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
		/* If this is a non blocking socket don't sleep */
		error = -EAGAIN;
		if (!timeo)
			goto out_err;
		error = inet_csk_wait_for_connect(sk, timeo);	/* block until a connection arrives */
		if (error)	/* a timeout comes back as EAGAIN */
			goto out_err;
	}
	req = reqsk_queue_remove(queue, sk);	/* dequeue */
	newsk = req->sk;

	if (sk->sk_protocol == IPPROTO_TCP &&
	    tcp_rsk(req)->tfo_listener) {	/* fastopen socket */
		spin_lock_bh(&queue->fastopenq.lock);
		if (tcp_rsk(req)->tfo_listener) {
			/* We are still waiting for the final ACK from 3WHS
			 * so can't free req now. Instead, we set req->sk to
			 * NULL to signify that the child socket is taken
			 * so reqsk_fastopen_remove() will free the req
			 * when 3WHS finishes (or is aborted).
			 */
			req->sk = NULL;	/* NULL marks the child as already accepted */
			req = NULL;
		}
		spin_unlock_bh(&queue->fastopenq.lock);
	}
out:
	release_sock(sk);	/* drain the backlog via sk_backlog_rcv and release the process's ownership */
	if (req)
		reqsk_put(req);	/* non-fastopen: the req is reclaimed here */
	return newsk;
out_err:
	newsk = NULL;
	req = NULL;
	*err = error;
	goto out;
}

static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	DEFINE_WAIT(wait);
	int err;

	/*
	 * True wake-one mechanism for incoming connections: only
	 * one process gets woken up, not the 'whole herd'.
	 * Since we do not 'race & poll' for established sockets
	 * anymore, the common case will execute the loop only once.
	 *
	 * Subtle issue: "add_wait_queue_exclusive()" will be added
	 * after any current non-exclusive waiters, and we know that
	 * it will always _stay_ after any new non-exclusive waiters
	 * because all non-exclusive waiters are added at the
	 * beginning of the wait-queue. As such, it's ok to "drop"
	 * our exclusiveness temporarily when we get woken up without
	 * having to remove and re-insert us on the wait queue.
	 */
	for (;;) {
		prepare_to_wait_exclusive(sk_sleep(sk), &wait, /* queue wait on sk_sleep(sk) with autoremove_wake_function */
					  TASK_INTERRUPTIBLE);
		release_sock(sk);	/* process the backlog queue */
		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
			timeo = schedule_timeout(timeo);	/* sleep until woken or the timeout expires */
		sched_annotate_sleep();
		lock_sock(sk);	/* re-lock the listener after wakeup */
		err = 0;
		if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) /* a connection is ready to accept: return */
			break;
		err = -EINVAL;
		if (sk->sk_state != TCP_LISTEN)	/* the listener went away */
			break;
		err = sock_intr_errno(timeo);	/* choose the errno from the remaining timeout */
		if (signal_pending(current))	/* woken by a signal */
			break;
		err = -EAGAIN;	/* not a signal, so the timeout expired */
		if (!timeo)	/* keep looping while timeo remains, else return EAGAIN */
			break;
	}
	finish_wait(sk_sleep(sk), &wait);	/* remove wait from the queue */
	return err;
}
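On a non-blocking listener, sock_rcvtimeo() returns 0 and inet_csk_accept() fails with -EAGAIN instead of sleeping in inet_csk_wait_for_connect(). A sketch of the matching userspace loop; try_accept() is an illustrative helper and assumes listen_fd is a listening socket with O_NONBLOCK set.

#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>

int try_accept(int listen_fd)
{
	int fd = accept(listen_fd, NULL, NULL);
	if (fd < 0) {
		if (errno == EAGAIN || errno == EWOULDBLOCK)
			return -1;	/* accept queue empty: poll and retry */
		perror("accept");
		return -1;
	}
	return fd;	/* already ESTABLISHED: taken from icsk_accept_queue */
}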

lock_sock & release_sock

static inline void lock_sock(struct sock *sk)
{
	lock_sock_nested(sk, 0);
}

void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock); /* the bottom half runs the stack, so local BHs must be disabled too */
	if (sk->sk_lock.owned)	/* another process owns the socket: wait via schedule() */
		__lock_sock(sk);	/* sleep until ownership is released */
	sk->sk_lock.owned = 1;	/* mark the socket as owned by a process */
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}

void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);	/* drain via sk_backlog_rcv */
	/* Warning : release_cb() might need to release sk ownership,
	 * ie call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)	/* tcp_release_cb */
		sk->sk_prot->release_cb(sk);
	sock_release_ownership(sk);	/* now !sock_owned_by_user(sk) */
	if (waitqueue_active(&sk->sk_lock.wq))	/* wake any process waiting for ownership */
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
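A hypothetical userspace sketch of the same ownership pattern, to make the scheme concrete: a short lock guards an "owned" flag plus a backlog; the "softirq" side queues work while a thread owns the object, and release drains what piled up. All names (mini_sock, mini_input, ...) are illustrative, not kernel API.

#include <pthread.h>
#include <stdbool.h>

struct mini_sock {
	pthread_mutex_t slock;		/* plays the role of sk_lock.slock */
	pthread_cond_t wq;		/* plays the role of sk_lock.wq */
	bool owned;			/* sk_lock.owned */
	int backlog[64], backlog_len;	/* sk_backlog */
};

static void mini_lock_sock(struct mini_sock *sk)
{
	pthread_mutex_lock(&sk->slock);
	while (sk->owned)		/* another thread owns it: wait */
		pthread_cond_wait(&sk->wq, &sk->slock);
	sk->owned = true;		/* take process-context ownership */
	pthread_mutex_unlock(&sk->slock);
}

static void mini_release_sock(struct mini_sock *sk, void (*rcv)(int))
{
	pthread_mutex_lock(&sk->slock);
	for (int i = 0; i < sk->backlog_len; i++)
		rcv(sk->backlog[i]);	/* __release_sock: drain what piled up */
	sk->backlog_len = 0;
	sk->owned = false;		/* sock_release_ownership */
	pthread_cond_signal(&sk->wq);	/* like wake_up(&sk->sk_lock.wq) */
	pthread_mutex_unlock(&sk->slock);
}

/* "softirq" side: if owned, defer to the backlog instead of processing */
static void mini_input(struct mini_sock *sk, int pkt, void (*rcv)(int))
{
	pthread_mutex_lock(&sk->slock);
	if (!sk->owned)
		rcv(pkt);				/* process immediately */
	else if (sk->backlog_len < 64)
		sk->backlog[sk->backlog_len++] = pkt;	/* like sk_add_backlog */
	pthread_mutex_unlock(&sk->slock);
}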

connect()

sys_connect->inet_stream_connect->__inet_stream_connect->tcp_v4_connect->tcp_connect

int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			int addr_len, int flags)
{
	int err;

	lock_sock(sock->sk);
	err = __inet_stream_connect(sock, uaddr, addr_len, flags);
	release_sock(sock->sk);
	return err;
}

int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			  int addr_len, int flags)
{
	switch (sock->state) {
	...
	case SS_UNCONNECTED:
		err = -EISCONN;
		if (sk->sk_state != TCP_CLOSE)
			goto out;
		err = sk->sk_prot->connect(sk, uaddr, addr_len); /* tcp_v4_connect->tcp_connect: send the SYN (or fastopen) */
		if (err < 0)
			goto out;
		sock->state = SS_CONNECTING;
		/* Just entered SS_CONNECTING state; the only
		 * difference is that return value in non-blocking
		 * case is EINPROGRESS, rather than EALREADY.
		 */
		err = -EINPROGRESS;
		break;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);	/* 0 for non-blocking sockets */
	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
				tcp_sk(sk)->fastopen_req &&
				tcp_sk(sk)->fastopen_req->data ? 1 : 0;
		/* Error code is set above */
		if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) /* block until connected */
			goto out;
		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
	}
	/* Connection was closed by RST, timeout, ICMP error
	 * or another process disconnected us.
	 */
	if (sk->sk_state == TCP_CLOSE)
		goto sock_error;
	/* sk->sk_err may be not zero now, if RECVERR was ordered by user
	 * and error was received after socket entered established state.
	 * Hence, it is handled normally after connect() return successfully.
	 */
	sock->state = SS_CONNECTED;
	err = 0;
out:
	return err;
	...
}
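The -EINPROGRESS return above is exactly what a non-blocking connect() sees. A userspace sketch of the usual completion dance; nb_connect() and the 5-second timeout are illustrative, and fd is assumed to have O_NONBLOCK set.

#include <errno.h>
#include <poll.h>
#include <sys/socket.h>

int nb_connect(int fd, const struct sockaddr *addr, socklen_t addrlen)
{
	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
	int err = 0;
	socklen_t len = sizeof(err);

	if (connect(fd, addr, addrlen) == 0)
		return 0;			/* connected immediately (rare) */
	if (errno != EINPROGRESS)
		return -1;			/* hard failure */

	if (poll(&pfd, 1, 5000) <= 0)		/* wait for the handshake */
		return -1;
	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
	return err ? -1 : 0;			/* 0: connection established */
}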

read()

SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	struct fd f = fdget_pos(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos = file_pos_read(f.file);
		ret = vfs_read(f.file, buf, count, &pos);
		if (ret >= 0)
			file_pos_write(f.file, pos);	/* a socket's pos stays 0 */
		fdput_pos(f);
	}
	return ret;
}

ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	...
	ret = rw_verify_area(READ, file, pos, count);	/* validate arguments and permissions */
	if (!ret) {
		if (count > MAX_RW_COUNT)
			count = MAX_RW_COUNT;
		ret = __vfs_read(file, buf, count, pos);
		if (ret > 0) {
			fsnotify_access(file);	/* notify listeners via fsnotify */
			add_rchar(current, ret);	/* accounting */
		}
		inc_syscr(current);	/* accounting */
	}
	return ret;
}

ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
		   loff_t *pos)
{
	if (file->f_op->read)
		return file->f_op->read(file, buf, count, pos);
	else if (file->f_op->read_iter)
		return new_sync_read(file, buf, count, pos);	/* socket_file_ops->sock_read_iter */
	else
		return -EINVAL;
}

static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	iov_iter_init(&iter, READ, &iov, 1, len);

	ret = filp->f_op->read_iter(&kiocb, &iter);	/* socket_file_ops->sock_read_iter */
	BUG_ON(ret == -EIOCBQUEUED);
	*ppos = kiocb.ki_pos;
	return ret;
}

static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct socket *sock = file->private_data;
	struct msghdr msg = {.msg_iter = *to,
			     .msg_iocb = iocb};
	ssize_t res;

	if (file->f_flags & O_NONBLOCK)
		msg.msg_flags = MSG_DONTWAIT;
	if (iocb->ki_pos != 0)
		return -ESPIPE;
	if (!iov_iter_count(to))	/* Match SYS5 behaviour */
		return 0;

	res = sock_recvmsg(sock, &msg, msg.msg_flags);
	*to = msg.msg_iter;
	return res;
}

int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags)
{
	int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags);

	return err ?: sock_recvmsg_nosec(sock, msg, flags);
}

static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
				     int flags)
{
	return sock->ops->recvmsg(sock, msg, msg_data_left(msg), flags); /* inet_stream_ops->inet_recvmsg */
}

int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
		 int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	sock_rps_record_flow(sk);
	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,	/* tcp_prot->tcp_recvmsg */
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
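Because tcp_recvmsg() may return fewer bytes than requested, callers usually loop until the wanted length or EOF. A sketch; read_full() is an illustrative helper.

#include <errno.h>
#include <stddef.h>
#include <unistd.h>

ssize_t read_full(int fd, char *buf, size_t len)
{
	size_t off = 0;
	while (off < len) {
		ssize_t n = read(fd, buf + off, len - off);
		if (n == 0)
			break;		/* peer closed: FIN consumed by tcp_recvmsg */
		if (n < 0) {
			if (errno == EINTR)
				continue;
			return -1;	/* includes EAGAIN for O_NONBLOCK */
		}
		off += n;
	}
	return (ssize_t)off;
}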

write()

write() mirrors read():
sys_write->vfs_write->__vfs_write->new_sync_write->sock_write_iter->sock_sendmsg->sock_sendmsg_nosec->inet_sendmsg->tcp_sendmsg

setsockopt()

sys_setsockopt()->sock_common_setsockopt()->tcp_setsockopt()->do_tcp_setsockopt()
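A small usage sketch: options at level IPPROTO_TCP dispatch through sock_common_setsockopt() into tcp_setsockopt() as traced above. disable_nagle() is an illustrative helper.

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int disable_nagle(int fd)
{
	int one = 1;
	/* level IPPROTO_TCP reaches tcp_setsockopt()/do_tcp_setsockopt() */
	return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
}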

close()

sys_close()->__close_fd()->filp_close()->fput()
__close_fd() reclaims the fd; fput() then uses the task_work mechanism to schedule the ____fput callback for deferred execution (task_work callbacks run when the task returns to user space).
____fput()->__fput()->sock_close()->sock_release()->inet_release()->tcp_close()

static void __fput(struct file *file)
{
	struct dentry *dentry = file->f_path.dentry;
	struct vfsmount *mnt = file->f_path.mnt;
	struct inode *inode = file->f_inode;

	might_sleep();
	fsnotify_close(file);
	eventpoll_release(file);	/* if the file was in an epoll set, detach it first */
	locks_remove_file(file);
	if (unlikely(file->f_flags & FASYNC)) {
		if (file->f_op->fasync)
			file->f_op->fasync(-1, file, 0);
	}
	ima_file_free(file);
	if (file->f_op->release)
		file->f_op->release(inode, file);	/* sock_close()->sock_release() */
	security_file_free(file);
	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
		     !(file->f_mode & FMODE_PATH))) {
		cdev_put(inode->i_cdev);
	}
	fops_put(file->f_op);
	put_pid(file->f_owner.pid);
	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
		i_readcount_dec(inode);
	if (file->f_mode & FMODE_WRITER) {
		put_write_access(inode);
		__mnt_drop_write(mnt);
	}
	file->f_path.dentry = NULL;
	file->f_path.mnt = NULL;
	file->f_inode = NULL;
	file_free(file);
	dput(dentry);
	mntput(mnt);
}

void sock_release(struct socket *sock)
{
	if (sock->ops) {
		struct module *owner = sock->ops->owner;

		sock->ops->release(sock);	/* inet_release() */
		sock->ops = NULL;
		module_put(owner);
	}
	if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
		pr_err("%s: fasync list not empty!\n", __func__);

	this_cpu_sub(sockets_in_use, 1);
	if (!sock->file) {
		iput(SOCK_INODE(sock));
		return;
	}
	sock->file = NULL;
}

int inet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (sk) {
		long timeout;

		/* Applications forget to leave groups before exiting */
		ip_mc_drop_socket(sk);

		/* If linger is set, we don't return until the close
		 * is complete. Otherwise we return immediately. The
		 * actually closing is done the same either way.
		 *
		 * If the close is due to the process exiting, we never
		 * linger..
		 */
		timeout = 0;
		if (sock_flag(sk, SOCK_LINGER) &&
		    !(current->flags & PF_EXITING))
			timeout = sk->sk_lingertime;
		sock->sk = NULL;
		sk->sk_prot->close(sk, timeout);	/* tcp_close */
	}
	return 0;
}
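The sk_lingertime passed to tcp_close() above comes from SO_LINGER. A sketch; close_lingering() and the 5-second value are illustrative.

#include <sys/socket.h>
#include <unistd.h>

static int close_lingering(int fd)
{
	/* with SOCK_LINGER set, inet_release() hands sk_lingertime to
	 * tcp_close(), so close() blocks while unsent data drains;
	 * l_onoff=1, l_linger=0 aborts the connection with a RST instead */
	struct linger lg = { .l_onoff = 1, .l_linger = 5 };
	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
	return close(fd);	/* may block up to 5s in tcp_close() */
}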

sendto()

sys_sendto()->sock_sendmsg()->sock_sendmsg_nosec()->inet_sendmsg()->tcp_sendmsg()

sendmsg()

sys_sendmsg()->__sys_sendmsg()->___sys_sendmsg->sock_sendmsg()->sock_sendmsg_nosec()->inet_sendmsg()->tcp_sendmsg()
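A sketch of the sendmsg() entry point: the caller-built msghdr carries an iovec array down the same sock_sendmsg()->inet_sendmsg()->tcp_sendmsg() path as write(). send_two_parts() is an illustrative helper.

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static ssize_t send_two_parts(int fd, const char *hdr, const char *body)
{
	struct iovec iov[2] = {
		{ .iov_base = (void *)hdr,  .iov_len = strlen(hdr)  },
		{ .iov_base = (void *)body, .iov_len = strlen(body) },
	};
	struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 2 };

	return sendmsg(fd, &msg, 0);	/* gathers both buffers into one stream write */
}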