Linux TCP keepalive timer

Concept

To avoid creating and tearing down connections all the time, we keep a connection open and periodically send probe packets to check whether the peer is still there.

  • Detect whether the peer host has gone down, as in the diagram below:

     _____                                                         _____
    |     |                                                       |     |
    |  A  |                                                       |  B  |
    |_____|                                                       |_____|
       ^                                                             ^
       |--->--->--->-------------- SYN -------------->--->--->---|
       |---<---<---<------------ SYN/ACK ------------<---<---<---|
       |--->--->--->-------------- ACK -------------->--->--->---|
       |                                                         |
       |                                   system crash ---> X
       |
       |                                 system restart ---> ^
       |                                                         |
       |--->--->--->-------------- PSH -------------->--->--->---|
       |---<---<---<-------------- RST --------------<---<---<---|
       |                                                         |
  • Detect network problems
    For example, behind a NAT or proxy the mapping table may fill up, causing old connections to be silently dropped.

This is what the TCP keepalive timer is implemented for.

HTTP keepalive vs. TCP keepalive

HTTP keepalive: the client sets the Connection: keep-alive request header to ask for a persistent connection. After the server sends its response, the TCP connection is kept open for the client's next request as long as it arrives within a certain timeout. That timeout is configured in the application server, and the timer is implemented by the user-space program itself. In HTTP/1.1 persistent connections are enabled by default.

TCP keepalive, by contrast, only probes whether the peer is still connected.

So the two mechanisms have nothing to do with each other.

Enabling TCP keepalive

Use setsockopt to set the SO_KEEPALIVE option. The kernel side of the relevant options:

case SO_KEEPALIVE: // enable or disable keepalive
    if (sk->sk_protocol == IPPROTO_TCP &&
        sk->sk_type == SOCK_STREAM)
        tcp_set_keepalive(sk, valbool);

    sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
    break;

case TCP_KEEPIDLE: // keepalive idle time; probing starts only after the connection has been idle this long
    if (val < 1 || val > MAX_TCP_KEEPIDLE)
        err = -EINVAL;
    else {
        tp->keepalive_time = val * HZ;
        if (sock_flag(sk, SOCK_KEEPOPEN) &&
            !((1 << sk->sk_state) &
              (TCPF_CLOSE | TCPF_LISTEN))) {
            u32 elapsed = keepalive_time_elapsed(tp);
            if (tp->keepalive_time > elapsed)
                elapsed = tp->keepalive_time - elapsed;
            else
                elapsed = 0;
            inet_csk_reset_keepalive_timer(sk, elapsed);
        }
    }
    break;
case TCP_KEEPINTVL: // interval between probes once the idle time has expired
    if (val < 1 || val > MAX_TCP_KEEPINTVL)
        err = -EINVAL;
    else
        tp->keepalive_intvl = val * HZ;
    break;
case TCP_KEEPCNT: // maximum number of keepalive probes
    if (val < 1 || val > MAX_TCP_KEEPCNT)
        err = -EINVAL;
    else
        tp->keepalive_probes = val;
    break;

If the latter three socket options are not set, the system-wide sysctl defaults are used:

net.ipv4.tcp_keepalive_intvl = 75   // 75 seconds between probes
net.ipv4.tcp_keepalive_probes = 9   // at most 9 probes
net.ipv4.tcp_keepalive_time = 7200  // 2 hours of idle time before probing starts

So by default, once a connection has been idle for 2 hours the kernel starts probing, sending up to 9 probes spaced 75 seconds apart.
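
A minimal user-space sketch of the same thing (the descriptor fd and the chosen values are illustrative assumptions, not taken from the post): enable SO_KEEPALIVE and override the three per-socket knobs; leaving out any of the last three calls falls back to the sysctl defaults above.

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>

    /* Hypothetical helper: turn on keepalive for an already-connected TCP socket.
     * Returns 0 on success, -1 on error (errno set by setsockopt). */
    static int enable_keepalive(int fd)
    {
        int on = 1;      /* SO_KEEPALIVE: arm the keepalive timer */
        int idle = 60;   /* TCP_KEEPIDLE: start probing after 60 s of idle time */
        int intvl = 10;  /* TCP_KEEPINTVL: 10 s between probes */
        int cnt = 5;     /* TCP_KEEPCNT: give up after 5 unanswered probes */

        if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) < 0)
            return -1;
        if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle)) < 0)
            return -1;
        if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl)) < 0)
            return -1;
        if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt)) < 0)
            return -1;
        return 0;
    }

With these example values a dead peer would be detected roughly 60 + 5 × 10 = 110 seconds after the connection goes idle.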

Arming the timer

  • When SO_KEEPALIVE is set on a non-listen socket, or when TCP_KEEPIDLE is set on a socket that already has SO_KEEPALIVE enabled, the timer is (re)armed:

    void tcp_set_keepalive(struct sock *sk, int val)
    {
        if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) // no timer needed in CLOSE or LISTEN state
            return;

        if (val && !sock_flag(sk, SOCK_KEEPOPEN)) // first time SO_KEEPALIVE is enabled
            inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
        else if (!val) // keepalive disabled: remove the timer
            inet_csk_delete_keepalive_timer(sk);
    }

    void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
    {
        sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
    }

    static inline int keepalive_time_when(const struct tcp_sock *tp)
    {
        struct net *net = sock_net((struct sock *)tp);

        return tp->keepalive_time ? : net->ipv4.sysctl_tcp_keepalive_time; // the per-socket TCP_KEEPIDLE value takes precedence over the sysctl
    }
  • On the client, when the SYN/ACK arrives and the socket enters TCP_ESTABLISHED, if SO_KEEPALIVE has been set:

    void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
    {
        ...
        tcp_set_state(sk, TCP_ESTABLISHED);
        ...
        if (sock_flag(sk, SOCK_KEEPOPEN))
            inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
        ...
    }
  • On the server, when the new sock is created at the end of the handshake and enters TCP_SYN_RECV, e.g. after a TFO SYN whose TFO cookie passes validation, or on the final ACK of the three-way handshake (a user-space sketch follows this list):

    struct sock *tcp_create_openreq_child(const struct sock *sk,
                                          struct request_sock *req,
                                          struct sk_buff *skb)
    {
        struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);

        if (newsk) {
            ...
            if (sock_flag(newsk, SOCK_KEEPOPEN))
                inet_csk_reset_keepalive_timer(newsk,
                                               keepalive_time_when(newtp)); // defaults to TCP_KEEPALIVE_TIME when TCP_KEEPIDLE is unset
            ...
        }
        return newsk;
    }
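
Since tcp_create_openreq_child() clones the listening sock, the SOCK_KEEPOPEN flag is inherited by every accepted connection. A small user-space sketch of what that implies (the listening descriptor lfd and the backlog are illustrative assumptions): setting SO_KEEPALIVE once on the listening socket is enough, and each accepted sock gets its keepalive timer armed as soon as the handshake completes, without any extra setsockopt on the accepted descriptor.

    #include <sys/socket.h>

    /* Hypothetical helper: enable keepalive on the listening socket so that
     * accepted sockets inherit SOCK_KEEPOPEN via the clone performed in
     * tcp_create_openreq_child(). */
    static int listen_with_keepalive(int lfd, int backlog)
    {
        int on = 1;

        if (setsockopt(lfd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) < 0)
            return -1;
        return listen(lfd, backlog);
    }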

Timer expiry handling

static void tcp_keepalive_timer (unsigned long data)
{
    struct sock *sk = (struct sock *) data;
    struct inet_connection_sock *icsk = inet_csk(sk);
    struct tcp_sock *tp = tcp_sk(sk);
    u32 elapsed;

    /* Only process if socket is not in use. */
    bh_lock_sock(sk);
    if (sock_owned_by_user(sk)) { // the sock is locked by the application; retry shortly
        /* Try again later. */
        inet_csk_reset_keepalive_timer (sk, HZ/20);
        goto out;
    }

    if (sk->sk_state == TCP_LISTEN) {
        pr_err("Hmm... keepalive on a LISTEN ???\n");
        goto out;
    }

    if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { // the same sk_timer also handles the FIN_WAIT2 timeout
        if (tp->linger2 >= 0) {
            const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;

            if (tmo > 0) {
                tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
                goto out;
            }
        }
        tcp_send_active_reset(sk, GFP_ATOMIC);
        goto death;
    }

    if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) // SO_KEEPALIVE not set, or the connection is already closed
        goto out;

    elapsed = keepalive_time_when(tp); // defaults to sysctl_tcp_keepalive_time = 7200 s (2 hours)

    /* It is alive without keepalive 8) */
    if (tp->packets_out || tcp_send_head(sk)) // data is in flight or queued: the connection is active, no probe needed
        goto resched;

    elapsed = keepalive_time_elapsed(tp); // time since we last heard from the peer

    if (elapsed >= keepalive_time_when(tp)) { // idle longer than the keepalive idle time
        /* If the TCP_USER_TIMEOUT option is enabled, use that
         * to determine when to timeout instead.
         */
        if ((icsk->icsk_user_timeout != 0 &&
            elapsed >= icsk->icsk_user_timeout &&
            icsk->icsk_probes_out > 0) || // TCP_USER_TIMEOUT set: use it, provided at least one probe has already gone out
            (icsk->icsk_user_timeout == 0 &&
            icsk->icsk_probes_out >= keepalive_probes(tp))) { // TCP_USER_TIMEOUT not set: maximum probe count reached
            tcp_send_active_reset(sk, GFP_ATOMIC); // send a RST and tear the connection down
            tcp_write_err(sk);
            goto out;
        }
        if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
            icsk->icsk_probes_out++;
            elapsed = keepalive_intvl_when(tp); // next probe after one probe interval
        } else {
            /* If keepalive was lost due to local congestion,
             * try harder.
             */
            elapsed = TCP_RESOURCE_PROBE_INTERVAL;
        }
    } else {
        /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
        elapsed = keepalive_time_when(tp) - elapsed; // not idle long enough yet; sleep for the remainder
    }

    sk_mem_reclaim(sk);

resched:
    inet_csk_reset_keepalive_timer (sk, elapsed);
    goto out;

death:
    tcp_done(sk);

out:
    bh_unlock_sock(sk);
    sock_put(sk);
}

// lrcvtime is when the last data segment was received
// rcv_tstamp is when the last ACK was received
static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp)
{
    const struct inet_connection_sock *icsk = &tp->inet_conn;

    return min_t(u32, tcp_time_stamp - icsk->icsk_ack.lrcvtime,
                      tcp_time_stamp - tp->rcv_tstamp);
}

static inline int keepalive_intvl_when(const struct tcp_sock *tp)
{
    struct net *net = sock_net((struct sock *)tp);

    return tp->keepalive_intvl ? : net->ipv4.sysctl_tcp_keepalive_intvl;
}

static inline int keepalive_probes(const struct tcp_sock *tp)
{
    struct net *net = sock_net((struct sock *)tp);

    return tp->keepalive_probes ? : net->ipv4.sysctl_tcp_keepalive_probes;
}
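
Putting numbers on the logic above: with the default sysctls, an unresponsive peer is reset roughly tcp_keepalive_time + tcp_keepalive_probes × tcp_keepalive_intvl = 7200 + 9 × 75 = 7875 seconds (about 2 h 11 min) after the last segment was received, and tcp_write_err() then surfaces the failure to the application (typically as ETIMEDOUT). The TCP_USER_TIMEOUT branch shortens this: once at least one probe is out, the connection is reset as soon as the idle time exceeds the user timeout. A hedged user-space sketch of that knob (the descriptor fd and the 30-second value are illustrative assumptions; the option takes milliseconds):

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>

    /* Hypothetical helper: bound how long unacknowledged data (or, combined with
     * SO_KEEPALIVE, an idle connection with probes outstanding) may linger before
     * the kernel resets the connection. */
    static int set_user_timeout(int fd, unsigned int timeout_ms)
    {
        return setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
                          &timeout_ms, sizeof(timeout_ms));
    }

    /* e.g. set_user_timeout(fd, 30000); gives up after about 30 seconds. */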

Sending the probe

If there is unsent data in the write queue, no dedicated probe is needed: simply sending that data serves the same purpose.
Otherwise a zero-length ACK with sequence number snd_una - 1 is sent. Because that sequence number has already been acknowledged, a live peer must answer it with a duplicate ACK; a peer that has rebooted and has no state for the connection answers with a RST (as in the diagram at the top); and a dead peer stays silent, so icsk_probes_out keeps growing until the limit above is hit.

int tcp_write_wakeup(struct sock *sk, int mib)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct sk_buff *skb;

    if (sk->sk_state == TCP_CLOSE)
        return -1;

    skb = tcp_send_head(sk);
    // there is unsent data in the write queue and it starts inside the peer's receive window
    if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
        int err;
        unsigned int mss = tcp_current_mss(sk);
        unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; // largest segment the peer's receive window allows

        if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
            tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;

        /* We are probing the opening of a window
         * but the window size is != 0
         * must have been a result SWS avoidance ( sender )
         */
        // the skb exceeds the limit and must be fragmented
        if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
            skb->len > mss) {
            seg_size = min(seg_size, mss);
            TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
            if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
                return -1;
        } else if (!tcp_skb_pcount(skb))
            tcp_set_skb_tso_segs(skb, mss);

        TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; // set PSH so the peer delivers the data to the application right away
        err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); // transmit the segment
        if (!err)
            tcp_event_new_data_sent(sk, skb);
        return err;
    } else {
        // no new data to send, or the peer's window is full
        // in urgent mode, additionally send an ACK with sequence snd_una to convey the urgent pointer
        if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
            tcp_xmit_probe_skb(sk, 1, mib);
        return tcp_xmit_probe_skb(sk, 0, mib); // zero-length ACK with sequence snd_una - 1, used as the probe
    }
}

/* This routine sends a packet with an out of date sequence
 * number. It assumes the other end will try to ack it.
 *
 * Question: what should we make while urgent mode?
 * 4.4BSD forces sending single byte of data. We cannot send
 * out of window data, because we have SND.NXT==SND.MAX...
 *
 * Current solution: to send TWO zero-length segments in urgent mode:
 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
 * out-of-date with SND.UNA-1 to probe window.
 */
static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct sk_buff *skb;

    /* We don't queue it, tcp_transmit_skb() sets ownership. */
    skb = alloc_skb(MAX_TCP_HEADER,
                    sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
    if (!skb)
        return -1;

    /* Reserve space for headers and set control bits. */
    skb_reserve(skb, MAX_TCP_HEADER);
    /* Use a previous sequence. This should cause the other
     * end to send an ack. Don't queue or clone SKB, just
     * send it.
     */
    tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
    skb_mstamp_get(&skb->skb_mstamp);
    NET_INC_STATS(sock_net(sk), mib);
    return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
}