Linux TCP GSO and TSO implementation

Concepts

TSO (TCP Segmentation Offload): a technique that offloads the segmentation of large packets to the NIC, reducing CPU load. Its essence is deferred segmentation.
GSO (Generic Segmentation Offload): controls whether the protocol stack defers segmentation. Just before the packet reaches the NIC, the stack checks whether the NIC supports TSO: if it does, the NIC performs the segmentation; if not, the stack segments the packet in software and hands the resulting segments to the driver. When TSO is enabled, GSO is enabled automatically.

  • GSO on, TSO on: the stack defers segmentation and hands the large packet straight to the NIC, which segments it in hardware
  • GSO on, TSO off: the stack defers segmentation and only segments the packet in software right before handing it to the NIC
  • GSO off, TSO on: same as GSO on, TSO on
  • GSO off, TSO off: no deferral; tcp_sendmsg sends MSS-sized packets directly

Enabling GSO/TSO

  • When registering a network device, the driver enables GSO by default: NETIF_F_GSO
  • The driver sets TSO (NETIF_F_TSO) according to whether the NIC hardware supports it
  • GSO/TSO can be toggled with ethtool -K (a small ioctl sketch follows the kernel snippet below)
#define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO)
int register_netdevice(struct net_device *dev)
{
...
/* Transfer changeable features to wanted_features and enable
* software offloads (GSO and GRO).
*/
dev->hw_features |= NETIF_F_SOFT_FEATURES;
dev->features |= NETIF_F_SOFT_FEATURES; //默认开启GRO/GSO
dev->wanted_features = dev->features & dev->hw_features;
...
}
static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
...
netdev->features = NETIF_F_SG |
NETIF_F_TSO |
NETIF_F_TSO6 |
NETIF_F_RXHASH |
NETIF_F_RXCSUM |
NETIF_F_HW_CSUM;
register_netdev(netdev);
...
}
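
The last bullet above mentioned ethtool -K; under the hood that tool talks to the driver through the SIOCETHTOOL ioctl. Below is a minimal userspace sketch (the interface name "eth0" is an assumption; run with root privileges) that reads the GSO/TSO state the same way; it is an illustration only, not a replacement for ethtool.

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);       /* any socket works as an ioctl handle */
    struct ifreq ifr;
    struct ethtool_value eval;

    memset(&ifr, 0, sizeof(ifr));
    strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);   /* assumption: interface eth0 */
    ifr.ifr_data = (char *)&eval;

    eval.cmd = ETHTOOL_GGSO;                       /* query GSO state */
    if (ioctl(fd, SIOCETHTOOL, &ifr) == 0)
        printf("gso: %s\n", eval.data ? "on" : "off");

    eval.cmd = ETHTOOL_GTSO;                       /* query TSO state */
    if (ioctl(fd, SIOCETHTOOL, &ifr) == 0)
        printf("tso: %s\n", eval.data ? "on" : "off");

    /* Toggling would use ETHTOOL_SGSO / ETHTOOL_STSO with eval.data = 0 or 1,
     * followed by the same ioctl (requires CAP_NET_ADMIN). */
    return 0;
}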

Whether to defer segmentation

From the code above we know that whether GSO/TSO is enabled is recorded in dev->features. Since the device is bound to a route, once the route lookup is done the configuration can be cached in the sock:
for example, both tcp_v4_connect and tcp_v4_syn_recv_sock call sk_setup_caps to set up the GSO/TSO capabilities.
Note that as long as GSO is enabled, NETIF_F_TSO is set in sk_route_caps even if the hardware does not support TSO, so sk_can_gso(sk) returns true whenever either GSO or TSO is enabled.

#define NETIF_F_ALL_TSO (NETIF_F_TSO | NETIF_F_TSO6 | \
NETIF_F_TSO_ECN | NETIF_F_TSO_MANGLEID)
/* List of features with software fallbacks. */
#define NETIF_F_GSO_SOFTWARE (NETIF_F_ALL_TSO | NETIF_F_UFO | \
NETIF_F_GSO_SCTP)
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
u32 max_segs = 1;
sk_dst_set(sk, dst);
sk->sk_route_caps = dst->dev->features;
if (sk->sk_route_caps & NETIF_F_GSO) //软件GSO,默认开启
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; //开启延时gso延时选项,包括NETIF_F_TSO
sk->sk_route_caps &= ~sk->sk_route_nocaps;
if (sk_can_gso(sk)) { //开启tso,对于tcpv4, 设置了NETIF_F_TSO
if (dst->header_len) {
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; // 开启gso后,设置sg和校验
sk->sk_gso_max_size = dst->dev->gso_max_size; //GSO_MAX_SIZE=65536
max_segs = max_t(u32, dst->dev->gso_max_segs, 1); //GSO_MAX_SEGS=65535
}
}
sk->sk_gso_max_segs = max_segs;
}
//判断GSO或TSO是否开启
static inline bool sk_can_gso(const struct sock *sk)
{
return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type); //SKB_GSO_TCPV4
}
//对于tcp4, 判断NETIF_F_TSO是否被设置, 即使硬件不支持TSO,开启GSO的情况下也会被设置
static inline bool net_gso_ok(netdev_features_t features, int gso_type)
{
netdev_features_t feature = (netdev_features_t)gso_type << NETIF_F_GSO_SHIFT;
/* check flags correspondence */
BUILD_BUG_ON(SKB_GSO_TCPV4 != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_UFO >> NETIF_F_GSO_SHIFT));
return (features & feature) == feature;
}

GSO packet length

Only for urgent data, or when both GSO and TSO are disabled, is segmentation not deferred; in that case the current MSS is used.
With GSO enabled, tcp_send_mss returns the MSS together with the per-skb GSO size goal, which is an integer multiple of the MSS.

struct sock {
...
int sk_gso_type; //SKB_GSO_TCPV4
unsigned int sk_gso_max_size; //GSO_MAX_SIZE=65536
u16 sk_gso_max_segs; //GSO_MAX_SEGS=65535, 基本不限制了
}
static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
int mss_now;
mss_now = tcp_current_mss(sk);
*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
return mss_now;
}
static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
int large_allowed)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 new_size_goal, size_goal;
if (!large_allowed || !sk_can_gso(sk)) //紧急数据,或者不支持gso/tso
return mss_now;
/* Note : tcp_tso_autosize() will eventually split this later */
new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal); //最多达到收到的最大rwnd窗口通告的一半
/* We try hard to avoid divides here */
size_goal = tp->gso_segs * mss_now;
if (unlikely(new_size_goal < size_goal ||
new_size_goal >= size_goal + mss_now)) {
tp->gso_segs = min_t(u16, new_size_goal / mss_now, //更新分段数
sk->sk_gso_max_segs);
size_goal = tp->gso_segs * mss_now;
}
return max(size_goal, mss_now);
}
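
To make the size_goal arithmetic concrete, here is a small standalone sketch of the core computation in tcp_xmit_size_goal(), using assumed values (mss 1448, gso_max_size 65536, MAX_TCP_HEADER taken as 320, and the half-window bound from tcp_bound_to_half_wnd() omitted); the real numbers depend on the route, TCP options and kernel version.

#include <stdio.h>

int main(void)
{
    unsigned int mss_now      = 1448;    /* assumed current MSS */
    unsigned int gso_max_size = 65536;   /* GSO_MAX_SIZE */
    unsigned int gso_max_segs = 65535;   /* GSO_MAX_SEGS */
    unsigned int max_tcp_hdr  = 320;     /* assumption for MAX_TCP_HEADER */

    unsigned int new_size_goal = gso_max_size - 1 - max_tcp_hdr;  /* 65215 */
    /* tcp_bound_to_half_wnd() would additionally cap this at half of the
     * largest receive window ever advertised by the peer; omitted here. */
    unsigned int gso_segs = new_size_goal / mss_now;              /* 45 */
    if (gso_segs > gso_max_segs)
        gso_segs = gso_max_segs;
    unsigned int size_goal = gso_segs * mss_now;                  /* 65160 */

    printf("size_goal = %u bytes = %u * mss(%u)\n", size_goal, gso_segs, mss_now);
    return 0;
}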

tcp_sendmsg

When an application calls send(), tcp_sendmsg tries to accumulate up to size_goal bytes of data in a single skb; tcp_push then transmits these skbs via tcp_write_xmit.

int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
//size_goal表示GSO支持的大小,为mss_now的整数倍,不支持GSO时则相等
mss_now = tcp_send_mss(sk, &size_goal, flags);
sg = !!(sk->sk_route_caps & NETIF_F_SG);
while (msg_data_left(msg)) {
int copy = 0;
int max = size_goal;
skb = tcp_write_queue_tail(sk); // 发送队列的最后一个skb, 如果该skb未发送且还有剩余空间,尝试往该skb中继续填充数据
if (tcp_send_head(sk)) { //有未发送的数据
if (skb->ip_summed == CHECKSUM_NONE) //比如路由变更,之前的不支持TSO,现在的支持了
max = mss_now; //上一个不支持GSO的skb,继续不支持
copy = max - skb->len;
}
//copy<=0表示不能合并到之前skb做GSO,或者被设置了eor标记不能合并
if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
bool first_skb;
new_segment:
first_skb = skb_queue_empty(&sk->sk_write_queue); //发送队列为空
skb = sk_stream_alloc_skb(sk, //分配新的skb
select_size(sk, sg, first_skb), //first_skb才会把数据放到线性区
sk->sk_allocation,
first_skb);
/*
* Check whether we can use HW checksum.
*/
if (sk_check_csum_caps(sk)) //sk_setup_caps中被设置
skb->ip_summed = CHECKSUM_PARTIAL; //GSO的skb都会被设置为CHECKSUM_PARTIAL
skb_entail(sk, skb); // 加入发送队列, 去掉skb的TCP_NAGLE_PUSH标记
copy = size_goal; // 新建的skb,还没有往里面copy数据
max = size_goal;
}
/* Try to append data to the end of skb. */
if (copy > msg_data_left(msg))
copy = msg_data_left(msg);
/* Where to copy to? */
if (skb_availroom(skb) > 0) {//有剩余线性空间,且没有frag数据, 只有first_skb才会把数据放到线性区
/* We have some space in skb head. Superb! */
copy = min_t(int, copy, skb_availroom(skb)); //能copy多少就copy多少
err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy); //copy用户态数据到skb线性区
if (err)
goto do_fault;
} else { //说明要使用frag区了
bool merge = true;
int i = skb_shinfo(skb)->nr_frags;
struct page_frag *pfrag = sk_page_frag(sk);
if (!sk_page_frag_refill(sk, pfrag)) //查看pfrag中空间够不够,不够的话尝试分配,分配失败则返回false
goto wait_for_memory;
if (!skb_can_coalesce(skb, i, pfrag->page, //pfrag->page和frags[i-1]是否使用相同页,并且page_offset相同
pfrag->offset)) {
if (i >= sysctl_max_skb_frags || !sg) {
tcp_mark_push(tp, skb); //达到最大frags数了,标记push,并重新分配新的skb
goto new_segment;
}
merge = false; //说明和之前frags中是不同的page,不用merge
}
copy = min_t(int, copy, pfrag->size - pfrag->offset); //pfrage中剩余空间
err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, //copy到pfrag中
pfrag->page,
pfrag->offset,
copy);
/* Update the skb. */
if (merge) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); //pfrag和frags[i - 1]是相同的
} else { //把pfrag设置为新的frags[i]
skb_fill_page_desc(skb, i, pfrag->page,
pfrag->offset, copy);
get_page(pfrag->page); //增加引用计数
}
pfrag->offset += copy;
}
if (!copied)
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; //第一个包
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
tcp_skb_pcount_set(skb, 0); //清零tso分段数,让tcp_write_xmit去计算
copied += copy;
if (!msg_data_left(msg)) {
tcp_tx_timestamp(sk, sockc.tsflags, skb);
if (unlikely(flags & MSG_EOR))
TCP_SKB_CB(skb)->eor = 1; //发完数据的话设置eor标记
goto out;
}
// 还有数据没copy,并且没有达到最大可拷贝的大小, 尝试往该skb继续添加数据
if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
continue;
//还有数据没copy,但是当前skb已经满了,可以发送了
if (forced_push(tp)) { //超过最大窗口的一半没有设置push了
tcp_mark_push(tp, skb); //设置push标记,更新pushed_seq
__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); //调用tcp_write_xmit马上发送
} else if (skb == tcp_send_head(sk)) //第一个包,直接发送
tcp_push_one(sk, mss_now);
continue; //说明发送队列前面还有skb等待发送,且距离之前push的包还不是非常久
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
if (copied) //先把copied的发出去再等内存
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
err = sk_stream_wait_memory(sk, &timeo); //阻塞等待内存
mss_now = tcp_send_mss(sk, &size_goal, flags);
}
//所有数据都放到发送队列中了,调用tcp_push发送
out:
if (copied)
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
out_nopush:
release_sock(sk);
return copied + copied_syn;
}
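
From userspace all of this is transparent: one large send() is enough, and tcp_sendmsg packs the payload into size_goal-sized skbs on its own. A minimal sender sketch (assuming something is already listening on 127.0.0.1:9000, e.g. `nc -l 9000 > /dev/null`) that hands the stack 256 KB in a single call:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in peer = {
        .sin_family = AF_INET,
        .sin_port   = htons(9000),          /* assumption: test port 9000 */
    };
    inet_pton(AF_INET, "127.0.0.1", &peer.sin_addr);

    if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0) {
        perror("connect");
        return 1;
    }

    size_t len = 256 * 1024;
    char *buf = malloc(len);
    memset(buf, 'A', len);

    ssize_t n = send(fd, buf, len, 0);      /* one call; the stack splits it into GSO skbs */
    printf("queued %zd bytes with a single send()\n", n);

    free(buf);
    close(fd);
    return 0;
}

On the sending host, tcpdump captures the skb before the NIC segments it, so with TSO enabled the capture shows "packets" far larger than the MSS.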

tcp_write_xmit

tcp_sendmsg has already packed the data into skbs of up to the maximum GSO size; tcp_write_xmit finally sends these GSO packets.
tcp_write_xmit checks the congestion window and runs the Nagle and TSQ (TCP Small Queues) tests to decide whether the whole skb, or only part of it, may be sent; if only part can go out, tso_fragment is called to split the skb.
The skb is finally transmitted via tcp_transmit_skb; as long as the send window does not get in the way, each skb carries close to the maximum GSO amount of data.

static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp)
{
...
max_segs = tcp_tso_segs(sk, mss_now); //当前tso支持的最大segs数量, 会结合pacing做计算
while ((skb = tcp_send_head(sk))) { //遍历发送队列
tso_segs = tcp_init_tso_segs(skb, mss_now); //skb->len/mss,重新设置tcp_gso_segs,因为在tcp_sendmsg中被清零了
...
if (tso_segs == 1) {//tso_segs=1表示无需tso分段
/* 根据nagle算法,计算是否需要推迟发送数据 */
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
(tcp_skb_is_last(sk, skb) ?
nonagle : TCP_NAGLE_PUSH)))) //last skb就直接发送
break; //推迟发送
} else { //tso分段
if (!push_one && //push所有skb
tcp_tso_should_defer(sk, skb, &is_cwnd_limited, //如果发送窗口剩余不多,并且预计下一个ack将很快到来(意味着可用窗口会增加),则推迟发送
max_segs))
break; //可以推迟
}
//不用推迟发送,马上发送
limit = mss_now;
if (tso_segs > 1 && !tcp_urg_mode(tp)) //tso且非urg模式
limit = tcp_mss_split_point(sk, skb, mss_now, //返回当前skb中可以发送的数据大小,通过mss和cwnd
min_t(unsigned int,
cwnd_quota, //cwnd的剩余配额
max_segs),
nonagle); //对最后一个段,如果没有写满,则用nagle判断是否要延时发
/* 当skb的长度大于限制时,需要调用tso_fragment分片,如果分段失败则暂不发送 */
if (skb->len > limit && //可能窗口限制,或者最后一个seg<mss 并且需要nagle发送
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) //按limit切割成两个skb
break;
if (tcp_small_queue_check(sk, skb, 0)) //tsq检查,qdisc是否达到限制
break;
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) //发送,如果包被qdisc丢了,则退出循环,不继续发送了
break;
tcp_event_new_data_sent(sk, skb);//更新sk_send_head和packets_out
..
}
...
}

tcp_tso_segs

tcp_tso_segs returns the maximum number of segments currently allowed in a single TSO packet

  • If the congestion control algorithm provides a value, that value is used
  • Otherwise tcp_tso_autosize() is called, which uses the pacing rate to compute how many segments can be sent per millisecond (a worked example follows the code below)
/* Return how many segs we'd like on a TSO packet,
* to send one TSO packet per ms
*/
u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
int min_tso_segs)
{
u32 bytes, segs;
bytes = min(sk->sk_pacing_rate >> 10, //每毫秒字节数
sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); //设备支持的最大tso大小
/* Goal is to send at least one packet per ms,
* not one big TSO packet every 100 ms.
* This preserves ACK clocking and is consistent
* with tcp_tso_should_defer() heuristic.
*/
segs = max_t(u32, bytes / mss_now, min_tso_segs); //每毫秒支持的tso segs数量
return min_t(u32, segs, sk->sk_gso_max_segs);
}
/* Return the number of segments we want in the skb we are transmitting.
* See if congestion control module wants to decide; otherwise, autosize.
*/
static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
{
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; //如果拥塞算法提供,则返回该值
return tso_segs ? :
tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs); //根据pacing和设备支持的最大tso大小,计算每毫秒的tso segs数量
}
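
Plugging numbers into tcp_tso_autosize() makes the heuristic clearer. A standalone sketch with assumed values (pacing rate 125 MB/s, i.e. roughly 1 Gbit/s, mss 1448, min_tso_segs 2, MAX_TCP_HEADER taken as 320):

#include <stdio.h>

int main(void)
{
    unsigned long pacing_rate  = 125UL * 1000 * 1000; /* bytes per second, assumed */
    unsigned int  mss          = 1448;
    unsigned int  min_tso_segs = 2;
    unsigned int  gso_max_size = 65536;
    unsigned int  gso_max_segs = 65535;

    unsigned long bytes = pacing_rate >> 10;          /* ~bytes per ms (divide by 1024) */
    if (bytes > gso_max_size - 1 - 320)
        bytes = gso_max_size - 1 - 320;               /* capped by the device GSO limit */

    unsigned int segs = bytes / mss;                  /* 65215 / 1448 = 45 */
    if (segs < min_tso_segs)
        segs = min_tso_segs;
    if (segs > gso_max_segs)
        segs = gso_max_segs;

    printf("TSO goal: %u segs, about %u bytes per TSO packet\n", segs, segs * mss);
    return 0;
}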

tcp_init_tso_segs

tcp_sendmsg cleared tso_segs to 0, so the final tso_segs value is computed here:
it is set to skb->len / mss, rounded up (a small arithmetic example follows the code below).

static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
int tso_segs = tcp_skb_pcount(skb);
if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
tcp_set_skb_tso_segs(skb, mss_now);
tso_segs = tcp_skb_pcount(skb);
}
return tso_segs;
}
static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
/* Avoid the costly divide in the normal
* non-TSO case.
*/
tcp_skb_pcount_set(skb, 1);
TCP_SKB_CB(skb)->tcp_gso_size = 0;
} else {
tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
}
}
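
The pcount computed here is just a ceiling division of the payload length by the MSS, e.g. a 10000-byte skb with mss 1448 becomes 7 segments:

#include <stdio.h>

/* Same rounding as the kernel's DIV_ROUND_UP macro */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    unsigned int len = 10000, mss = 1448;
    printf("gso_segs = %u\n", DIV_ROUND_UP(len, mss));  /* 7 */
    return 0;
}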

tcp_tso_should_defer

Small packets go straight to the tcp_nagle_test() check; for large packets tcp_tso_should_defer() decides whether to postpone sending.
If the remaining send window is not large (even though the current skb could be sent) and the next ACK is expected to arrive soon (meaning the usable window will grow), sending is deferred.

sysctl_tcp_tso_win_divisor = 3 means that if more than 1/3 of the window is still available, sending is not deferred.

static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
bool *is_cwnd_limited, u32 max_segs)
{
...
send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; //发送窗口
/* From in_flight test above, we know that cwnd > in_flight. */
cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache; //拥塞窗口
limit = min(send_win, cong_win); //最大发送窗口剩余
/* If a full-sized TSO skb can be sent, do it. */
if (limit >= max_segs * tp->mss_cache) //支持最大尺寸的tso发送
goto send_now;
/* Middle in queue won't get any more data, full sendable already? */
if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) //不是发送队列的最后一个,且满足发送窗口
goto send_now; //直接发送,不会有数据被添加到这个skb了
win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
if (win_divisor) {
u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
/* If at least some fraction of a window is available,
* just use it.
*/
chunk /= win_divisor;
if (limit >= chunk) //剩余的窗口大于总窗口的比例, 默认1/3
goto send_now;
} else {
/* Different approach, try not to defer past a single
* ACK. Receiver should ACK every other full sized
* frame, so if we have space for more than 3 frames
* then send now.
*/
if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
goto send_now;
}
head = tcp_write_queue_head(sk);
skb_mstamp_get(&now);
age = skb_mstamp_us_delta(&now, &head->skb_mstamp); //最早的未确认包的距离现在的时间
/* If next ACK is likely to come too late (half srtt), do not defer */
if (age < (tp->srtt_us >> 4)) // 也就是说下一个ack的到来很可能大于1/2的srtt,直接发送
goto send_now;
/* Ok, it looks like it is advisable to defer. */
//当前skb的收到cwnd限制
if (cong_win < send_win && cong_win <= skb->len)
*is_cwnd_limited = true;
//可以推迟发送了
return true;
send_now:
return false;
}
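
A quick worked example of the win_divisor branch, with assumed values (snd_wnd 64 KB, snd_cwnd 20, 10 packets in flight, mss 1448, divisor 3); it only mirrors the window arithmetic above, not the full function:

#include <stdio.h>

int main(void)
{
    unsigned int mss = 1448, snd_wnd = 65536, snd_cwnd = 20, in_flight = 10;
    unsigned int win_divisor = 3, max_segs = 45;

    unsigned int send_win = snd_wnd;                      /* simplified: wnd_end - seq */
    unsigned int cong_win = (snd_cwnd - in_flight) * mss; /* 14480 */
    unsigned int limit = send_win < cong_win ? send_win : cong_win;

    if (limit >= max_segs * mss) {                        /* room for a full-sized TSO skb */
        printf("send now: full-sized TSO fits\n");
        return 0;
    }

    unsigned int chunk = snd_wnd < snd_cwnd * mss ? snd_wnd : snd_cwnd * mss;
    chunk /= win_divisor;                                 /* 28960 / 3 = 9653 */
    if (limit >= chunk)
        printf("send now: %u bytes left >= 1/%u of the window (%u)\n",
               limit, win_divisor, chunk);
    else
        printf("defer: only %u bytes of window left (< %u)\n", limit, chunk);
    return 0;
}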

tcp_mss_split_point

Returns how much of the current skb can be sent right away, bounded by the congestion and send windows.
If the last segment is not a full MSS, the Nagle rules decide whether it may be sent now or has to wait.

/* Returns the portion of skb which can be sent right away */
static unsigned int tcp_mss_split_point(const struct sock *sk,
const struct sk_buff *skb,
unsigned int mss_now,
unsigned int max_segs,
int nonagle)
{
const struct tcp_sock *tp = tcp_sk(sk);
u32 partial, needed, window, max_len;
window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
max_len = mss_now * max_segs;
if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
return max_len;
needed = min(skb->len, window);
if (max_len <= needed)
return max_len;
partial = needed % mss_now;
/* If last segment is not a full MSS, check if Nagle rules allow us
* to include this last segment in this skb.
* Otherwise, we'll split the skb at last MSS boundary
*/
if (tcp_nagle_check(partial != 0, tp, nonagle))
return needed - partial; //使用nagle,返回前几个完整的segs大小
return needed;
}

tso_fragment

In tcp_write_xmit, when the send window or other limits constrain transmission, the GSO skb has to be split into two skbs so that the allowed part can be sent.
In tso_fragment, if the skb has no data in its linear (head) area, skb_split performs a fast split; if it does have linear data, the more general tcp_fragment handles it.

static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
unsigned int mss_now, gfp_t gfp)
{
if (skb->len != skb->data_len) // 包含线性区
return tcp_fragment(sk, skb, len, mss_now, gfp);
//所有数据都在frags中
//skb保留len长度的数据,其他部分切分到一个新的skb中
buff = sk_stream_alloc_skb(sk, 0, gfp, true);
sk->sk_wmem_queued += buff->truesize;
sk_mem_charge(sk, buff->truesize);
buff->truesize += nlen;
skb->truesize -= nlen;
/* Correct the sequence numbers. */
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
/* PSH and FIN should only be set in the second packet. */
flags = TCP_SKB_CB(skb)->tcp_flags;
TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
TCP_SKB_CB(buff)->tcp_flags = flags;
/* This packet was never sent out yet, so no SACK bits. */
TCP_SKB_CB(buff)->sacked = 0;
tcp_skb_fragment_eor(skb, buff); //如果有eor标记,则转移到buff上
buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL; //GSO都是CHECKSUM_PARTIAL
skb_split(skb, buff, len); //在len长度点做切割
tcp_fragment_tstamp(skb, buff);
/* Fix up tso_factor for both original and new SKB. */
tcp_set_skb_tso_segs(skb, mss_now);
tcp_set_skb_tso_segs(buff, mss_now);
/* Link BUFF into the send queue. */
__skb_header_release(buff);
tcp_insert_write_queue_after(skb, buff, sk);
return 0;
}
int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
unsigned int mss_now, gfp_t gfp)
{
nsize = skb_headlen(skb) - len; //nsize>0, 如果切分的长度都在head区域
if (nsize < 0)
nsize = 0;
if (skb_unclone(skb, gfp)) //如果是clone的skb,重新分配head区,并对frags区的page增加引用计数
return -ENOMEM;
/* Get a new skb... force flag on. */
buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
if (!buff)
return -ENOMEM; /* We'll just try again later. */
sk->sk_wmem_queued += buff->truesize;
sk_mem_charge(sk, buff->truesize);
nlen = skb->len - len - nsize;
buff->truesize += nlen;
skb->truesize -= nlen;
/* Correct the sequence numbers. */
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
/* PSH and FIN should only be set in the second packet. */
flags = TCP_SKB_CB(skb)->tcp_flags;
TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
TCP_SKB_CB(buff)->tcp_flags = flags;
TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
tcp_skb_fragment_eor(skb, buff);
if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {// 所有数据都在head线性区,并且不使用硬件校验
/* Copy and checksum data tail into the new buffer. */
buff->csum = csum_partial_copy_nocheck(skb->data + len, //skb保留len长度数据,之后nsize的数据copy到buff中
skb_put(buff, nsize),
nsize, 0);
skb_trim(skb, len); //长度len已经移动到buff中,删除这部分
skb->csum = csum_block_sub(skb->csum, buff->csum, len); //重新计算skb的校验和
} else { //GSO skb
skb->ip_summed = CHECKSUM_PARTIAL;
skb_split(skb, buff, len); //按len长度切割
}
buff->ip_summed = skb->ip_summed;
buff->tstamp = skb->tstamp;
tcp_fragment_tstamp(skb, buff);
old_factor = tcp_skb_pcount(skb);
/* Fix up tso_factor for both original and new SKB. */
tcp_set_skb_tso_segs(skb, mss_now);
tcp_set_skb_tso_segs(buff, mss_now);
/* Update delivered info for the new segment */
TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
/* If this packet has been sent out already, we must
* adjust the various packet counters.
*/
if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) { //如果skb已经被发出去了
int diff = old_factor - tcp_skb_pcount(skb) - //切拆分后的segs和之前的不一样
tcp_skb_pcount(buff);
if (diff)
tcp_adjust_pcount(sk, skb, diff); //修正计数器统计值
}
/* Link BUFF into the send queue. */
__skb_header_release(buff);
tcp_insert_write_queue_after(skb, buff, sk); //buff插入到skb后面
return 0;
}

The actual split is usually done by skb_split:

  • If the split point falls inside the linear (head) area, skb_split_inside_header() moves the remaining head data and all frags to the new skb
  • If the split point falls inside the frags area, skb_split_no_header() moves the frag containing the split point and all later frags to the new skb, adjusting the offset and length of the frag shared by the two skbs (a toy sketch follows the code below)
/**
* skb_split - Split fragmented skb to two parts at length len.
* @skb: the buffer to split
* @skb1: the buffer to receive the second part
* @len: new length for skb
*/
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{
int pos = skb_headlen(skb);
skb_shinfo(skb1)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;
if (len < pos) /* Split line is inside header. */ //分割点在head区域
skb_split_inside_header(skb, skb1, len, pos);
else /* Second chunk has no header, nothing to copy. *///分割点在frags区域,只用对frags区域操作,不用copy
skb_split_no_header(skb, skb1, len, pos);
}
static inline void skb_split_inside_header(struct sk_buff *skb,
struct sk_buff* skb1,
const u32 len, const int pos)
{
int i;
skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), //copy skb head区域的[len,pos)到skb1中
pos - len);
/* And move data appendix as is. */ //把skb的frags数据全部移动到skb1
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
skb_shinfo(skb)->nr_frags = 0;
skb1->data_len = skb->data_len;
skb1->len += skb1->data_len;
skb->data_len = 0;
skb->len = len;
skb_set_tail_pointer(skb, len);
}
static inline void skb_split_no_header(struct sk_buff *skb,
struct sk_buff* skb1,
const u32 len, int pos) //pos=skb_headlen(skb)
{
int i, k = 0;
const int nfrags = skb_shinfo(skb)->nr_frags;
skb_shinfo(skb)->nr_frags = 0;
skb1->len = skb1->data_len = skb->len - len;
skb->len = len;
skb->data_len = len - pos; //
for (i = 0; i < nfrags; i++) {
int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
if (pos + size > len) { //分割点在当前frags中,或者之后的
skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
if (pos < len) {
/* Split frag.
* We have two variants in this case:
* 1. Move all the frag to the second
* part, if it is possible. F.e.
* this approach is mandatory for TUX,
* where splitting is expensive.
* 2. Split is accurately. We make this.
*/
skb_frag_ref(skb, i); //增加引用
skb_shinfo(skb1)->frags[0].page_offset += len - pos; //设置不同的page_offset作为分割点
skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
skb_shinfo(skb)->nr_frags++;
}
k++; //之后的frags也移动到skb1中个
} else
skb_shinfo(skb)->nr_frags++; //分割点不在当前frags, 则在skb中保留
pos += size; //skb最终保留的大小
}
skb_shinfo(skb1)->nr_frags = k;
}
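
The offset bookkeeping in skb_split_no_header() can be shown with a toy example: a single 4096-byte frag split at len = 1000 leaves a 1000-byte frag on the original skb and a 3096-byte frag, with its page_offset advanced by 1000, on the new skb (both referencing the same page). A standalone sketch using a simplified frag type (not the kernel's skb_frag_t):

#include <stdio.h>

struct toy_frag {
    unsigned int page_offset;
    unsigned int size;
};

int main(void)
{
    struct toy_frag orig = { .page_offset = 0, .size = 4096 };
    unsigned int pos = 0;     /* bytes accounted for before this frag */
    unsigned int len = 1000;  /* split point */

    /* The frag straddles the split point: both skbs end up referencing the
     * same page, differing only in offset/size (plus a page refcount bump). */
    struct toy_frag first  = orig;   /* stays on the original skb */
    struct toy_frag second = orig;   /* goes to the new skb (skb1) */

    first.size          = len - pos;   /* 1000 */
    second.page_offset += len - pos;   /* 1000 */
    second.size        -= len - pos;   /* 3096 */

    printf("skb  frag: off=%u size=%u\n", first.page_offset, first.size);
    printf("skb1 frag: off=%u size=%u\n", second.page_offset, second.size);
    return 0;
}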

tcp_transmit_skb

Once the skb passes the cwnd/Nagle/TSQ checks in tcp_write_xmit, tcp_transmit_skb sends it down the stack.
Whether software segmentation is still needed is decided only when the qdisc hands the skb to the driver, in sch_direct_xmit().
tcp_transmit_skb->ip_queue_xmit->ip_local_out->__ip_local_out->dst_output->ip_output->ip_finish_output->ip_finish_output_gso->ip_finish_output2->dst_neigh_output->neigh_hh_output->dev_queue_xmit->__dev_queue_xmit->__dev_xmit_skb->sch_direct_xmit->dev_hard_start_xmit->xmit_one->netdev_start_xmit->__netdev_start_xmit->ixgbe_xmit_frame->__ixgbe_xmit_frame->ixgbe_xmit_frame_ring

static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
gfp_t gfp_mask)
{
...
skb = skb_clone(skb, gfp_mask);
...
skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
skb_shinfo(skb)->gso_type = sk->sk_gso_type; //tcp_v4_connect->SKB_GSO_TCPV4
tp->segs_out += tcp_skb_pcount(skb);
/* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */
skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
/* Our usage of tstamp should remain private */
skb->tstamp.tv64 = 0;
/* Cleanup our debris for IP stacks */
memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
sizeof(struct inet6_skb_parm)));
err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); //ip_queue_xmit
...
}
static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
unsigned int mtu;
mtu = ip_skb_dst_mtu(sk, skb);
if (skb_is_gso(skb))
return ip_finish_output_gso(net, sk, skb, mtu);
if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
return ip_finish_output2(net, sk, skb);
}
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev, struct netdev_queue *txq,
spinlock_t *root_lock, bool validate)
{
..
/* Note that we validate skb (GSO, checksum, ...) outside of locks */
if (validate)
skb = validate_xmit_skb_list(skb, dev); //判断是否需要GSO软件分段
if (likely(skb)) {
HARD_TX_LOCK(dev, txq, smp_processor_id());
if (!netif_xmit_frozen_or_stopped(txq))
skb = dev_hard_start_xmit(skb, dev, txq, &ret);
HARD_TX_UNLOCK(dev, txq);
}
...
}
struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
{
struct sk_buff *next, *head = NULL, *tail;
for (; skb != NULL; skb = next) {
next = skb->next;
skb->next = NULL;
/* in case skb wont be segmented, point to itself */
skb->prev = skb;
skb = validate_xmit_skb(skb, dev);
if (!skb)
continue;
if (!head)
head = skb;
else
tail->next = skb;
/* If skb was segmented, skb->prev points to
* the last segment. If not, it still contains skb.
*/
tail = skb->prev;
}
return head;
}
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
{
netdev_features_t features;
features = netif_skb_features(skb); //到这一层使用dev的feature来判断,而不是保存在sk中修改过的
skb = validate_xmit_vlan(skb, features);
if (unlikely(!skb))
goto out_null;
if (netif_needs_gso(skb, features)) { //网卡不支持tso,则需要软件分段
struct sk_buff *segs;
segs = skb_gso_segment(skb, features);
if (IS_ERR(segs)) {
goto out_kfree_skb;
} else if (segs) {
consume_skb(skb);
skb = segs;
}
} else {
if (skb_needs_linearize(skb, features) && //skb有frags数据,但是网卡不支持sg,则需要线性化数据
__skb_linearize(skb))
goto out_kfree_skb;
/* If packet is not checksummed and device does not
* support checksumming for this protocol, complete
* checksumming here.
*/
if (skb->ip_summed == CHECKSUM_PARTIAL) {
if (skb->encapsulation)
skb_set_inner_transport_header(skb,
skb_checksum_start_offset(skb));
else
skb_set_transport_header(skb,
skb_checksum_start_offset(skb));
if (!(features & NETIF_F_CSUM_MASK) &&
skb_checksum_help(skb))
goto out_kfree_skb;
}
}
return skb;
out_kfree_skb:
kfree_skb(skb);
out_null:
atomic_long_inc(&dev->tx_dropped);
return NULL;
}

skb_gso_segment: software segmentation

skb_gso_segment()->skb_mac_gso_segment()->inet_gso_segment()->tcp4_gso_segment()->tcp_gso_segment()->skb_segment()
The actual segmentation happens in skb_segment(); tcp_gso_segment() then updates the protocol-specific fields of each segment (TCP sequence number, TCP checksum, etc.),
and inet_gso_segment() updates the IP-layer fields of each segment.

static inline
struct sk_buff *skb_gso_segment(struct sk_buff *skb, netdev_features_t features)
{
return __skb_gso_segment(skb, features, true);
}
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
netdev_features_t features, bool tx_path)
{
...
SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
SKB_GSO_CB(skb)->encap_level = 0;
skb_reset_mac_header(skb);
skb_reset_mac_len(skb);
return skb_mac_gso_segment(skb, features);
}
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
struct packet_offload *ptype;
int vlan_depth = skb->mac_len;
__be16 type = skb_network_protocol(skb, &vlan_depth);
__skb_pull(skb, vlan_depth);
rcu_read_lock();
list_for_each_entry_rcu(ptype, &offload_base, list) {
if (ptype->type == type && ptype->callbacks.gso_segment) {
segs = ptype->callbacks.gso_segment(skb, features); //inet_gso_segment
break;
}
}
rcu_read_unlock();
__skb_push(skb, skb->data - skb_mac_header(skb));
return segs;
}
struct sk_buff *inet_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
skb_reset_network_header(skb);
nhoff = skb_network_header(skb) - skb_mac_header(skb);
if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
goto out;
iph = ip_hdr(skb);
ihl = iph->ihl * 4;
if (ihl < sizeof(*iph))
goto out;
id = ntohs(iph->id);
proto = iph->protocol;
/* Warning: after this point, iph might be no longer valid */
if (unlikely(!pskb_may_pull(skb, ihl)))
goto out;
__skb_pull(skb, ihl);
encap = SKB_GSO_CB(skb)->encap_level > 0;
if (encap)
features &= skb->dev->hw_enc_features;
SKB_GSO_CB(skb)->encap_level += ihl;
skb_reset_transport_header(skb);
segs = ERR_PTR(-EPROTONOSUPPORT);
if (!skb->encapsulation || encap) {
udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
/* fixed ID is invalid if DF bit is not set */
if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF)))
goto out;
}
ops = rcu_dereference(inet_offloads[proto]);
if (likely(ops && ops->callbacks.gso_segment))
segs = ops->callbacks.gso_segment(skb, features); //tcp4_gso_segment
if (IS_ERR_OR_NULL(segs))
goto out;
gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
skb = segs;
do {
iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
if (udpfrag) {
iph->frag_off = htons(offset >> 3);
if (skb->next)
iph->frag_off |= htons(IP_MF);
offset += skb->len - nhoff - ihl;
tot_len = skb->len - nhoff;
} else if (skb_is_gso(skb)) {
if (!fixedid) {
iph->id = htons(id); //更新每个segs的ip头id
id += skb_shinfo(skb)->gso_segs;
}
if (gso_partial)
tot_len = skb_shinfo(skb)->gso_size +
SKB_GSO_CB(skb)->data_offset +
skb->head - (unsigned char *)iph;
else
tot_len = skb->len - nhoff;
} else {
if (!fixedid)
iph->id = htons(id++);
tot_len = skb->len - nhoff;
}
iph->tot_len = htons(tot_len);
ip_send_check(iph); //ip校验
if (encap)
skb_reset_inner_headers(skb);
skb->network_header = (u8 *)iph - skb->head;
} while ((skb = skb->next));
out:
return segs;
}
static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
return ERR_PTR(-EINVAL);
if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
const struct iphdr *iph = ip_hdr(skb);
struct tcphdr *th = tcp_hdr(skb);
/* Set up checksum pseudo header, usually expect stack to
* have done this already.
*/
th->check = 0;
skb->ip_summed = CHECKSUM_PARTIAL;
__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
}
return tcp_gso_segment(skb, features);
}
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
unsigned int sum_truesize = 0;
struct tcphdr *th;
unsigned int thlen;
unsigned int seq;
__be32 delta;
unsigned int oldlen;
unsigned int mss;
struct sk_buff *gso_skb = skb;
__sum16 newcheck;
bool ooo_okay, copy_destructor;
th = tcp_hdr(skb);
thlen = th->doff * 4;
if (thlen < sizeof(*th))
goto out;
if (!pskb_may_pull(skb, thlen))
goto out;
oldlen = (u16)~skb->len;
__skb_pull(skb, thlen);
mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
goto out;
if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
/* Packet is from an untrusted source, reset gso_segs. */
skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
segs = NULL;
goto out;
}
copy_destructor = gso_skb->destructor == tcp_wfree;
ooo_okay = gso_skb->ooo_okay;
/* All segments but the first should have ooo_okay cleared */
skb->ooo_okay = 0;
segs = skb_segment(skb, features); //gso分段
if (IS_ERR(segs))
goto out;
/* Only first segment might have ooo_okay set */
segs->ooo_okay = ooo_okay;
/* GSO partial and frag_list segmentation only requires splitting
* the frame into an MSS multiple and possibly a remainder, both
* cases return a GSO skb. So update the mss now.
*/
if (skb_is_gso(segs))
mss *= skb_shinfo(segs)->gso_segs;
delta = htonl(oldlen + (thlen + mss));
skb = segs;
th = tcp_hdr(skb);
seq = ntohl(th->seq);
if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP))
tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss);
newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
(__force u32)delta));
while (skb->next) { //遍历各分段
th->fin = th->psh = 0;
th->check = newcheck;
if (skb->ip_summed == CHECKSUM_PARTIAL) //硬件计算
gso_reset_checksum(skb, ~th->check);
else
th->check = gso_make_checksum(skb, ~th->check);
seq += mss;
if (copy_destructor) {
skb->destructor = gso_skb->destructor;
skb->sk = gso_skb->sk;
sum_truesize += skb->truesize;
}
skb = skb->next;
th = tcp_hdr(skb);
th->seq = htonl(seq); //更新序号
th->cwr = 0;
}
//skb开始指向最后一个分段
/* Following permits TCP Small Queues to work well with GSO :
* The callback to TCP stack will be called at the time last frag
* is freed at TX completion, and not right now when gso_skb
* is freed by GSO engine
*/
if (copy_destructor) {
swap(gso_skb->sk, skb->sk);
swap(gso_skb->destructor, skb->destructor);
sum_truesize += skb->truesize;
atomic_add(sum_truesize - gso_skb->truesize,
&skb->sk->sk_wmem_alloc);
}
delta = htonl(oldlen + (skb_tail_pointer(skb) -
skb_transport_header(skb)) +
skb->data_len);
th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
(__force u32)delta));
if (skb->ip_summed == CHECKSUM_PARTIAL)
gso_reset_checksum(skb, ~th->check);
else
th->check = gso_make_checksum(skb, ~th->check);
out:
return segs;
}

Finally, skb_segment is called to perform the actual segmentation.

struct sk_buff *skb_segment(struct sk_buff *head_skb,
netdev_features_t features)
{
struct sk_buff *segs = NULL;
struct sk_buff *tail = NULL;
struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
skb_frag_t *frag = skb_shinfo(head_skb)->frags;
unsigned int mss = skb_shinfo(head_skb)->gso_size;
unsigned int doffset = head_skb->data - skb_mac_header(head_skb); //mac+ip+tcp头长度
struct sk_buff *frag_skb = head_skb;
unsigned int offset = doffset;
unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
unsigned int partial_segs = 0;
unsigned int headroom;
unsigned int len = head_skb->len;
__be16 proto;
bool csum, sg;
int nfrags = skb_shinfo(head_skb)->nr_frags;
int err = -ENOMEM;
int i = 0;
int pos;
int dummy;
__skb_push(head_skb, doffset); //把协议头放入数据部分
proto = skb_network_protocol(head_skb, &dummy);
if (unlikely(!proto))
return ERR_PTR(-EINVAL);
sg = !!(features & NETIF_F_SG);
csum = !!can_checksum_protocol(features, proto);
if (sg && csum && (mss != GSO_BY_FRAGS)) {
if (!(features & NETIF_F_GSO_PARTIAL)) {
struct sk_buff *iter;
if (!list_skb || //没有frags区
!net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) //不支持tso
goto normal;
/* Split the buffer at the frag_list pointer.
* This is based on the assumption that all
* buffers in the chain excluding the last
* containing the same amount of data.
*/
skb_walk_frags(head_skb, iter) {
if (skb_headlen(iter)) //如果frag带head区
goto normal;
len -= iter->len;
}
}
/* GSO partial only requires that we trim off any excess that
* doesn't fit into an MSS sized block, so take care of that
* now.
*/
partial_segs = len / mss;
if (partial_segs > 1)
mss *= partial_segs;
else
partial_segs = 0;
}
normal:
headroom = skb_headroom(head_skb); //不包含mac+ip+tcp头
pos = skb_headlen(head_skb); //包含协议头的head区长度
do {
struct sk_buff *nskb;
skb_frag_t *nskb_frag;
int hsize;
int size;
if (unlikely(mss == GSO_BY_FRAGS)) {
len = list_skb->len;
} else {
len = head_skb->len - offset; // 减去协议头部
if (len > mss)
len = mss; //当前要分段的长度
}
hsize = skb_headlen(head_skb) - offset; //剩余线性区要copy的长度
if (hsize < 0)
hsize = 0;
if (hsize > len || !sg) //如果不支持sg,也把数据copy到head区域
hsize = len;
if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
(skb_headlen(list_skb) == len || sg)) {
BUG_ON(skb_headlen(list_skb) > len);
i = 0;
nfrags = skb_shinfo(list_skb)->nr_frags;
frag = skb_shinfo(list_skb)->frags;
frag_skb = list_skb;
pos += skb_headlen(list_skb);
while (pos < offset + len) {
BUG_ON(i >= nfrags);
size = skb_frag_size(frag);
if (pos + size > offset + len)
break;
i++;
pos += size;
frag++;
}
nskb = skb_clone(list_skb, GFP_ATOMIC);
list_skb = list_skb->next;
if (unlikely(!nskb))
goto err;
if (unlikely(pskb_trim(nskb, len))) {
kfree_skb(nskb);
goto err;
}
hsize = skb_end_offset(nskb);
if (skb_cow_head(nskb, doffset + headroom)) {
kfree_skb(nskb);
goto err;
}
nskb->truesize += skb_end_offset(nskb) - hsize;
skb_release_head_state(nskb);
__skb_push(nskb, doffset);
} else {
nskb = __alloc_skb(hsize + doffset + headroom, // head区域的长度+协议头长度+headroom
GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
NUMA_NO_NODE);
if (unlikely(!nskb))
goto err;
skb_reserve(nskb, headroom);
__skb_put(nskb, doffset);
}
if (segs)
tail->next = nskb;
else
segs = nskb;
tail = nskb;
__copy_skb_header(nskb, head_skb); //copy所有的header
skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
skb_reset_mac_len(nskb);
skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
nskb->data - tnl_hlen, //copy协议头到nskb
doffset + tnl_hlen);
if (nskb->len == len + doffset)
goto perform_csum_check;
if (!sg) {
if (!nskb->remcsum_offload)
nskb->ip_summed = CHECKSUM_NONE;
SKB_GSO_CB(nskb)->csum =
skb_copy_and_csum_bits(head_skb, offset, //copy数据到nskb,并计算校验和
skb_put(nskb, len),
len, 0);
SKB_GSO_CB(nskb)->csum_start =
skb_headroom(nskb) + doffset;
continue;
}
nskb_frag = skb_shinfo(nskb)->frags;
skb_copy_from_linear_data_offset(head_skb, offset, //先copy head线性区部分
skb_put(nskb, hsize), hsize);
skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags &
SKBTX_SHARED_FRAG;
while (pos < offset + len) { // 分段长度超过head区域
if (i >= nfrags) {
BUG_ON(skb_headlen(list_skb));
i = 0;
nfrags = skb_shinfo(list_skb)->nr_frags;
frag = skb_shinfo(list_skb)->frags;
frag_skb = list_skb;
BUG_ON(!nfrags);
list_skb = list_skb->next;
}
if (unlikely(skb_shinfo(nskb)->nr_frags >=
MAX_SKB_FRAGS)) {
net_warn_ratelimited(
"skb_segment: too many frags: %u %u\n",
pos, mss);
goto err;
}
if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
goto err;
*nskb_frag = *frag;
__skb_frag_ref(nskb_frag);
size = skb_frag_size(nskb_frag);
if (pos < offset) {
nskb_frag->page_offset += offset - pos;
skb_frag_size_sub(nskb_frag, offset - pos);
}
skb_shinfo(nskb)->nr_frags++;
if (pos + size <= offset + len) { //当前frag不够,继续用下一个frag
i++;
frag++;
pos += size;
} else {
skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); //够了,减去不需要的部分
goto skip_fraglist;
}
nskb_frag++;
}
skip_fraglist:
nskb->data_len = len - hsize; //更新frag区长度
nskb->len += nskb->data_len;
nskb->truesize += nskb->data_len;
perform_csum_check:
if (!csum) { //需要计算校验和
if (skb_has_shared_frag(nskb)) {
err = __skb_linearize(nskb);
if (err)
goto err;
}
if (!nskb->remcsum_offload)
nskb->ip_summed = CHECKSUM_NONE;
SKB_GSO_CB(nskb)->csum =
skb_checksum(nskb, doffset,
nskb->len - doffset, 0);
SKB_GSO_CB(nskb)->csum_start =
skb_headroom(nskb) + doffset;
}
} while ((offset += len) < head_skb->len);
/* Some callers want to get the end of the list.
* Put it in segs->prev to avoid walking the list.
* (see validate_xmit_skb_list() for example)
*/
segs->prev = tail;
if (partial_segs) {
struct sk_buff *iter;
int type = skb_shinfo(head_skb)->gso_type;
unsigned short gso_size = skb_shinfo(head_skb)->gso_size;
/* Update type to add partial and then remove dodgy if set */
type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
type &= ~SKB_GSO_DODGY;
/* Update GSO info and prepare to start updating headers on
* our way back down the stack of protocols.
*/
for (iter = segs; iter; iter = iter->next) {
skb_shinfo(iter)->gso_size = gso_size;
skb_shinfo(iter)->gso_segs = partial_segs;
skb_shinfo(iter)->gso_type = type;
SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
}
if (tail->len - doffset <= gso_size)
skb_shinfo(tail)->gso_size = 0;
else if (tail != segs)
skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
}
/* Following permits correct backpressure, for protocols
* using skb_set_owner_w().
* Idea is to tranfert ownership from head_skb to last segment.
*/
if (head_skb->destructor == sock_wfree) {
swap(tail->truesize, head_skb->truesize);
swap(tail->destructor, head_skb->destructor);
swap(tail->sk, head_skb->sk);
}
return segs;
err:
kfree_skb_list(segs);
return ERR_PTR(err);
}

ixgbe_tso: hardware segmentation

ixgbe_xmit_frame_ring calls ixgbe_tso to place the TSO parameters into the context descriptor of the NIC's TX ring,
and then calls ixgbe_tx_map() to hand the data to the hardware.

static int ixgbe_tso(struct ixgbe_ring *tx_ring,
struct ixgbe_tx_buffer *first,
u8 *hdr_len)
{
u32 vlan_macip_lens, type_tucmd, mss_l4len_idx;
struct sk_buff *skb = first->skb;
union {
struct iphdr *v4;
struct ipv6hdr *v6;
unsigned char *hdr;
} ip;
union {
struct tcphdr *tcp;
unsigned char *hdr;
} l4;
u32 paylen, l4_offset;
int err;
if (skb->ip_summed != CHECKSUM_PARTIAL)
return 0;
if (!skb_is_gso(skb))
return 0;
err = skb_cow_head(skb, 0);
if (err < 0)
return err;
ip.hdr = skb_network_header(skb);
l4.hdr = skb_checksum_start(skb);
/* ADV DTYP TUCMD MKRLOC/ISCSIHEDLEN */
type_tucmd = IXGBE_ADVTXD_TUCMD_L4T_TCP;
/* initialize outer IP header fields */
if (ip.v4->version == 4) {
unsigned char *csum_start = skb_checksum_start(skb);
unsigned char *trans_start = ip.hdr + (ip.v4->ihl * 4);
/* IP header will have to cancel out any data that
* is not a part of the outer IP header
*/
ip.v4->check = csum_fold(csum_partial(trans_start,
csum_start - trans_start,
0));
type_tucmd |= IXGBE_ADVTXD_TUCMD_IPV4;
ip.v4->tot_len = 0;
first->tx_flags |= IXGBE_TX_FLAGS_TSO |
IXGBE_TX_FLAGS_CSUM |
IXGBE_TX_FLAGS_IPV4;
} else {
ip.v6->payload_len = 0;
first->tx_flags |= IXGBE_TX_FLAGS_TSO |
IXGBE_TX_FLAGS_CSUM;
}
/* determine offset of inner transport header */
l4_offset = l4.hdr - skb->data;
/* compute length of segmentation header */
*hdr_len = (l4.tcp->doff * 4) + l4_offset;
/* remove payload length from inner checksum */
paylen = skb->len - l4_offset;
csum_replace_by_diff(&l4.tcp->check, htonl(paylen));
/* update gso size and bytecount with header size */
first->gso_segs = skb_shinfo(skb)->gso_segs;
first->bytecount += (first->gso_segs - 1) * *hdr_len;
/* mss_l4len_id: use 0 as index for TSO */
mss_l4len_idx = (*hdr_len - l4_offset) << IXGBE_ADVTXD_L4LEN_SHIFT;
mss_l4len_idx |= skb_shinfo(skb)->gso_size << IXGBE_ADVTXD_MSS_SHIFT;
/* vlan_macip_lens: HEADLEN, MACLEN, VLAN tag */
vlan_macip_lens = l4.hdr - ip.hdr;
vlan_macip_lens |= (ip.hdr - skb->data) << IXGBE_ADVTXD_MACLEN_SHIFT;
vlan_macip_lens |= first->tx_flags & IXGBE_TX_FLAGS_VLAN_MASK;
ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, 0, type_tucmd,
mss_l4len_idx);
return 1;
}
void ixgbe_tx_ctxtdesc(struct ixgbe_ring *tx_ring, u32 vlan_macip_lens,
u32 fcoe_sof_eof, u32 type_tucmd, u32 mss_l4len_idx)
{
struct ixgbe_adv_tx_context_desc *context_desc;
u16 i = tx_ring->next_to_use;
context_desc = IXGBE_TX_CTXTDESC(tx_ring, i);
i++;
tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
/* set bits to identify this as an advanced context descriptor */
type_tucmd |= IXGBE_TXD_CMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
context_desc->vlan_macip_lens = cpu_to_le32(vlan_macip_lens);
context_desc->seqnum_seed = cpu_to_le32(fcoe_sof_eof);
context_desc->type_tucmd_mlhl = cpu_to_le32(type_tucmd);
context_desc->mss_l4len_idx = cpu_to_le32(mss_l4len_idx);
}

Related configuration

  • net.ipv4.tcp_min_tso_segs = 2
    The minimum number of segments per GSO skb. It is used when tcp_write_xmit() calls tcp_tso_segs() to obtain the current maximum segment count: if the congestion control algorithm does not supply a value, the maximum segment count is the larger of the pacing-derived per-millisecond segment count and sysctl_tcp_min_tso_segs; maximum segment count * mss is the size of a full-sized GSO skb.
    Changing this is not recommended.

  • net.ipv4.tcp_tso_win_divisor = 3
    This value affects whether TSO sending is deferred.
    In tcp_tso_should_defer(): if the remaining send window is larger than the full-sized GSO skb computed above, the skb is sent immediately; next, if the remaining send window is larger than 1/3 of the total window, it is also sent immediately; otherwise, if the next ACK is unlikely to arrive within half an srtt, the skb is sent now, and only if the ACK is expected soon is sending deferred.
    tcp_tso_win_divisor can be increased slightly to improve throughput, but in some cases it may also lead to retransmissions.
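
Both knobs are ordinary sysctls; a minimal sketch that reads them from /proc (these paths exist on mainline kernels):

#include <stdio.h>

static void show(const char *path)
{
    char buf[64];
    FILE *f = fopen(path, "r");

    if (f && fgets(buf, sizeof(buf), f))
        printf("%s = %s", path, buf);   /* value already ends with '\n' */
    if (f)
        fclose(f);
}

int main(void)
{
    show("/proc/sys/net/ipv4/tcp_min_tso_segs");
    show("/proc/sys/net/ipv4/tcp_tso_win_divisor");
    return 0;
}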