linux tun实现

模块初始化

模块初始化的时候注册misc_device,它是一种特殊的字符设备。设备节点可以这样手动创建:

modprobe tun
mknod /dev/net/tun c 10 200

/*
 * Module entry point: registers the rtnl link ops, the "tun" misc character
 * device and the netdevice notifier, in that order.
 *
 * Returns 0 on success, or the error of the first registration that failed;
 * everything registered before the failing step is rolled back.
 */
static int __init tun_init(void)
{
	int ret = 0;

	pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
	pr_info("%s\n", DRV_COPYRIGHT);

	ret = rtnl_link_register(&tun_link_ops);
	if (ret) {
		pr_err("Can't register link_ops\n");
		goto err_linkops;
	}

	ret = misc_register(&tun_miscdev);
	if (ret) {
		pr_err("Can't register misc device %d\n", TUN_MINOR);
		goto err_misc;
	}

	/* The notifier registration can fail too; do not report success then. */
	ret = register_netdevice_notifier(&tun_notifier_block);
	if (ret) {
		pr_err("Can't register netdevice notifier\n");
		goto err_notifier;
	}

	return 0;

err_notifier:
	misc_deregister(&tun_miscdev);
err_misc:
	rtnl_link_unregister(&tun_link_ops);
err_linkops:
	return ret;
}
module_init(tun_init);
/*
 * Misc character device registered by tun_init(): minor TUN_MINOR, name
 * "tun", device node name "net/tun", served by tun_fops.
 */
static struct miscdevice tun_miscdev = {
	.minor = TUN_MINOR,
	.name = "tun",
	.nodename = "net/tun",
	.fops = &tun_fops,
};
/*
 * File operations of the tun character device. Every handler is one of the
 * tun_chr_* callbacks; llseek is wired to no_llseek. The compat ioctl and
 * fdinfo hooks are only present when the matching CONFIG_* option is set.
 */
static const struct file_operations tun_fops = {
	.owner = THIS_MODULE,
	.llseek = no_llseek,
	.read_iter = tun_chr_read_iter,
	.write_iter = tun_chr_write_iter,
	.poll = tun_chr_poll,
	.unlocked_ioctl = tun_chr_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = tun_chr_compat_ioctl,
#endif
	.open = tun_chr_open,
	.release = tun_chr_close,
	.fasync = tun_chr_fasync,
#ifdef CONFIG_PROC_FS
	.show_fdinfo = tun_chr_show_fdinfo,
#endif
};

创建设备

从3.8内核开始,tun支持多队列。
这里有几个数据结构抽象:

  • tun_file: tun队列的抽象, open的时候被分配,关联到文件fd
  • tun_struct: tun设备的抽象,每个tun_struct中包含多个tun_file。 tun_struct在第一次ioctl TUNSETIFF的时候被分配

另外TUN设备收发的是IP包,TAP设备收发的是Ethernet帧。
IFF_NO_PI表示收发数据时不携带包信息(struct tun_pi,即flags和协议字段);不设置这个flag时,每个包的前面都会带上这个头部。

#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/if.h>
#include <linux/if_tun.h>
/*
 * Open @queues queues of the multiqueue TAP device named @dev.
 *
 * Each queue is a separate open() of /dev/net/tun followed by TUNSETIFF with
 * the same interface name; the resulting descriptors are stored in
 * fds[0..queues-1].
 *
 * Returns 0 on success. On failure returns -1 with errno set, and every
 * descriptor opened so far is closed again. A NULL @dev, or a name that does
 * not fit into the IFNAMSIZ-byte ifr_name buffer (terminator included), is
 * rejected with EINVAL instead of overflowing the buffer.
 */
int tun_alloc_mq(char *dev, int queues, int *fds)
{
	struct ifreq ifr;
	size_t name_len;
	int fd, saved_errno, i;

	if (!dev || (name_len = strlen(dev)) >= IFNAMSIZ) {
		errno = EINVAL;
		return -1;
	}

	memset(&ifr, 0, sizeof(ifr));
	/* Flags: IFF_TUN   - TUN device (no Ethernet headers)
	 *        IFF_TAP   - TAP device
	 *
	 *        IFF_NO_PI - Do not provide packet information
	 *        IFF_MULTI_QUEUE - Create a queue of multiqueue device
	 */
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_MULTI_QUEUE;
	memcpy(ifr.ifr_name, dev, name_len + 1);

	for (i = 0; i < queues; i++) {
		fd = open("/dev/net/tun", O_RDWR);
		if (fd < 0)
			goto err;
		if (ioctl(fd, TUNSETIFF, (void *)&ifr)) {
			/* keep the ioctl error visible across close() */
			saved_errno = errno;
			close(fd);
			errno = saved_errno;
			goto err;
		}
		fds[i] = fd;
	}

	return 0;

err:
	saved_errno = errno;
	for (--i; i >= 0; i--)
		close(fds[i]);
	errno = saved_errno;
	return -1;
}
/*
 * Attach (@enable != 0) or detach (@enable == 0) the queue behind @fd by
 * issuing TUNSETQUEUE with IFF_ATTACH_QUEUE or IFF_DETACH_QUEUE.
 *
 * Returns whatever ioctl() returns.
 */
int tun_set_queue(int fd, int enable)
{
	struct ifreq req;

	memset(&req, 0, sizeof(req));
	req.ifr_flags = enable ? IFF_ATTACH_QUEUE : IFF_DETACH_QUEUE;

	return ioctl(fd, TUNSETQUEUE, (void *)&req);
}

tun_chr_open

open()会调用tun_chr_open()来创建一个设备队列对应的tun_file,并返回文件fd给应用程序。

/*
 * Protocol descriptor passed to sk_alloc() in tun_chr_open(); obj_size makes
 * each allocation as large as a struct tun_file.
 */
static struct proto tun_proto = {
	.name = "tun",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct tun_file),
};
/*
 * Socket operations installed on tfile->socket by tun_chr_open(); only
 * peek_len, sendmsg and recvmsg are provided.
 */
static const struct proto_ops tun_socket_ops = {
	.peek_len = tun_peek_len,
	.sendmsg = tun_sendmsg,
	.recvmsg = tun_recvmsg,
};
/*
 * open() handler of the tun character device.
 *
 * Allocates one struct tun_file (a device queue) through sk_alloc(), sets up
 * its embedded socket and sock, and stores it in file->private_data. The
 * queue is not bound to any tun device yet (its ->tun pointer is NULL).
 *
 * Returns 0 on success or -ENOMEM when the allocation fails.
 */
static int tun_chr_open(struct inode *inode, struct file * file)
{
	struct tun_file *queue;

	DBG1(KERN_INFO, "tunX: tun_chr_open\n");

	queue = (struct tun_file *)sk_alloc(current->nsproxy->net_ns, AF_UNSPEC,
					    GFP_KERNEL, &tun_proto, 0);
	if (queue == NULL)
		return -ENOMEM;

	/* plain tun_file state: no device, no flags, not on any list */
	RCU_INIT_POINTER(queue->tun, NULL);
	queue->ifindex = 0;
	queue->flags = 0;
	INIT_LIST_HEAD(&queue->next);

	/* embedded socket and its wait queue */
	init_waitqueue_head(&queue->wq.wait);
	RCU_INIT_POINTER(queue->socket.wq, &queue->wq);
	queue->socket.ops = &tun_socket_ops;
	queue->socket.file = file;

	/* sock defaults first, then the tun specific overrides */
	sock_init_data(&queue->socket, &queue->sk);
	queue->sk.sk_sndbuf = INT_MAX;
	queue->sk.sk_write_space = tun_sock_write_space;
	sock_set_flag(&queue->sk, SOCK_ZEROCOPY);

	file->private_data = queue;

	return 0;
}

ioctl

ioctl会调用__tun_chr_ioctl()来处理,主要的几个选项如下

  • TUNSETIFF
    第一次TUNSETIFF的时候会创建tun设备对应的tun_struct,并attach一个队列。 之后TUNSETIFF到该设备名的话,就会attach队列到这个tun设备
    __tun_chr_ioctl中会调用tun_set_iff()来处理

  • TUNSETQUEUE
    主要调用tun_set_queue来detach和attach设备队列。TUNSETIFF之后队列默认是attach状态

static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
unsigned long arg, int ifreq_len)
{
struct tun_file *tfile = file->private_data;
struct tun_struct *tun;
void __user* argp = (void __user*)arg;
struct ifreq ifr;
kuid_t owner;
kgid_t group;
int sndbuf;
int vnet_hdr_sz;
unsigned int ifindex;
int le;
int ret;
if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == 0x89) {
if (copy_from_user(&ifr, argp, ifreq_len))
return -EFAULT;
} else {
memset(&ifr, 0, sizeof(ifr));
}
if (cmd == TUNGETFEATURES) {
/* Currently this just means: "what IFF flags are valid?".
* This is needed because we never checked for invalid flags on
* TUNSETIFF.
*/
return put_user(IFF_TUN | IFF_TAP | TUN_FEATURES,
(unsigned int __user*)argp);
} else if (cmd == TUNSETQUEUE)
return tun_set_queue(file, &ifr); // enable/disbale queue
ret = 0;
rtnl_lock();
tun = __tun_get(tfile);
if (cmd == TUNSETIFF && !tun) {
ifr.ifr_name[IFNAMSIZ-1] = '\0';
ret = tun_set_iff(sock_net(&tfile->sk), file, &ifr);
if (ret)
goto unlock;
if (copy_to_user(argp, &ifr, ifreq_len))
ret = -EFAULT;
goto unlock;
}
if (cmd == TUNSETIFINDEX) {
ret = -EPERM;
if (tun)
goto unlock;
ret = -EFAULT;
if (copy_from_user(&ifindex, argp, sizeof(ifindex)))
goto unlock;
ret = 0;
tfile->ifindex = ifindex;
goto unlock;
}
ret = -EBADFD;
if (!tun)
goto unlock;
tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %u\n", cmd);
ret = 0;
switch (cmd) {
case TUNGETIFF: //获取设备名和flags
tun_get_iff(current->nsproxy->net_ns, tun, &ifr);
if (tfile->detached)
ifr.ifr_flags |= IFF_DETACH_QUEUE;
if (!tfile->socket.sk->sk_filter)
ifr.ifr_flags |= IFF_NOFILTER;
if (copy_to_user(argp, &ifr, ifreq_len))
ret = -EFAULT;
break;
case TUNSETNOCSUM:
/* Disable/Enable checksum */
/* [unimplemented] */
tun_debug(KERN_INFO, tun, "ignored: set checksum %s\n",
arg ? "disabled" : "enabled");
break;
case TUNSETPERSIST: //增加模块引用计数
/* Disable/Enable persist mode. Keep an extra reference to the
* module to prevent the module being unprobed.
*/
if (arg && !(tun->flags & IFF_PERSIST)) {
tun->flags |= IFF_PERSIST;
__module_get(THIS_MODULE);
}
if (!arg && (tun->flags & IFF_PERSIST)) {
tun->flags &= ~IFF_PERSIST;
module_put(THIS_MODULE);
}
tun_debug(KERN_INFO, tun, "persist %s\n",
arg ? "enabled" : "disabled");
break;
case TUNSETOWNER: //设置设备拥有者
/* Set owner of the device */
owner = make_kuid(current_user_ns(), arg);
if (!uid_valid(owner)) {
ret = -EINVAL;
break;
}
tun->owner = owner;
tun_debug(KERN_INFO, tun, "owner set to %u\n",
from_kuid(&init_user_ns, tun->owner));
break;
case TUNSETGROUP: //设置设备用户组
/* Set group of the device */
group = make_kgid(current_user_ns(), arg);
if (!gid_valid(group)) {
ret = -EINVAL;
break;
}
tun->group = group;
tun_debug(KERN_INFO, tun, "group set to %u\n",
from_kgid(&init_user_ns, tun->group));
break;
case TUNSETLINK:
/* Only allow setting the type when the interface is down */
if (tun->dev->flags & IFF_UP) {
tun_debug(KERN_INFO, tun,
"Linktype set failed because interface is up\n");
ret = -EBUSY;
} else {
tun->dev->type = (int) arg;
tun_debug(KERN_INFO, tun, "linktype set to %d\n",
tun->dev->type);
ret = 0;
}
break;
#ifdef TUN_DEBUG
case TUNSETDEBUG:
tun->debug = arg;
break;
#endif
case TUNSETOFFLOAD:
ret = set_offload(tun, arg); //设置硬件校验和,tfo/ufo
break;
case TUNSETTXFILTER: //设置tap filter
/* Can be set only for TAPs */
ret = -EINVAL;
if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
break;
ret = update_filter(&tun->txflt, (void __user *)arg);
break;
case SIOCGIFHWADDR: //获取mac地址
/* Get hw address */
memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
ifr.ifr_hwaddr.sa_family = tun->dev->type;
if (copy_to_user(argp, &ifr, ifreq_len))
ret = -EFAULT;
break;
case SIOCSIFHWADDR: //设置mac地址
/* Set hw address */
tun_debug(KERN_DEBUG, tun, "set hw address: %pM\n",
ifr.ifr_hwaddr.sa_data);
ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr);
break;
case TUNGETSNDBUF: //获取发送缓存大小
sndbuf = tfile->socket.sk->sk_sndbuf;
if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
ret = -EFAULT;
break;
case TUNSETSNDBUF: //设置发送缓存大小
if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) {
ret = -EFAULT;
break;
}
tun->sndbuf = sndbuf;
tun_set_sndbuf(tun);
break;
case TUNGETVNETHDRSZ:
vnet_hdr_sz = tun->vnet_hdr_sz;
if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz)))
ret = -EFAULT;
break;
case TUNSETVNETHDRSZ:
if (copy_from_user(&vnet_hdr_sz, argp, sizeof(vnet_hdr_sz))) {
ret = -EFAULT;
break;
}
if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) {
ret = -EINVAL;
break;
}
tun->vnet_hdr_sz = vnet_hdr_sz;
break;
case TUNGETVNETLE:
le = !!(tun->flags & TUN_VNET_LE);
if (put_user(le, (int __user *)argp))
ret = -EFAULT;
break;
case TUNSETVNETLE:
if (get_user(le, (int __user *)argp)) {
ret = -EFAULT;
break;
}
if (le)
tun->flags |= TUN_VNET_LE;
else
tun->flags &= ~TUN_VNET_LE;
break;
case TUNGETVNETBE:
ret = tun_get_vnet_be(tun, argp);
break;
case TUNSETVNETBE:
ret = tun_set_vnet_be(tun, argp);
break;
case TUNATTACHFILTER: //对tap,设置bpf filter
/* Can be set only for TAPs */
ret = -EINVAL;
if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
break;
ret = -EFAULT;
if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog)))
break;
ret = tun_attach_filter(tun);
break;
case TUNDETACHFILTER: //删除tap的 bpf filter
/* Can be set only for TAPs */
ret = -EINVAL;
if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
break;
ret = 0;
tun_detach_filter(tun, tun->numqueues);
break;
case TUNGETFILTER: //获取bpf filter
ret = -EINVAL;
if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
break;
ret = -EFAULT;
if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog)))
break;
ret = 0;
break;
default:
ret = -EINVAL;
break;
}
unlock:
rtnl_unlock();
if (tun)
tun_put(tun);
return ret;
}

tun_set_iff

/*
 * TUNSETIFF worker: bind @file to the tun/tap device named in @ifr.
 *
 * If a net_device with that name already exists in @net, the file is attached
 * to it as an additional queue. Otherwise a new net_device plus its
 * tun_struct private area is allocated, initialized, given its first queue
 * and registered.
 *
 * On success ifr->ifr_name is overwritten with the final device name and 0
 * is returned; on failure a negative errno is returned.
 */
static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
{
	struct tun_struct *tun;
	struct tun_file *tfile = file->private_data;
	struct net_device *dev;
	int err;

	if (tfile->detached)
		return -EINVAL;

	/* look the device up by name; nothing is found on first creation */
	dev = __dev_get_by_name(net, ifr->ifr_name);
	if (dev) { /* device exists: TUNSETIFF was done before, add one more queue */
		if (ifr->ifr_flags & IFF_TUN_EXCL)
			return -EBUSY;
		/* the requested type must match the ops of the existing device */
		if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
			tun = netdev_priv(dev);
		else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
			tun = netdev_priv(dev);
		else
			return -EINVAL;

		/* the multiqueue setting of request and device must agree */
		if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) !=
		    !!(tun->flags & IFF_MULTI_QUEUE))
			return -EINVAL;

		if (tun_not_capable(tun))
			return -EPERM;
		err = security_tun_dev_open(tun->security);
		if (err < 0)
			return err;

		/* attach the new queue */
		err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER);
		if (err < 0)
			return err;

		if (tun->flags & IFF_MULTI_QUEUE &&
		    (tun->numqueues + tun->numdisabled > 1)) {
			/* One or more queue has already been attached, no need
			 * to initialize the device again.
			 */
			return 0;
		}
	} else {
		char *name;
		unsigned long flags = 0;
		/* multiqueue devices get MAX_TAP_QUEUES queues (256 per the original note) */
		int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ?
			     MAX_TAP_QUEUES : 1;

		/* creating a device requires CAP_NET_ADMIN */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		err = security_tun_dev_create();
		if (err < 0)
			return err;

		/* Set dev type */
		if (ifr->ifr_flags & IFF_TUN) {
			/* TUN device */
			flags |= IFF_TUN;
			name = "tun%d";
		} else if (ifr->ifr_flags & IFF_TAP) {
			/* TAP device */
			flags |= IFF_TAP;
			name = "tap%d";
		} else
			return -EINVAL;

		/* a user supplied name wins over the default pattern */
		if (*ifr->ifr_name)
			name = ifr->ifr_name;

		/* allocate the net_device with a tun_struct as private data */
		dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
				       NET_NAME_UNKNOWN, tun_setup, queues,
				       queues);

		if (!dev)
			return -ENOMEM;

		dev_net_set(dev, net);
		dev->rtnl_link_ops = &tun_link_ops;
		dev->ifindex = tfile->ifindex;
		dev->sysfs_groups[0] = &tun_attr_group;

		tun = netdev_priv(dev);
		tun->dev = dev;
		tun->flags = flags;
		tun->txflt.count = 0;
		tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);

		tun->align = NET_SKB_PAD;
		tun->filter_attached = false;
		/* inherits the value tun_chr_open() stored in sk_sndbuf (INT_MAX) */
		tun->sndbuf = tfile->socket.sk->sk_sndbuf;

		tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
		if (!tun->pcpu_stats) {
			err = -ENOMEM;
			goto err_free_dev;
		}

		spin_lock_init(&tun->lock);

		err = security_tun_dev_alloc_security(&tun->security);
		if (err < 0)
			goto err_free_stat;

		tun_net_init(dev);
		tun_flow_init(tun);

		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
				   TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
				   NETIF_F_HW_VLAN_STAG_TX;
		dev->features = dev->hw_features | NETIF_F_LLTX;
		dev->vlan_features = dev->features &
				     ~(NETIF_F_HW_VLAN_CTAG_TX |
				       NETIF_F_HW_VLAN_STAG_TX);

		INIT_LIST_HEAD(&tun->disabled);
		/* first queue of the new device */
		err = tun_attach(tun, file, false);
		if (err < 0)
			goto err_free_flow;

		err = register_netdevice(tun->dev);
		if (err < 0)
			goto err_detach;
	}

	netif_carrier_on(tun->dev);

	tun_debug(KERN_INFO, tun, "tun_set_iff\n");

	/* take over the TUN_FEATURES bits requested by the caller */
	tun->flags = (tun->flags & ~TUN_FEATURES) |
		(ifr->ifr_flags & TUN_FEATURES);

	/* Make sure persistent devices do not get stuck in
	 * xoff state.
	 */
	if (netif_running(tun->dev))
		netif_tx_wake_all_queues(tun->dev);

	/* report the final device name back to the caller */
	strcpy(ifr->ifr_name, tun->dev->name);
	return 0;

	/* error unwinding, in reverse order of the setup above */
err_detach:
	tun_detach_all(dev);
err_free_flow:
	tun_flow_uninit(tun);
	security_tun_dev_free_security(tun->security);
err_free_stat:
	free_percpu(tun->pcpu_stats);
err_free_dev:
	free_netdev(dev);
	return err;
}
/*
 * Initialize the net_device according to the device type stored in
 * tun->flags: a TUN device becomes a headerless point-to-point link, a TAP
 * device is set up as Ethernet with a random MAC address. Any other type
 * leaves the device untouched.
 */
static void tun_net_init(struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);

	if ((tun->flags & TUN_TYPE_MASK) == IFF_TUN) {
		/* Point-to-Point TUN Device */
		dev->netdev_ops = &tun_netdev_ops;

		/* no link-layer header and no link-layer address */
		dev->hard_header_len = 0;
		dev->addr_len = 0;
		dev->mtu = 1500;

		/* Zero header length; packets leave without ARP (IFF_NOARP) */
		dev->type = ARPHRD_NONE;
		dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
	} else if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) {
		/* Ethernet TAP Device: frames carry an Ethernet header */
		dev->netdev_ops = &tap_netdev_ops;

		ether_setup(dev);
		dev->priv_flags &= ~IFF_TX_SKB_SHARING;
		dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;

		/* start with a randomly generated MAC address */
		eth_hw_addr_random(dev);
	}
}
/*
 * net_device operations installed by tun_net_init() for IFF_TUN devices.
 * Transmission goes through tun_net_xmit() and tx queue selection through
 * tun_select_queue().
 */
static const struct net_device_ops tun_netdev_ops = {
	.ndo_uninit = tun_net_uninit,
	.ndo_open = tun_net_open,
	.ndo_stop = tun_net_close,
	.ndo_start_xmit = tun_net_xmit,
	.ndo_change_mtu = tun_net_change_mtu,
	.ndo_fix_features = tun_net_fix_features,
	.ndo_select_queue = tun_select_queue,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = tun_poll_controller,
#endif
	.ndo_set_rx_headroom = tun_set_headroom,
	.ndo_get_stats64 = tun_net_get_stats64,
};
/*
 * net_device operations installed by tun_net_init() for IFF_TAP devices.
 * Compared with tun_netdev_ops this table additionally provides the rx-mode,
 * MAC address, address validation and features_check hooks.
 */
static const struct net_device_ops tap_netdev_ops = {
	.ndo_uninit = tun_net_uninit,
	.ndo_open = tun_net_open,
	.ndo_stop = tun_net_close,
	.ndo_start_xmit = tun_net_xmit,
	.ndo_change_mtu = tun_net_change_mtu,
	.ndo_fix_features = tun_net_fix_features,
	.ndo_set_rx_mode = tun_net_mclist,
	.ndo_set_mac_address = eth_mac_addr,
	.ndo_validate_addr = eth_validate_addr,
	.ndo_select_queue = tun_select_queue,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = tun_poll_controller,
#endif
	.ndo_features_check = passthru_features_check,
	.ndo_set_rx_headroom = tun_set_headroom,
	.ndo_get_stats64 = tun_net_get_stats64,
};

tun_set_queue

tun_set_queue可以认为是enable/disable tun queue

/*
 * TUNSETQUEUE worker: re-enable (IFF_ATTACH_QUEUE) or disable
 * (IFF_DETACH_QUEUE) the queue behind @file, under rtnl_lock().
 *
 * Enabling requires the queue to be currently detached; disabling requires a
 * bound multiqueue device and a queue that is not already detached.
 *
 * Returns 0 on success, -EINVAL for an invalid request or state, or the
 * error of the security hook / tun_attach().
 */
static int tun_set_queue(struct file *file, struct ifreq *ifr)
{
	struct tun_file *tfile = file->private_data;
	struct tun_struct *tun;
	int ret = -EINVAL;

	rtnl_lock();

	if (ifr->ifr_flags & IFF_ATTACH_QUEUE) {
		/* enable: only possible for a queue parked on a device */
		tun = tfile->detached;
		if (tun) {
			ret = security_tun_dev_attach_queue(tun->security);
			if (ret >= 0)
				ret = tun_attach(tun, file, false);
		}
	} else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
		/* disable: park the queue without releasing it */
		tun = rtnl_dereference(tfile->tun);
		if (tun && (tun->flags & IFF_MULTI_QUEUE) && !tfile->detached) {
			__tun_detach(tfile, false);
			ret = 0;
		}
	}

	rtnl_unlock();
	return ret;
}

tun_attach

attach主要是把队列关联到tun设备

/*
 * Bind the queue behind @file to @tun as its next active queue.
 *
 * Fails with -EINVAL if the queue is already attached and not detached,
 * -EBUSY if a non-multiqueue device already has one queue, -E2BIG if the
 * device already has MAX_TAP_QUEUES queues (active plus disabled), and
 * -ENOMEM if the tx ring cannot be allocated.
 *
 * Returns 0 on success or a negative errno.
 */
static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
{
	struct tun_file *tfile = file->private_data;
	struct net_device *dev = tun->dev;
	int err;

	err = security_tun_dev_attach(tfile->socket.sk, tun->security);
	if (err < 0)
		goto out;

	err = -EINVAL;
	/* already attached: nothing to do, report an error */
	if (rtnl_dereference(tfile->tun) && !tfile->detached)
		goto out;

	err = -EBUSY;
	if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1)
		goto out;

	err = -E2BIG;
	if (!tfile->detached &&
	    tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES)
		goto out;

	err = 0;

	/* Re-attach the filter to persist device */
	if (!skip_filter && (tun->filter_attached == true)) {
		lock_sock(tfile->socket.sk);
		err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
		release_sock(tfile->socket.sk);
		/* NOTE(review): this jumps to `out` when sk_attach_filter()
		 * returned 0, so the queue bookkeeping below is skipped in
		 * that case - confirm whether `err < 0` was intended.
		 */
		if (!err)
			goto out;
	}

	/* a queue that was never attached before needs its fixed-size tx ring */
	if (!tfile->detached &&
	    skb_array_init(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) {
		err = -ENOMEM;
		goto out;
	}

	/* the queue takes the next free slot of the device */
	tfile->queue_index = tun->numqueues;
	tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
	rcu_assign_pointer(tfile->tun, tun);
	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
	tun->numqueues++;

	if (tfile->detached)
		tun_enable_queue(tfile); /* re-attach: take it off the disabled list */
	else
		sock_hold(&tfile->sk); /* first attach: take a sock reference */

	tun_set_real_num_queues(tun);

	/* device is allowed to go away first, so no need to hold extra
	 * refcnt.
	 */

out:
	return err;
}
/*
 * Take a disabled queue off its device's disabled list: clears
 * tfile->detached, unlinks tfile->next and decrements numdisabled.
 *
 * Returns the device the queue was parked on.
 */
static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
{
	struct tun_struct *owner = tfile->detached;

	list_del_init(&tfile->next);
	tfile->detached = NULL;
	owner->numdisabled--;

	return owner;
}

tun_detach

/* Locked wrapper: runs __tun_detach(@tfile, @clean) under rtnl_lock(). */
static void tun_detach(struct tun_file *tfile, bool clean)
{
	rtnl_lock();
	__tun_detach(tfile, clean);
	rtnl_unlock();
}
/*
 * Remove the queue @tfile from its device.
 *
 * With @clean == false the queue is only disabled: it leaves the active
 * tfiles[] array and is parked on the device's disabled list. With
 * @clean == true the queue is released for good: its references are dropped
 * and, if it was the last queue of a non-persistent registered device, the
 * device is unregistered.
 *
 * Uses rtnl_dereference(), so the caller is expected to hold rtnl_lock()
 * (tun_detach() does).
 */
static void __tun_detach(struct tun_file *tfile, bool clean)
{
	struct tun_file *ntfile;
	struct tun_struct *tun;

	tun = rtnl_dereference(tfile->tun);

	if (tun && !tfile->detached) { /* currently an active queue */
		u16 index = tfile->queue_index;
		BUG_ON(index >= tun->numqueues);

		/* move the last active queue into the slot being vacated */
		rcu_assign_pointer(tun->tfiles[index],
				   tun->tfiles[tun->numqueues - 1]);
		ntfile = rtnl_dereference(tun->tfiles[index]);
		ntfile->queue_index = index;

		--tun->numqueues;
		if (clean) {
			RCU_INIT_POINTER(tfile->tun, NULL);
			sock_put(&tfile->sk);
		} else
			tun_disable_queue(tun, tfile); /* park it on the disabled list */

		synchronize_net();
		/* delete the cached flows of the queue that was moved */
		tun_flow_delete_by_queue(tun, tun->numqueues + 1);
		/* Drop read queue */
		tun_queue_purge(tfile);
		tun_set_real_num_queues(tun);
	} else if (tfile->detached && clean) {
		/* already disabled and now being released: unlink it first */
		tun = tun_enable_queue(tfile);
		sock_put(&tfile->sk);
	}

	if (clean) {
		/* last queue gone: take the carrier down, drop non-persistent devices */
		if (tun && tun->numqueues == 0 && tun->numdisabled == 0) {
			netif_carrier_off(tun->dev);

			if (!(tun->flags & IFF_PERSIST) &&
			    tun->dev->reg_state == NETREG_REGISTERED)
				unregister_netdevice(tun->dev);
		}
		if (tun)
			skb_array_cleanup(&tfile->tx_array); /* release the tx ring */
		sock_put(&tfile->sk);
	}
}
/*
 * Park @tfile on @tun's disabled list: remembers the device in
 * tfile->detached, appends the queue to tun->disabled and increments
 * numdisabled.
 */
static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile)
{
	list_add_tail(&tfile->next, &tun->disabled);
	tun->numdisabled++;
	tfile->detached = tun;
}

tun数据路径

tun数据流

  • 应用程序往tun设备写数据: 最终copy到percpu的softnet_data的input_pkt_queue队列中,等待在软中断中重新被协议栈处理
  • 通过添加路由,最终流量通过tun设备被发送,调用tun驱动的ndo_start_xmit函数, 添加skb到tun发送队列的fifo ring中, 然后通知select/epoll有数据,等待应用程序接收
  • 应用程序从tun设备读取: 从tun设备发送队列的fifo ring中读取一个skb,然后copy到用户态

tun_sendmsg

应用程序sendmsg到tun设备,让数据包重新走一遍协议栈

/*
 * sendmsg handler of the tun socket: feeds the message into the device with
 * tun_get_user().
 *
 * Returns the result of tun_get_user(), or -EBADFD when the queue has no
 * device.
 */
static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	struct tun_file *queue = container_of(sock, struct tun_file, socket);
	struct tun_struct *dev_priv = __tun_get(queue);
	int rc;

	if (dev_priv == NULL)
		return -EBADFD;

	rc = tun_get_user(dev_priv, queue, m->msg_control, &m->msg_iter,
			  m->msg_flags & MSG_DONTWAIT);

	/* balance the reference taken by __tun_get() */
	tun_put(dev_priv);
	return rc;
}
/*
 * Get packet from user space buffer
 *
 * Builds an skb from the data in @from (optionally preceded by a struct
 * tun_pi and a virtio_net_hdr, depending on the device flags) and hands it
 * to the network stack with netif_rx_ni().
 *
 * @msg_control, when set, is used as the ubuf_info for zero-copy.
 * @noblock is passed on to the skb allocation.
 *
 * Returns the total number of bytes consumed on success or a negative errno.
 */
static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
			    void *msg_control, struct iov_iter *from,
			    int noblock)
{
	struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
	struct sk_buff *skb;
	size_t total_len = iov_iter_count(from);
	size_t len = total_len, align = tun->align, linear;
	struct virtio_net_hdr gso = { 0 };
	struct tun_pcpu_stats *stats;
	int good_linear;
	int copylen;
	bool zerocopy = false;
	int err;
	u32 rxhash;
	ssize_t n;

	if (!(tun->dev->flags & IFF_UP))
		return -EIO;

	if (!(tun->flags & IFF_NO_PI)) { /* data starts with a struct tun_pi */
		if (len < sizeof(pi))
			return -EINVAL;
		len -= sizeof(pi);

		n = copy_from_iter(&pi, sizeof(pi), from); /* copy the pi header */
		if (n != sizeof(pi))
			return -EFAULT;
	}

	if (tun->flags & IFF_VNET_HDR) {
		if (len < tun->vnet_hdr_sz)
			return -EINVAL;
		len -= tun->vnet_hdr_sz;

		n = copy_from_iter(&gso, sizeof(gso), from);
		if (n != sizeof(gso))
			return -EFAULT;

		/* make hdr_len cover the checksum field when a checksum is requested */
		if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
		    tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2 > tun16_to_cpu(tun, gso.hdr_len))
			gso.hdr_len = cpu_to_tun16(tun, tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2);

		if (tun16_to_cpu(tun, gso.hdr_len) > len)
			return -EINVAL;
		/* skip the part of the vnet header beyond struct virtio_net_hdr */
		iov_iter_advance(from, tun->vnet_hdr_sz - sizeof(gso));
	}

	if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) {
		align += NET_IP_ALIGN;
		if (unlikely(len < ETH_HLEN ||
			     (gso.hdr_len && tun16_to_cpu(tun, gso.hdr_len) < ETH_HLEN)))
			return -EINVAL;
	}

	good_linear = SKB_MAX_HEAD(align);

	if (msg_control) {
		struct iov_iter i = *from;

		/* There are 256 bytes to be copied in skb, so there is
		 * enough room for skb expand head in case it is used.
		 * The rest of the buffer is mapped from userspace.
		 */
		copylen = gso.hdr_len ? tun16_to_cpu(tun, gso.hdr_len) : GOODCOPY_LEN;
		if (copylen > good_linear)
			copylen = good_linear;
		linear = copylen;
		iov_iter_advance(&i, copylen);
		/* zero-copy only if the remaining data fits into the frags */
		if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
			zerocopy = true;
	}

	if (!zerocopy) {
		copylen = len;
		if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
			linear = good_linear;
		else
			linear = tun16_to_cpu(tun, gso.hdr_len);
	}

	skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
	if (IS_ERR(skb)) {
		if (PTR_ERR(skb) != -EAGAIN)
			this_cpu_inc(tun->pcpu_stats->rx_dropped);
		return PTR_ERR(skb);
	}

	if (zerocopy)
		err = zerocopy_sg_from_iter(skb, from); /* no copy: map the user pages as frags */
	else
		err = skb_copy_datagram_from_iter(skb, 0, from, len); /* copy into the kernel */

	if (err) {
		this_cpu_inc(tun->pcpu_stats->rx_dropped);
		kfree_skb(skb);
		return -EFAULT;
	}

	err = virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun));
	if (err) {
		this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
		kfree_skb(skb);
		return -EINVAL;
	}

	switch (tun->flags & TUN_TYPE_MASK) {
	case IFF_TUN:
		if (tun->flags & IFF_NO_PI) {
			/* no pi header: derive the protocol from the IP version nibble */
			switch (skb->data[0] & 0xf0) {
			case 0x40:
				pi.proto = htons(ETH_P_IP);
				break;
			case 0x60:
				pi.proto = htons(ETH_P_IPV6);
				break;
			default:
				this_cpu_inc(tun->pcpu_stats->rx_dropped);
				kfree_skb(skb);
				return -EINVAL;
			}
		}

		skb_reset_mac_header(skb);
		skb->protocol = pi.proto;
		skb->dev = tun->dev;
		break;
	case IFF_TAP:
		skb->protocol = eth_type_trans(skb, tun->dev);
		break;
	}

	/* copy skb_ubuf_info for callback when skb has no error */
	if (zerocopy) {
		skb_shinfo(skb)->destructor_arg = msg_control;
		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
	} else if (msg_control) {
		/* data was copied after all: run the ubuf callback right away */
		struct ubuf_info *uarg = msg_control;
		uarg->callback(uarg, false);
	}

	skb_reset_network_header(skb);
	skb_probe_transport_header(skb, 0);

	rxhash = skb_get_hash(skb);
	/* hand the packet to the stack; per the article it is queued on the
	 * per-cpu softnet_data input_pkt_queue and processed in softirq context
	 */
	netif_rx_ni(skb);

	stats = get_cpu_ptr(tun->pcpu_stats);
	u64_stats_update_begin(&stats->syncp);
	stats->rx_packets++;
	stats->rx_bytes += len;
	u64_stats_update_end(&stats->syncp);
	put_cpu_ptr(stats);

	/* record which queue this flow hash was seen on */
	tun_flow_update(tun, rxhash, tfile);
	return total_len;
}

tun_recvmsg

应用程序从tun设备中读取被路由到该设备的数据包

/*
 * recvmsg handler of the tun socket.
 *
 * Only MSG_DONTWAIT, MSG_TRUNC and MSG_ERRQUEUE are accepted in @flags.
 * MSG_ERRQUEUE reads from the socket error queue; otherwise one packet is
 * read with tun_do_read(). If the packet is longer than @total_len,
 * MSG_TRUNC is set in m->msg_flags and the return value is capped to
 * @total_len unless the caller passed MSG_TRUNC.
 *
 * Returns the number of bytes or a negative errno (-EBADFD without device).
 */
static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
		       int flags)
{
	struct tun_file *queue = container_of(sock, struct tun_file, socket);
	struct tun_struct *dev_priv = __tun_get(queue);
	int rc;

	if (dev_priv == NULL)
		return -EBADFD;

	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
		rc = -EINVAL;
	} else if (flags & MSG_ERRQUEUE) {
		/* serve the request from sk_error_queue */
		rc = sock_recv_errqueue(sock->sk, m, total_len,
					SOL_PACKET, TUN_TX_TIMESTAMP);
	} else {
		rc = tun_do_read(dev_priv, queue, &m->msg_iter,
				 flags & MSG_DONTWAIT);
		if (rc > (ssize_t)total_len) {
			m->msg_flags |= MSG_TRUNC;
			if (!(flags & MSG_TRUNC))
				rc = total_len;
		}
	}

	tun_put(dev_priv);
	return rc;
}
static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
struct iov_iter *to,
int noblock)
{
struct sk_buff *skb;
ssize_t ret;
int err;
tun_debug(KERN_INFO, tun, "tun_do_read\n");
if (!iov_iter_count(to))
return 0;
/* Read frames from ring */
skb = tun_ring_recv(tfile, noblock, &err); //从tfile->tx_array读取一个skb
if (!skb)
return err;
ret = tun_put_user(tun, tfile, skb, to); //copy skb到用户态缓存
if (unlikely(ret < 0))
kfree_skb(skb); //异常丢弃
else
consume_skb(skb); //正常丢弃
return ret;
}
/*
 * Copy one skb to the user buffer described by @iter.
 *
 * Depending on the device flags the packet data is preceded by a struct
 * tun_pi and a virtio_net_hdr of tun->vnet_hdr_sz bytes; a VLAN tag present
 * in the skb is re-inserted into the copied frame.
 *
 * Returns the full length of the packet including those headers (computed
 * from the skb, not from the bytes actually copied) or a negative errno.
 */
static ssize_t tun_put_user(struct tun_struct *tun,
			    struct tun_file *tfile,
			    struct sk_buff *skb,
			    struct iov_iter *iter)
{
	struct tun_pi pi = { 0, skb->protocol };
	struct tun_pcpu_stats *stats;
	ssize_t total;
	int vlan_offset = 0;
	int vlan_hlen = 0;
	int vnet_hdr_sz = 0;

	if (skb_vlan_tag_present(skb))
		vlan_hlen = VLAN_HLEN;

	if (tun->flags & IFF_VNET_HDR)
		vnet_hdr_sz = tun->vnet_hdr_sz;

	total = skb->len + vlan_hlen + vnet_hdr_sz;

	if (!(tun->flags & IFF_NO_PI)) { /* caller wants the tun_pi header */
		if (iov_iter_count(iter) < sizeof(pi))
			return -EINVAL;

		total += sizeof(pi);
		if (iov_iter_count(iter) < total) {
			/* Packet will be striped */
			pi.flags |= TUN_PKT_STRIP;
		}

		if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi))
			return -EFAULT;
	}

	if (vnet_hdr_sz) {
		struct virtio_net_hdr gso = { 0 }; /* no info leak */
		int ret;

		if (iov_iter_count(iter) < vnet_hdr_sz)
			return -EINVAL;

		ret = virtio_net_hdr_from_skb(skb, &gso,
					      tun_is_little_endian(tun));
		if (ret) {
			struct skb_shared_info *sinfo = skb_shinfo(skb);
			pr_err("unexpected GSO type: "
			       "0x%x, gso_size %d, hdr_len %d\n",
			       sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
			       tun16_to_cpu(tun, gso.hdr_len));
			print_hex_dump(KERN_ERR, "tun: ",
				       DUMP_PREFIX_NONE,
				       16, 1, skb->head,
				       min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true);
			WARN_ON_ONCE(1);
			return -EINVAL;
		}

		if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))
			return -EFAULT;

		/* skip the part of the vnet header beyond struct virtio_net_hdr */
		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
	}

	if (vlan_hlen) {
		int ret;
		struct {
			__be16 h_vlan_proto;
			__be16 h_vlan_TCI;
		} veth;

		veth.h_vlan_proto = skb->vlan_proto;
		veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));

		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);

		/* bytes in front of the tag, then the tag itself */
		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
		if (ret || !iov_iter_count(iter))
			goto done;

		ret = copy_to_iter(&veth, sizeof(veth), iter);
		if (ret != sizeof(veth) || !iov_iter_count(iter))
			goto done;
	}

	/* copy the (remaining) packet data; the result is not checked here */
	skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);

done:
	/* caller is in process context, */
	stats = get_cpu_ptr(tun->pcpu_stats);
	u64_stats_update_begin(&stats->syncp);
	stats->tx_packets++;
	stats->tx_bytes += skb->len + vlan_hlen;
	u64_stats_update_end(&stats->syncp);
	put_cpu_ptr(tun->pcpu_stats);

	return total;
}

tun_net_xmit

tun_net_xmit是驱动的发送函数,主要是把路由到tun设备的流量添加到其对应发送队列的fifo ring中,然后通知select/epoll有数据到来

/*
 * ndo_start_xmit handler: queue @skb on the tx ring of the tun_file selected
 * by skb->queue_mapping and wake up its reader.
 *
 * The packet is dropped (tx_dropped is incremented and NET_XMIT_DROP
 * returned) when the queue index is out of range, a filter rejects it, the
 * queue limit is reached, or the ring refuses it. Returns NETDEV_TX_OK
 * otherwise. Runs inside an RCU read-side section.
 */
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);
	int txq = skb->queue_mapping;
	struct tun_file *tfile;
	u32 numqueues = 0;

	rcu_read_lock();
	tfile = rcu_dereference(tun->tfiles[txq]);
	numqueues = ACCESS_ONCE(tun->numqueues);

	/* Drop packet if interface is not attached */
	if (txq >= numqueues)
		goto drop;

#ifdef CONFIG_RPS
	if (numqueues == 1 && static_key_false(&rps_needed)) {
		/* Select queue was not called for the skbuff, so we extract the
		 * RPS hash and save it into the flow_table here.
		 */
		__u32 rxhash;

		rxhash = skb_get_hash(skb);
		if (rxhash) {
			struct tun_flow_entry *e;
			e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)],
					rxhash);
			if (e)
				tun_flow_save_rps_rxhash(e, rxhash);
		}
	}
#endif

	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);

	BUG_ON(!tfile);

	/* Drop if the filter does not like it.
	 * This is a noop if the filter is disabled.
	 * Filter can be enabled only for the TAP devices. */
	if (!check_filter(&tun->txflt, skb))
		goto drop;

	if (tfile->socket.sk->sk_filter &&
	    sk_filter(tfile->socket.sk, skb))
		goto drop;

	/* Limit the number of packets queued by dividing txq length with the
	 * number of queues.
	 */
	/* i.e. the receive queue may not grow beyond tx_queue_len / numqueues */
	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue) * numqueues
			  >= dev->tx_queue_len)
		goto drop;

	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		goto drop;

	skb_tx_timestamp(skb);

	/* Orphan the skb - required as we might hang on to it
	 * for indefinite time.
	 */
	skb_orphan(skb);

	nf_reset(skb);

	/* add the skb to the fifo ring; a non-zero result means it was not queued */
	if (skb_array_produce(&tfile->tx_array, skb))
		goto drop;

	/* Notify and wake up reader process */
	if (tfile->flags & TUN_FASYNC)
		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
	/* tell pollers that data is ready to be read */
	tfile->socket.sk->sk_data_ready(tfile->socket.sk);

	rcu_read_unlock();
	return NETDEV_TX_OK;

drop:
	this_cpu_inc(tun->pcpu_stats->tx_dropped);
	skb_tx_error(skb);
	kfree_skb(skb);
	rcu_read_unlock();
	return NET_XMIT_DROP;
}

flow及tun_select_queue

在多队列的情况下,因为应用程序对每个队列有不同的fd,同一个流需要hash到同一个tun队列,因此tun建了一个hash表来存放每个流的队列索引,并且设置了一个定时器,每隔3秒清除一次不活跃flow的hash表项

/*
 * One cached flow: maps a flow hash to the queue it was last seen on.
 * Entries are linked into the tun->flows[] hash buckets.
 */
struct tun_flow_entry {
	struct hlist_node hash_link;	/* linkage in a tun->flows[] bucket */
	struct rcu_head rcu;
	struct tun_struct *tun;
	u32 rxhash;			/* flow hash used as lookup key */
	u32 rps_rxhash;
	int queue_index;		/* index of the tx queue for this flow */
	unsigned long updated;		/* jiffies of the last update */
};
/*
 * Prepare the flow cache of @tun: empties all TUN_NUM_FLOW_ENTRIES hash
 * buckets (1024 per the original note), sets the ageing time to
 * TUN_FLOW_EXPIRE (3 seconds per the original note) and arms the garbage
 * collection timer, which runs tun_flow_cleanup() to expire old flows.
 */
static void tun_flow_init(struct tun_struct *tun)
{
	int bucket = TUN_NUM_FLOW_ENTRIES;
	unsigned long first_run;

	while (bucket-- > 0)
		INIT_HLIST_HEAD(&tun->flows[bucket]);

	tun->ageing_time = TUN_FLOW_EXPIRE;
	setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun);

	first_run = round_jiffies_up(jiffies + tun->ageing_time);
	mod_timer(&tun->flow_gc_timer, first_run);
}
/*
 * Record that the flow with hash @rxhash was seen on the queue of @tfile.
 *
 * Does nothing for a zero hash, for single-queue devices or for a detached
 * queue. An existing entry gets its queue index and timestamp refreshed; a
 * missing one is created under tun->lock unless MAX_TAP_FLOWS is reached.
 */
static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
			    struct tun_file *tfile)
{
	struct hlist_head *bucket;
	struct tun_flow_entry *flow;
	unsigned long delay = tun->ageing_time;
	u16 queue_index = tfile->queue_index;

	if (!rxhash)
		return;

	bucket = &tun->flows[tun_hashfn(rxhash)];

	rcu_read_lock();

	/* We may get a very small possibility of OOO during switching, not
	 * worth to optimize.*/
	if (tun->numqueues != 1 && !tfile->detached) {
		flow = tun_flow_find(bucket, rxhash);
		if (likely(flow)) {
			/* TODO: keep queueing to old queue until it's empty? */
			flow->queue_index = queue_index;
			flow->updated = jiffies;
			sock_rps_record_flow_hash(flow->rps_rxhash);
		} else {
			spin_lock_bh(&tun->lock);
			/* look again under the lock before creating the entry */
			if (!tun_flow_find(bucket, rxhash) &&
			    tun->flow_count < MAX_TAP_FLOWS)
				tun_flow_create(tun, bucket, rxhash, queue_index);

			/* make sure the garbage collection timer is running */
			if (!timer_pending(&tun->flow_gc_timer))
				mod_timer(&tun->flow_gc_timer,
					  round_jiffies_up(jiffies + delay));
			spin_unlock_bh(&tun->lock);
		}
	}

	rcu_read_unlock();
}

tun_select_queue

路由到tun设备的流量需要选择发送队列,选择的时候先查看tun的hash表中是否有缓存

/*
 * ndo_select_queue handler: choose the tx queue for @skb.
 *
 * With a non-zero flow hash the cached queue of that flow is used if one
 * exists, otherwise the hash is scaled onto the number of queues. Without a
 * hash the recorded rx queue, reduced into range, is used; failing that,
 * queue 0.
 */
static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
			    void *accel_priv, select_queue_fallback_t fallback)
{
	struct tun_struct *tun = netdev_priv(dev);
	struct tun_flow_entry *flow;
	u32 nqueues;
	u32 hash;
	u32 queue = 0;

	rcu_read_lock();
	nqueues = ACCESS_ONCE(tun->numqueues);

	hash = skb_get_hash(skb);
	if (hash != 0) {
		flow = tun_flow_find(&tun->flows[tun_hashfn(hash)], hash);
		if (flow != NULL) {
			/* cached flow: reuse the queue it was seen on */
			tun_flow_save_rps_rxhash(flow, hash);
			queue = flow->queue_index;
		} else {
			/* no cached entry: spread by hash;
			 * use multiply and shift instead of expensive divide
			 */
			queue = ((u64)hash * nqueues) >> 32;
		}
	} else if (likely(skb_rx_queue_recorded(skb))) {
		queue = skb_get_rx_queue(skb);
		while (unlikely(queue >= nqueues))
			queue -= nqueues;
	}

	rcu_read_unlock();
	return queue;
}

poll

应用程序select/epoll的时候需要判断设备是否可读写。当有流量路由到tun设备对应的发送队列的时候,tfile->tx_array为非空,则可读。
默认sndbuf为最大值(INT_MAX),通常内存够的话都可写。

/*
 * poll handler of the tun character device.
 *
 * Reports POLLIN | POLLRDNORM when the queue's tx ring is not empty and
 * POLLOUT | POLLWRNORM when the socket is writeable. Returns POLLERR alone
 * when the file has no device or the device is not in the
 * NETREG_REGISTERED state.
 */
static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
{
	struct tun_file *tfile = file->private_data;
	struct tun_struct *tun = __tun_get(tfile);
	struct sock *sk;
	unsigned int mask = 0;

	if (!tun)
		return POLLERR;

	sk = tfile->socket.sk;

	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");

	poll_wait(file, sk_sleep(sk), wait);

	/* readable as soon as the fifo ring holds a packet */
	if (!skb_array_empty(&tfile->tx_array))
		mask |= POLLIN | POLLRDNORM;

	/* sk_sndbuf is set to INT_MAX in tun_chr_open(), so this is normally true;
	 * otherwise SOCKWQ_ASYNC_NOSPACE is set and writeability is checked again
	 */
	if (sock_writeable(sk) ||
	    (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
	     sock_writeable(sk)))
		mask |= POLLOUT | POLLWRNORM;

	if (tun->dev->reg_state != NETREG_REGISTERED)
		mask = POLLERR;

	tun_put(tun);
	return mask;
}