命名空间
<h2>概述</h2>
<h2>分析</h2>
<pre><code class="language-c">// file: include/linux/sched.h
/*
* Excerpt of struct task_struct: only the namespace-related member is
* shown, the remaining members are elided.
*/
struct task_struct {
/* namespaces */
struct nsproxy *nsproxy; /* the set of namespaces this task uses */
// ...
};
// file: include/linux/nsproxy.h
/*
* A structure to contain pointers to all per-process
* namespaces - fs (mount), uts, network, sysvipc, etc.
*
* 'count' is the number of tasks holding a reference.
* The count for each namespace, then, will be the number
* of nsproxies pointing to it, not the number of tasks.
*
* The nsproxy is shared by tasks which share all namespaces.
* As soon as a single namespace is cloned or unshared, the
* nsproxy is copied.
*/
struct nsproxy {
atomic_t count; /* number of tasks holding a reference to this nsproxy */
struct uts_namespace *uts_ns;
struct ipc_namespace *ipc_ns;
struct mnt_namespace *mnt_ns;
struct pid_namespace *pid_ns;
struct net *net_ns; // network namespace
};</code></pre>
<p>继续看 nsproxy 中 net_ns 所指向的 struct net,以及其中的 struct netns_ipv4:</p>
<pre><code class="language-c">// file: include/net/net_namespace.h
/*
* Per-network-namespace state. A struct nsproxy reaches it through its
* net_ns pointer.
*/
struct net {
atomic_t passive; /* To decide when the network
* namespace should be freed.
*/
atomic_t count; /* To decide when the network
* namespace should be shut down.
*/
#ifdef NETNS_REFCNT_DEBUG
atomic_t use_count; /* To track references we
* destroy on demand
*/
#endif
spinlock_t rules_mod_lock;
struct list_head list; /* list of network namespaces */
struct list_head cleanup_list; /* namespaces on death row */
struct list_head exit_list; /* Use only net_mutex */
struct user_namespace *user_ns; /* Owning user namespace */
unsigned int proc_inum;
struct proc_dir_entry *proc_net;
struct proc_dir_entry *proc_net_stat;
#ifdef CONFIG_SYSCTL
struct ctl_table_set sysctls;
#endif
struct sock *rtnl; /* rtnetlink socket */
struct sock *genl_sock;
struct list_head dev_base_head;
struct hlist_head *dev_name_head;
struct hlist_head *dev_index_head;
unsigned int dev_base_seq; /* protected by rtnl_mutex */
int ifindex;
/* core fib_rules */
struct list_head rules_ops;
struct net_device *loopback_dev; /* The loopback */ // every net has its own loopback device
struct netns_core core;
struct netns_mib mib;
struct netns_packet packet;
struct netns_unix unx;
struct netns_ipv4 ipv4; // IPv4 state: the routing tables and netfilter tables live here
#if IS_ENABLED(CONFIG_IPV6)
struct netns_ipv6 ipv6;
#endif
#if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE)
struct netns_sctp sctp;
#endif
#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE)
struct netns_dccp dccp;
#endif
#ifdef CONFIG_NETFILTER
struct netns_nf nf;
struct netns_xt xt;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct netns_ct ct;
#endif
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
struct netns_nf_frag nf_frag;
#endif
struct sock *nfnl;
struct sock *nfnl_stash;
#endif
#ifdef CONFIG_WEXT_CORE
struct sk_buff_head wext_nlevents;
#endif
struct net_generic __rcu *gen;
/* Note : following structs are cache line aligned */
#ifdef CONFIG_XFRM
struct netns_xfrm xfrm;
#endif
struct netns_ipvs *ipvs;
struct sock *diag_nlsk;
atomic_t rt_genid;
};
// file: include/net/netns/ipv4.h
/*
* IPv4 state kept per network namespace; embedded in struct net as the
* ipv4 member.
*/
struct netns_ipv4 {
#ifdef CONFIG_SYSCTL
struct ctl_table_header *forw_hdr;
struct ctl_table_header *frags_hdr;
struct ctl_table_header *ipv4_hdr;
struct ctl_table_header *route_hdr;
struct ctl_table_header *xfrm4_hdr;
#endif
struct ipv4_devconf *devconf_all;
struct ipv4_devconf *devconf_dflt;
#ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_rules_ops *rules_ops;
bool fib_has_custom_rules;
struct fib_table *fib_local; // routing tables
struct fib_table *fib_main;
struct fib_table *fib_default;
#endif
#ifdef CONFIG_IP_ROUTE_CLASSID
int fib_num_tclassid_users;
#endif
struct hlist_head *fib_table_hash;
struct sock *fibnl;
struct sock **icmp_sk;
struct inet_peer_base *peers;
struct tcpm_hash_bucket *tcp_metrics_hash;
unsigned int tcp_metrics_hash_log;
struct sock * __percpu *tcp_sk;
struct netns_frags frags;
#ifdef CONFIG_NETFILTER
struct xt_table *iptable_filter; // netfilter tables
struct xt_table *iptable_mangle;
struct xt_table *iptable_raw;
struct xt_table *arptable_filter;
#ifdef CONFIG_SECURITY
struct xt_table *iptable_security;
#endif
struct xt_table *nat_table;
#endif
int sysctl_icmp_echo_ignore_all;
int sysctl_icmp_echo_ignore_broadcasts;
int sysctl_icmp_ignore_bogus_error_responses;
int sysctl_icmp_ratelimit;
int sysctl_icmp_ratemask;
int sysctl_icmp_errors_use_inbound_ifaddr;
int sysctl_tcp_ecn;
kgid_t sysctl_ping_group_range[2];
long sysctl_tcp_mem[3]; // kernel parameters (the sysctl_* members)
atomic_t dev_addr_genid;
#ifdef CONFIG_IP_MROUTE
#ifndef CONFIG_IP_MROUTE_MULTIPLE_TABLES
struct mr_table *mrt;
#else
struct list_head mr_tables;
struct fib_rules_ops *mr_rules_ops;
#endif
#endif
};
</code></pre>
<h2>默认命名空间初始化</h2>
<pre><code class="language-c">// file: init/init_task.c
/*
* Initial task structure. It is filled in by the INIT_TASK macro, which
* points its nsproxy member at init_nsproxy.
*/
struct task_struct init_task = INIT_TASK(init_task);
EXPORT_SYMBOL(init_task);
// file: include/linux/init_task.h
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
*
* Expands to the designated initializer for the task passed as tsk
* (init_task is built with it). Every line of the macro body must end
* with a backslash as its very last character, so any note inside the
* body has to be a block comment placed before the backslash.
*/
#define INIT_TASK(tsk) \
{ \
.state = 0, \
.stack = &amp;init_thread_info, \
.usage = ATOMIC_INIT(2), \
.flags = PF_KTHREAD, \
.prio = MAX_PRIO-20, \
.static_prio = MAX_PRIO-20, \
.normal_prio = MAX_PRIO-20, \
.policy = SCHED_NORMAL, \
.cpus_allowed = CPU_MASK_ALL, \
.nr_cpus_allowed= NR_CPUS, \
.mm = NULL, \
.active_mm = &amp;init_mm, \
.se = { \
.group_node = LIST_HEAD_INIT(tsk.se.group_node), \
}, \
.rt = { \
.run_list = LIST_HEAD_INIT(tsk.rt.run_list), \
.time_slice = RR_TIMESLICE, \
}, \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
INIT_PUSHABLE_TASKS(tsk) \
INIT_CGROUP_SCHED(tsk) \
.ptraced = LIST_HEAD_INIT(tsk.ptraced), \
.ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \
.real_parent = &amp;tsk, \
.parent = &amp;tsk, \
.children = LIST_HEAD_INIT(tsk.children), \
.sibling = LIST_HEAD_INIT(tsk.sibling), \
.group_leader = &amp;tsk, \
RCU_POINTER_INITIALIZER(real_cred, &amp;init_cred), \
RCU_POINTER_INITIALIZER(cred, &amp;init_cred), \
.comm = INIT_TASK_COMM, \
.thread = INIT_THREAD, \
.fs = &amp;init_fs, \
.files = &amp;init_files, \
.signal = &amp;init_signals, \
.sighand = &amp;init_sighand, \
.nsproxy = &amp;init_nsproxy, /* namespaces are set to init_nsproxy */ \
.pending = { \
.list = LIST_HEAD_INIT(tsk.pending.list), \
.signal = {{0}}}, \
.blocked = {{0}}, \
.alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \
.journal_info = NULL, \
.cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
.pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
.timer_slack_ns = 50000, /* 50 usec default slack */ \
.pids = { \
[PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \
[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \
[PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \
}, \
.thread_group = LIST_HEAD_INIT(tsk.thread_group), \
.thread_node = LIST_HEAD_INIT(init_signals.thread_head), \
INIT_IDS \
INIT_PERF_EVENTS(tsk) \
INIT_TRACE_IRQFLAGS \
INIT_LOCKDEP \
INIT_FTRACE_GRAPH \
INIT_TRACE_RECURSION \
INIT_TASK_RCU_PREEMPT(tsk) \
INIT_CPUSET_SEQ \
INIT_VTIME(tsk) \
}
// file: kernel/nsproxy.c
/*
* The namespace set that INIT_TASK assigns to the initial task. Each
* pointer refers to the corresponding init_* namespace object, except
* mnt_ns, which is NULL in this initializer.
*/
struct nsproxy init_nsproxy = {
.count = ATOMIC_INIT(1),
.uts_ns = &amp;init_uts_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
.ipc_ns = &amp;init_ipc_ns,
#endif
.mnt_ns = NULL,
.pid_ns = &amp;init_pid_ns,
#ifdef CONFIG_NET
.net_ns = &amp;init_net, // the initial network namespace
#endif
};
</code></pre>
<h2>init_net 初始化</h2>
<pre><code class="language-c">// file: net/core/net_namespace.c
/*
* The initial network namespace. Only dev_base_head is initialized
* statically here; net_ns_init() assigns its gen pointer and runs
* setup_net() on it.
*/
struct net init_net = {
.dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
};
EXPORT_SYMBOL(init_net);
/*
* Set up init_net: allocate its net_generic storage, run setup_net() on it
* under net_mutex, and link it into net_namespace_list. Any failure along
* the way panics. Always returns 0.
*/
static int __init net_ns_init(void)
{
struct net_generic *ng;
#ifdef CONFIG_NET_NS
net_cachep = kmem_cache_create(&quot;net_namespace&quot;, sizeof(struct net),
SMP_CACHE_BYTES,
SLAB_PANIC, NULL);
/* Create workqueue for cleanup */
netns_wq = create_singlethread_workqueue(&quot;netns&quot;);
if (!netns_wq)
panic(&quot;Could not create netns workq&quot;);
#endif
ng = net_alloc_generic();
if (!ng)
panic(&quot;Could not allocate generic netns&quot;);
rcu_assign_pointer(init_net.gen, ng);
mutex_lock(&amp;net_mutex);
if (setup_net(&amp;init_net, &amp;init_user_ns)) // initialize init_net; a non-zero result is fatal
panic(&quot;Could not setup the initial network namespace&quot;);
rtnl_lock();
list_add_tail_rcu(&amp;init_net.list, &amp;net_namespace_list); // add init_net to the global net_namespace_list
rtnl_unlock();
mutex_unlock(&amp;net_mutex);
register_pernet_subsys(&amp;net_ns_ops); // registers net_ns_ops as a pernet subsystem; net_ns_ops is defined outside this excerpt
return 0;
}
/*
* setup_net runs the initializers for the network namespace object.
*
* It resets the reference counts of net, records the owning user_ns, and
* then calls ops_init() for every entry on pernet_list. A negative result
* stops the loop and unwinds the entries that were already initialized.
* Returns 0 if pernet_list is empty, otherwise the value of the last
* ops_init() call that was made.
*/
static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
{
/* Must be called with net_mutex held */
const struct pernet_operations *ops, *saved_ops;
int error = 0;
LIST_HEAD(net_exit_list);
atomic_set(&amp;net-&gt;count, 1);
atomic_set(&amp;net-&gt;passive, 1);
net-&gt;dev_base_seq = 1;
net-&gt;user_ns = user_ns;
#ifdef NETNS_REFCNT_DEBUG
atomic_set(&amp;net-&gt;use_count, 0);
#endif
list_for_each_entry(ops, &amp;pernet_list, list) { // run the init of every registered subsystem; pernet_list is a global list
error = ops_init(ops, net);
if (error &lt; 0)
goto out_undo;
}
out:
return error;
out_undo:
/* Walk through the list backwards calling the exit functions
* for the pernet modules whose init functions did not fail.
*/
list_add(&amp;net-&gt;exit_list, &amp;net_exit_list);
saved_ops = ops; /* remember the failing entry so the second reverse walk starts from the same place */
list_for_each_entry_continue_reverse(ops, &amp;pernet_list, list)
ops_exit_list(ops, &amp;net_exit_list);
ops = saved_ops;
list_for_each_entry_continue_reverse(ops, &amp;pernet_list, list)
ops_free_list(ops, &amp;net_exit_list);
rcu_barrier();
goto out;
}</code></pre>
<h2>子系统 pernet ops</h2>
<p>以 IPv4 路由(FIB)子系统为例,看它如何把自己的 pernet_operations 注册到 pernet_list 中:</p>
<pre><code class="language-c">// file: net/ipv4/fib_frontend.c
/*
* Per-namespace hooks of the IPv4 FIB code; ip_fib_init() registers them
* with register_pernet_subsys().
*/
static struct pernet_operations fib_net_ops = {
.init = fib_net_init,
.exit = fib_net_exit,
};
/*
* Initialize the IPv4 FIB code: set up the trie, register the
* per-namespace operations, hook the netdevice and inetaddr notifiers,
* and register the rtnetlink handlers for the route messages.
*/
void __init ip_fib_init(void)
{
fib_trie_init();
register_pernet_subsys(&amp;fib_net_ops); // add fib_net_ops to pernet_list
register_netdevice_notifier(&amp;fib_netdev_notifier);
register_inetaddr_notifier(&amp;fib_inetaddr_notifier);
rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
}
</code></pre>