你的位置:首页 > 操作系统

[操作系统]Linux如何创建一个新进程


2016-03-31

张超《Linux内核分析》MOOC课程http://mooc.study.163.com/course/USTC-1000029000

Linux如何创建一个新进程

1.我们先阅读理解task_struct数据结构

1235struct task_struct {1236  volatile long state;  /* -1 unrunnable, 0 runnable, >0 stopped */1237  void *stack;1238  atomic_t usage;1239  unsigned int flags;  /* per process flags, defined below */1240  unsigned int ptrace;12411242#ifdef CONFIG_SMP1243  struct llist_node wake_entry;1244  int on_cpu;1245  struct task_struct *last_wakee;1246  unsigned long wakee_flips;1247  unsigned long wakee_flip_decay_ts;12481249  int wake_cpu;1250#endif1251  int on_rq;12521253  int prio, static_prio, normal_prio;1254  unsigned int rt_priority;1255  const struct sched_class *sched_class;1256  struct sched_entity se;1257  struct sched_rt_entity rt;1258#ifdef CONFIG_CGROUP_SCHED1259  struct task_group *sched_task_group;1260#endif1261  struct sched_dl_entity dl;12621263#ifdef CONFIG_PREEMPT_NOTIFIERS1264  /* list of struct preempt_notifier: */1265  struct hlist_head preempt_notifiers;1266#endif12671268#ifdef CONFIG_BLK_DEV_IO_TRACE1269  unsigned int btrace_seq;1270#endif12711272  unsigned int policy;1273  int nr_cpus_allowed;1274  cpumask_t cpus_allowed;12751276#ifdef CONFIG_PREEMPT_RCU1277  int rcu_read_lock_nesting;1278  union rcu_special rcu_read_unlock_special;1279  struct list_head rcu_node_entry;1280#endif /* #ifdef CONFIG_PREEMPT_RCU */1281#ifdef CONFIG_TREE_PREEMPT_RCU1282  struct rcu_node *rcu_blocked_node;1283#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */1284#ifdef CONFIG_TASKS_RCU1285  unsigned long rcu_tasks_nvcsw;1286  bool rcu_tasks_holdout;1287  struct list_head rcu_tasks_holdout_list;1288  int rcu_tasks_idle_cpu;1289#endif /* #ifdef CONFIG_TASKS_RCU */12901291#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)1292  struct sched_info sched_info;1293#endif12941295  struct list_head tasks;1296#ifdef CONFIG_SMP1297  struct plist_node pushable_tasks;1298  struct rb_node pushable_dl_tasks;1299#endif13001301  struct mm_struct *mm, *active_mm;1302#ifdef CONFIG_COMPAT_BRK1303  unsigned brk_randomized:1;1304#endif1305  /* per-thread vma caching */1306  u32 
vmacache_seqnum;1307  struct vm_area_struct *vmacache[VMACACHE_SIZE];1308#if defined(SPLIT_RSS_COUNTING)1309  struct task_rss_stat  rss_stat;1310#endif1311/* task state */1312  int exit_state;1313  int exit_code, exit_signal;1314  int pdeath_signal; /* The signal sent when the parent dies */1315  unsigned int jobctl;  /* JOBCTL_*, siglock protected */13161317  /* Used for emulating ABI behavior of previous Linux versions */1318  unsigned int personality;13191320  unsigned in_execve:1;  /* Tell the LSMs that the process is doing an1321         * execve */1322  unsigned in_iowait:1;13231324  /* Revert to default priority/policy when forking */1325  unsigned sched_reset_on_fork:1;1326  unsigned sched_contributes_to_load:1;13271328  unsigned long atomic_flags; /* Flags needing atomic access. */13291330  pid_t pid;1331  pid_t tgid;13321333#ifdef CONFIG_CC_STACKPROTECTOR1334  /* Canary value for the -fstack-protector gcc feature */1335  unsigned long stack_canary;1336#endif1337  /*1338   * pointers to (original) parent process, youngest child, younger sibling,1339   * older sibling, respectively. (p->father can be replaced with1340   * p->real_parent->pid)1341   */1342  struct task_struct __rcu *real_parent; /* real parent process */1343  struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */1344  /*1345   * children/sibling forms the list of my natural children1346   */1347  struct list_head children;  /* list of my children */1348  struct list_head sibling;  /* linkage in my parent's children list */1349  struct task_struct *group_leader;  /* threadgroup leader */13501351  /*1352   * ptraced is the list of tasks this task is using ptrace on.1353   * This includes both natural children and PTRACE_ATTACH targets.1354   * p->ptrace_entry is p's link on the p->parent->ptraced list.1355   */1356  struct list_head ptraced;1357  struct list_head ptrace_entry;13581359  /* PID/PID hash table linkage. 
*/1360  struct pid_link pids[PIDTYPE_MAX];1361  struct list_head thread_group;1362  struct list_head thread_node;13631364  struct completion *vfork_done;    /* for vfork() */1365  int __user *set_child_tid;    /* CLONE_CHILD_SETTID */1366  int __user *clear_child_tid;    /* CLONE_CHILD_CLEARTID */13671368  cputime_t utime, stime, utimescaled, stimescaled;1369  cputime_t gtime;1370#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE1371  struct cputime prev_cputime;1372#endif1373#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN1374  seqlock_t vtime_seqlock;1375  unsigned long long vtime_snap;1376  enum {1377    VTIME_SLEEPING = 0,1378    VTIME_USER,1379    VTIME_SYS,1380  } vtime_snap_whence;1381#endif1382  unsigned long nvcsw, nivcsw; /* context switch counts */1383  u64 start_time;    /* monotonic time in nsec */1384  u64 real_start_time;  /* boot based time in nsec */1385/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */1386  unsigned long min_flt, maj_flt;13871388  struct task_cputime cputime_expires;1389  struct list_head cpu_timers[3];13901391/* process credentials */1392  const struct cred __rcu *real_cred; /* objective and real subjective task1393           * credentials (COW) */1394  const struct cred __rcu *cred;  /* effective (overridable) subjective task1395           * credentials (COW) */1396  char comm[TASK_COMM_LEN]; /* executable name excluding path1397           - access with [gs]et_task_comm (which lock1398            it with task_lock())1399           - initialized normally by setup_new_exec */1400/* file system info */1401  int link_count, total_link_count;1402#ifdef CONFIG_SYSVIPC1403/* ipc stuff */1404  struct sysv_sem sysvsem;1405  struct sysv_shm sysvshm;1406#endif1407#ifdef CONFIG_DETECT_HUNG_TASK1408/* hung task detection */1409  unsigned long last_switch_count;1410#endif1411/* CPU-specific state of this task */1412  struct thread_struct thread;1413/* filesystem information */1414  struct fs_struct *fs;1415/* open 
file information */1416  struct files_struct *files;1417/* namespaces */1418  struct nsproxy *nsproxy;1419/* signal handlers */1420  struct signal_struct *signal;1421  struct sighand_struct *sighand;14221423  sigset_t blocked, real_blocked;1424  sigset_t saved_sigmask;  /* restored if set_restore_sigmask() was used */1425  struct sigpending pending;14261427  unsigned long sas_ss_sp;1428  size_t sas_ss_size;1429  int (*notifier)(void *priv);1430  void *notifier_data;1431  sigset_t *notifier_mask;1432  struct callback_head *task_works;14331434  struct audit_context *audit_context;1435#ifdef CONFIG_AUDITSYSCALL1436  kuid_t loginuid;1437  unsigned int sessionid;1438#endif1439  struct seccomp seccomp;14401441/* Thread group tracking */1442    u32 parent_exec_id;1443    u32 self_exec_id;1444/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,1445 * mempolicy */1446  spinlock_t alloc_lock;14471448  /* Protection of the PI data structures: */1449  raw_spinlock_t pi_lock;14501451#ifdef CONFIG_RT_MUTEXES1452  /* PI waiters blocked on a rt_mutex held by this task */1453  struct rb_root pi_waiters;1454  struct rb_node *pi_waiters_leftmost;1455  /* Deadlock detection and priority inheritance handling */1456  struct rt_mutex_waiter *pi_blocked_on;1457#endif14581459#ifdef CONFIG_DEBUG_MUTEXES1460  /* mutex deadlock detection */1461  struct mutex_waiter *blocked_on;1462#endif1463#ifdef CONFIG_TRACE_IRQFLAGS1464  unsigned int irq_events;1465  unsigned long hardirq_enable_ip;1466  unsigned long hardirq_disable_ip;1467  unsigned int hardirq_enable_event;1468  unsigned int hardirq_disable_event;1469  int hardirqs_enabled;1470  int hardirq_context;1471  unsigned long softirq_disable_ip;1472  unsigned long softirq_enable_ip;1473  unsigned int softirq_disable_event;1474  unsigned int softirq_enable_event;1475  int softirqs_enabled;1476  int softirq_context;1477#endif1478#ifdef CONFIG_LOCKDEP1479# define MAX_LOCK_DEPTH 48UL1480  u64 curr_chain_key;1481  int 
lockdep_depth;1482  unsigned int lockdep_recursion;1483  struct held_lock held_locks[MAX_LOCK_DEPTH];1484  gfp_t lockdep_reclaim_gfp;1485#endif14861487/* journalling filesystem info */1488  void *journal_info;14891490/* stacked block device info */1491  struct bio_list *bio_list;14921493#ifdef CONFIG_BLOCK1494/* stack plugging */1495  struct blk_plug *plug;1496#endif14971498/* VM state */1499  struct reclaim_state *reclaim_state;15001501  struct backing_dev_info *backing_dev_info;15021503  struct io_context *io_context;15041505  unsigned long ptrace_message;1506  siginfo_t *last_siginfo; /* For ptrace use. */1507  struct task_io_accounting ioac;1508#if defined(CONFIG_TASK_XACCT)1509  u64 acct_rss_mem1;  /* accumulated rss usage */1510  u64 acct_vm_mem1;  /* accumulated virtual memory usage */1511  cputime_t acct_timexpd;  /* stime + utime since last update */1512#endif1513#ifdef CONFIG_CPUSETS1514  nodemask_t mems_allowed;  /* Protected by alloc_lock */1515  seqcount_t mems_allowed_seq;  /* Seqence no to catch updates */1516  int cpuset_mem_spread_rotor;1517  int cpuset_slab_spread_rotor;1518#endif1519#ifdef CONFIG_CGROUPS1520  /* Control Group info protected by css_set_lock */1521  struct css_set __rcu *cgroups;1522  /* cg_list protected by css_set_lock and tsk->alloc_lock */1523  struct list_head cg_list;1524#endif1525#ifdef CONFIG_FUTEX1526  struct robust_list_head __user *robust_list;1527#ifdef CONFIG_COMPAT1528  struct compat_robust_list_head __user *compat_robust_list;1529#endif1530  struct list_head pi_state_list;1531  struct futex_pi_state *pi_state_cache;1532#endif1533#ifdef CONFIG_PERF_EVENTS1534  struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];1535  struct mutex perf_event_mutex;1536  struct list_head perf_event_list;1537#endif1538#ifdef CONFIG_DEBUG_PREEMPT1539  unsigned long preempt_disable_ip;1540#endif1541#ifdef CONFIG_NUMA1542  struct mempolicy *mempolicy;  /* Protected by alloc_lock */1543  short il_next;1544  short 
pref_node_fork;1545#endif1546#ifdef CONFIG_NUMA_BALANCING1547  int numa_scan_seq;1548  unsigned int numa_scan_period;1549  unsigned int numa_scan_period_max;1550  int numa_preferred_nid;1551  unsigned long numa_migrate_retry;1552  u64 node_stamp;      /* migration stamp */1553  u64 last_task_numa_placement;1554  u64 last_sum_exec_runtime;1555  struct callback_head numa_work;15561557  struct list_head numa_entry;1558  struct numa_group *numa_group;15591560  /*1561   * Exponential decaying average of faults on a per-node basis.1562   * Scheduling placement decisions are made based on the these counts.1563   * The values remain static for the duration of a PTE scan1564   */1565  unsigned long *numa_faults_memory;1566  unsigned long total_numa_faults;15671568  /*1569   * numa_faults_buffer records faults per node during the current1570   * scan window. When the scan completes, the counts in1571   * numa_faults_memory decay and these values are copied.1572   */1573  unsigned long *numa_faults_buffer_memory;15741575  /*1576   * Track the nodes the process was running on when a NUMA hinting1577   * fault was incurred.1578   */1579  unsigned long *numa_faults_cpu;1580  unsigned long *numa_faults_buffer_cpu;15811582  /*1583   * numa_faults_locality tracks if faults recorded during the last1584   * scan window were remote/local. 
The task scan period is adapted1585   * based on the locality of the faults with different weights1586   * depending on whether they were shared or private faults1587   */1588  unsigned long numa_faults_locality[2];15891590  unsigned long numa_pages_migrated;1591#endif /* CONFIG_NUMA_BALANCING */15921593  struct rcu_head rcu;15941595  /*1596   * cache last used pipe for splice1597   */1598  struct pipe_inode_info *splice_pipe;15991600  struct page_frag task_frag;16011602#ifdef  CONFIG_TASK_DELAY_ACCT1603  struct task_delay_info *delays;1604#endif1605#ifdef CONFIG_FAULT_INJECTION1606  int make_it_fail;1607#endif1608  /*1609   * when (nr_dirtied >= nr_dirtied_pause), it's time to call1610   * balance_dirty_pages() for some dirty throttling pause1611   */1612  int nr_dirtied;1613  int nr_dirtied_pause;1614  unsigned long dirty_paused_when; /* start of a write-and-pause period */16151616#ifdef CONFIG_LATENCYTOP1617  int latency_record_count;1618  struct latency_record latency_record[LT_SAVECOUNT];1619#endif1620  /*1621   * time slack values; these are used to round up poll() and1622   * select() etc timeout values. 
These are in nanoseconds.1623   */1624  unsigned long timer_slack_ns;1625  unsigned long default_timer_slack_ns;16261627#ifdef CONFIG_FUNCTION_GRAPH_TRACER1628  /* Index of current stored address in ret_stack */1629  int curr_ret_stack;1630  /* Stack of return addresses for return function tracing */1631  struct ftrace_ret_stack  *ret_stack;1632  /* time stamp for last schedule */1633  unsigned long long ftrace_timestamp;1634  /*1635   * Number of functions that haven't been traced1636   * because of depth overrun.1637   */1638  atomic_t trace_overrun;1639  /* Pause for the tracing */1640  atomic_t tracing_graph_pause;1641#endif1642#ifdef CONFIG_TRACING1643  /* state flags for use by tracers */1644  unsigned long trace;1645  /* bitmask and counter of trace recursion */1646  unsigned long trace_recursion;1647#endif /* CONFIG_TRACING */1648#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */1649  unsigned int memcg_kmem_skip_account;1650  struct memcg_oom_info {1651    struct mem_cgroup *memcg;1652    gfp_t gfp_mask;1653    int order;1654    unsigned int may_oom:1;1655  } memcg_oom;1656#endif1657#ifdef CONFIG_UPROBES1658  struct uprobe_task *utask;1659#endif1660#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)1661  unsigned int  sequential_io;1662  unsigned int  sequential_io_avg;1663#endif1664};

task_struct

关于task_struct的具体介绍,见

http://blog.csdn.net/npy_lp/article/details/7292563

它定义在linux-3.18.6/include/linux/sched.h文件中。

进程(Process)是系统进行资源分配和调度的基本单位,一个进程是一个程序的运行实例。而在Linux中,可以使用一个进程来创建另外一个进程。这样的话,Linux的进程的组织结构其实有点像Linux目录树,是个层次结构的,可以使用pstree命令来查看。在最上面是init程序的执行进程,它是所有进程的老祖宗。Linux提供了两个函数来创建进程。

1.fork() 

fork()提供了创建进程的基本操作,可以说它是Linux系统多任务的基础。该函数的内核实现位于linux-3.18.6/kernel/fork.c中。

2.exec系列函数

如果只有fork(),肯定是不完美的,因为fork()只能产生一个父进程的副本。而exec系列函数则可以帮助我们建立一个全新的新进程。

在Linux系统中,一个进程的PCB是用一个C语言的结构体task_struct来表示的,而多个PCB之间是由一个双向链表组织起来的,在《Understanding the Linux Kernel》中,则进一步描述这个链表是一个双向循环链表。

在Linux中创建一个新进程的方法是使用fork函数,fork()执行一次但有两个返回值。

在父进程中,返回值是子进程的进程号;在子进程中,返回值为0。因此可通过返回值来判断当前进程是父进程还是子进程。

使用fork函数得到的子进程是父进程的一个复制品,它从父进程处复制了整个进程的地址空间,包括进程上下文,进程堆栈,内存信息,打开的文件描述符,信号控制设定,进程优先级,进程组号,当前工作目录,根目录,资源限制,控制终端等。而子进程所独有的只是它的进程号,资源使用和计时器等。可以看出,使用fork函数的代价是很大的,它复制了父进程中的代码段,数据段和堆栈段里的大部分内容,使得fork函数的执行速度并不快。(注:现代Linux采用写时复制(copy-on-write)技术,fork时只复制页表等管理结构,物理内存页要等到父进程或子进程写入时才真正复制,因此实际开销远小于完整复制整个地址空间。)

创建一个进程,至少涉及的函数:

sys_clone, do_fork, dup_task_struct, copy_process, copy_thread, ret_from_fork


 这只是图中的fork一个分支

学习笔记

进程的描述

1.进程描述符task_struct数据结构(一)

为了管理进程,内核必须对每个进程进行清晰的描述,进程描述符提供了内核所需了解的进程信息。

  • struct task_struct数据结构很庞大
  • Linux进程的状态与操作系统原理中的描述的进程状态似乎有所不同,比如就绪状态和运行状态都是TASK_RUNNING,为什么呢?
  • 进程的标识pid
  • 所有进程链表struct list_head tasks;     内核的双向循环链表的实现方法 - 一个更简略的双向循环链表
  • 程序创建的进程具有父子关系,在编程时往往需要引用这样的父子关系。进程描述符中有几个域用来表示这样的关系
  • Linux为每个进程分配一个8KB大小的内存区域,用于存放该进程两个不同的数据结构:thread_info和进程的内核堆栈

      进程处于内核态时使用,不同于用户态堆栈,即PCB中指定了内核栈,那为什么PCB中没有用户态堆栈?用户态堆栈是怎么设定的?

      内核控制路径所用的堆栈很少,因此对栈和thread_info来说,8KB足够了

  •  struct thread_struct thread; //CPU-specific state of this task
  • 文件系统和文件描述符
  • 内存管理——进程的地址空间

进程状态的切换过程和原因大致如下图:

 

双向循环链表图如下:

 

进程的父子关系直观图:

 

进程的创建

1.进程的创建概览及fork一个进程的用户态代码

(1)进程的起源再回顾

  • 道生一(start_kernel...cpu_idle)
  • 一生二(kernel_init和kthreadd)
  • 二生三(即前面的0、1、2三个进程)
  • 三生万物(1号进程是所有用户态进程的祖先,2号进程是所有内核线程的祖先)

(2)0号进程手工写,1号进程复制、加载init程序

(3)shell命令行是如何启动进程的

fork一个子进程的代码:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
int main(int argc, char * argv[])
{
    int pid;
    /* fork another process */
    pid = fork();
    if (pid < 0)  /* 出错处理 */
    {
        /* error occurred */
        fprintf(stderr,"Fork Failed!");
        exit(-1);
    }
    else if (pid == 0)
    {
        /* child process:子进程中fork的返回值为0。fork系统调用在父进程和子进程中各返回一次,
           因此从两个进程整体来看,else if分支(子进程)和else分支(父进程)都会被执行 */
        printf("This is Child Process!\n");
    }
    else
    {
        /* parent process */
        printf("This is Parent Process!\n");
        /* parent will wait for the child to complete*/
        wait(NULL);
        printf("Child Complete!\n");
    }
}

View Code

 

2.理解进程创建过程复杂代码的方法

(1)系统调用再回顾

(2)fork的子进程是从哪里开始执行的?

与基于mykernel写的精简内核对照起来。

(3)创建一个新进程在内核中的执行过程

  • fork、vfork和clone三个系统调用都可以创建一个新进程,而且都是通过调用do_fork来实现进程的创建;
  • Linux通过复制父进程来创建一个新进程,那么这就给我们理解这一个过程提供一个想象的框架:
  • 复制一个PCB——task_struct
    err = arch_dup_task_struct(tsk, orig);

  • 要给新进程分配一个新的内核堆栈
    ti = alloc_thread_info_node(tsk, node);
    tsk->stack = ti;
    setup_thread_stack(tsk, orig); //这里只是复制thread_info,而非复制内核堆栈


  • 要修改复制过来的进程数据,比如pid、进程链表等等都要改改吧,见copy_process内部。
  • 从用户态的代码看,fork()函数返回了两次,即在父子进程中各返回一次。父进程从系统调用中返回比较容易理解;子进程也要从系统调用中返回,那它是从系统调用处理过程中的哪里开始执行的呢?这就涉及子进程的内核堆栈数据状态和task_struct中thread记录的sp和ip的一致性问题。这是在哪里设定的?答案是copy_process中调用的copy_thread:
    *childregs = *current_pt_regs(); //复制内核堆栈
    childregs->ax = 0; //为什么子进程的fork返回0,这里就是原因!

    p->thread.sp = (unsigned long) childregs; //调度到子进程时的内核栈顶
    p->thread.ip = (unsigned long) ret_from_fork; //调度到子进程时的第一条指令地址


(4)理解复杂事物要预设一个大致的框架。

(5)创建新进程是通过复制当前进程来实现的。

(6)设想创建新进程过程中需要做哪些事

3.浏览进程创建过程相关的关键代码

(1)系统调用内核处理函数sys_fork、sys_clone、sys_vfork

最终都是执行do_fork()。

do_fork()里的复制进程的函数:

 

 

具体:

打开复制PCB的具体函数:

打开alloc_thread_info():

 

拷贝内核堆栈数据和指定新进程的第一条指令地址。

4.创建的新进程是从哪里开始执行的?

(1)复制内核堆栈时

打开pt_regs:

int指令和SAVE_ALL压到内核栈的内容。

下面分析entry_32.S,也就是总控程序。

5.使用gdb跟踪创建新进程的过程(见作业)



实验:

1、流程

 

添加fork()到MenuOS

 

编译并启动MenuOS

 

用GDB连接,添加breakpoints,

 

根据观察copy_process是建立新进程,

 

wake_up_new_task则是运行这个新进程,所以要尝试添加这样一个断点

 

breakpoints list:b sys_clone

 

b sys_clone

b do_fork

 

b copy_process

 

b dup_task_struct

 

b alloc_task_struct_node

 

b arch_dup_task_struct

 

b copy_thread

 

b ret_from_fork

 

b wake_up_new_task

 跟踪fork执行

2、实验记录

2.1 添加并验证fork()可用

2.2 跟踪fork

 

四、总结

fork创建的新进程是和父进程(除了PID和PPID)一样的副本,包括真实和有效的UID和GID、进程组和会话ID、环境、资源限制、打开的文件以及共享内存段。

根据代码的分析,在do_fork中,copy_process负责完成子进程运行前的各项准备工作(复制task_struct、分配内核堆栈、设置thread.sp和thread.ip等),wake_up_new_task则把子进程加入运行队列等待调度,标志着fork过程的完成。