2016-03-31 張超《Linux內核分析》MOOC課程http://mooc.study.163.com/course/USTC-1000029000 Linux如何創建一個新進程 1.我們先閱讀理解task_struct數據結構 1235struct task_struct { 1236 v ...
2016-03-31
張超《Linux內核分析》MOOC課程http://mooc.study.163.com/course/USTC-1000029000
Linux如何創建一個新進程
1.我們先閱讀理解task_struct數據結構
1235struct task_struct { 1236 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 1237 void *stack; 1238 atomic_t usage; 1239 unsigned int flags; /* per process flags, defined below */ 1240 unsigned int ptrace; 1241 1242#ifdef CONFIG_SMP 1243 struct llist_node wake_entry; 1244 int on_cpu; 1245 struct task_struct *last_wakee; 1246 unsigned long wakee_flips; 1247 unsigned long wakee_flip_decay_ts; 1248 1249 int wake_cpu; 1250#endif 1251 int on_rq; 1252 1253 int prio, static_prio, normal_prio; 1254 unsigned int rt_priority; 1255 const struct sched_class *sched_class; 1256 struct sched_entity se; 1257 struct sched_rt_entity rt; 1258#ifdef CONFIG_CGROUP_SCHED 1259 struct task_group *sched_task_group; 1260#endif 1261 struct sched_dl_entity dl; 1262 1263#ifdef CONFIG_PREEMPT_NOTIFIERS 1264 /* list of struct preempt_notifier: */ 1265 struct hlist_head preempt_notifiers; 1266#endif 1267 1268#ifdef CONFIG_BLK_DEV_IO_TRACE 1269 unsigned int btrace_seq; 1270#endif 1271 1272 unsigned int policy; 1273 int nr_cpus_allowed; 1274 cpumask_t cpus_allowed; 1275 1276#ifdef CONFIG_PREEMPT_RCU 1277 int rcu_read_lock_nesting; 1278 union rcu_special rcu_read_unlock_special; 1279 struct list_head rcu_node_entry; 1280#endif /* #ifdef CONFIG_PREEMPT_RCU */ 1281#ifdef CONFIG_TREE_PREEMPT_RCU 1282 struct rcu_node *rcu_blocked_node; 1283#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 1284#ifdef CONFIG_TASKS_RCU 1285 unsigned long rcu_tasks_nvcsw; 1286 bool rcu_tasks_holdout; 1287 struct list_head rcu_tasks_holdout_list; 1288 int rcu_tasks_idle_cpu; 1289#endif /* #ifdef CONFIG_TASKS_RCU */ 1290 1291#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1292 struct sched_info sched_info; 1293#endif 1294 1295 struct list_head tasks; 1296#ifdef CONFIG_SMP 1297 struct plist_node pushable_tasks; 1298 struct rb_node pushable_dl_tasks; 1299#endif 1300 1301 struct mm_struct *mm, *active_mm; 1302#ifdef CONFIG_COMPAT_BRK 1303 unsigned brk_randomized:1; 1304#endif 1305 /* per-thread vma 
caching */ 1306 u32 vmacache_seqnum; 1307 struct vm_area_struct *vmacache[VMACACHE_SIZE]; 1308#if defined(SPLIT_RSS_COUNTING) 1309 struct task_rss_stat rss_stat; 1310#endif 1311/* task state */ 1312 int exit_state; 1313 int exit_code, exit_signal; 1314 int pdeath_signal; /* The signal sent when the parent dies */ 1315 unsigned int jobctl; /* JOBCTL_*, siglock protected */ 1316 1317 /* Used for emulating ABI behavior of previous Linux versions */ 1318 unsigned int personality; 1319 1320 unsigned in_execve:1; /* Tell the LSMs that the process is doing an 1321 * execve */ 1322 unsigned in_iowait:1; 1323 1324 /* Revert to default priority/policy when forking */ 1325 unsigned sched_reset_on_fork:1; 1326 unsigned sched_contributes_to_load:1; 1327 1328 unsigned long atomic_flags; /* Flags needing atomic access. */ 1329 1330 pid_t pid; 1331 pid_t tgid; 1332 1333#ifdef CONFIG_CC_STACKPROTECTOR 1334 /* Canary value for the -fstack-protector gcc feature */ 1335 unsigned long stack_canary; 1336#endif 1337 /* 1338 * pointers to (original) parent process, youngest child, younger sibling, 1339 * older sibling, respectively. (p->father can be replaced with 1340 * p->real_parent->pid) 1341 */ 1342 struct task_struct __rcu *real_parent; /* real parent process */ 1343 struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */ 1344 /* 1345 * children/sibling forms the list of my natural children 1346 */ 1347 struct list_head children; /* list of my children */ 1348 struct list_head sibling; /* linkage in my parent's children list */ 1349 struct task_struct *group_leader; /* threadgroup leader */ 1350 1351 /* 1352 * ptraced is the list of tasks this task is using ptrace on. 1353 * This includes both natural children and PTRACE_ATTACH targets. 1354 * p->ptrace_entry is p's link on the p->parent->ptraced list. 1355 */ 1356 struct list_head ptraced; 1357 struct list_head ptrace_entry; 1358 1359 /* PID/PID hash table linkage. 
*/ 1360 struct pid_link pids[PIDTYPE_MAX]; 1361 struct list_head thread_group; 1362 struct list_head thread_node; 1363 1364 struct completion *vfork_done; /* for vfork() */ 1365 int __user *set_child_tid; /* CLONE_CHILD_SETTID */ 1366 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ 1367 1368 cputime_t utime, stime, utimescaled, stimescaled; 1369 cputime_t gtime; 1370#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 1371 struct cputime prev_cputime; 1372#endif 1373#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 1374 seqlock_t vtime_seqlock; 1375 unsigned long long vtime_snap; 1376 enum { 1377 VTIME_SLEEPING = 0, 1378 VTIME_USER, 1379 VTIME_SYS, 1380 } vtime_snap_whence; 1381#endif 1382 unsigned long nvcsw, nivcsw; /* context switch counts */ 1383 u64 start_time; /* monotonic time in nsec */ 1384 u64 real_start_time; /* boot based time in nsec */ 1385/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ 1386 unsigned long min_flt, maj_flt; 1387 1388 struct task_cputime cputime_expires; 1389 struct list_head cpu_timers[3]; 1390 1391/* process credentials */ 1392 const struct cred __rcu *real_cred; /* objective and real subjective task 1393 * credentials (COW) */ 1394 const struct cred __rcu *cred; /* effective (overridable) subjective task 1395 * credentials (COW) */ 1396 char comm[TASK_COMM_LEN]; /* executable name excluding path 1397 - access with [gs]et_task_comm (which lock 1398 it with task_lock()) 1399 - initialized normally by setup_new_exec */ 1400/* file system info */ 1401 int link_count, total_link_count; 1402#ifdef CONFIG_SYSVIPC 1403/* ipc stuff */ 1404 struct sysv_sem sysvsem; 1405 struct sysv_shm sysvshm; 1406#endif 1407#ifdef CONFIG_DETECT_HUNG_TASK 1408/* hung task detection */ 1409 unsigned long last_switch_count; 1410#endif 1411/* CPU-specific state of this task */ 1412 struct thread_struct thread; 1413/* filesystem information */ 1414 struct fs_struct *fs; 1415/* open file information */ 1416 struct files_struct 
*files; 1417/* namespaces */ 1418 struct nsproxy *nsproxy; 1419/* signal handlers */ 1420 struct signal_struct *signal; 1421 struct sighand_struct *sighand; 1422 1423 sigset_t blocked, real_blocked; 1424 sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ 1425 struct sigpending pending; 1426 1427 unsigned long sas_ss_sp; 1428 size_t sas_ss_size; 1429 int (*notifier)(void *priv); 1430 void *notifier_data; 1431 sigset_t *notifier_mask; 1432 struct callback_head *task_works; 1433 1434 struct audit_context *audit_context; 1435#ifdef CONFIG_AUDITSYSCALL 1436 kuid_t loginuid; 1437 unsigned int sessionid; 1438#endif 1439 struct seccomp seccomp; 1440 1441/* Thread group tracking */ 1442 u32 parent_exec_id; 1443 u32 self_exec_id; 1444/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, 1445 * mempolicy */ 1446 spinlock_t alloc_lock; 1447 1448 /* Protection of the PI data structures: */ 1449 raw_spinlock_t pi_lock; 1450 1451#ifdef CONFIG_RT_MUTEXES 1452 /* PI waiters blocked on a rt_mutex held by this task */ 1453 struct rb_root pi_waiters; 1454 struct rb_node *pi_waiters_leftmost; 1455 /* Deadlock detection and priority inheritance handling */ 1456 struct rt_mutex_waiter *pi_blocked_on; 1457#endif 1458 1459#ifdef CONFIG_DEBUG_MUTEXES 1460 /* mutex deadlock detection */ 1461 struct mutex_waiter *blocked_on; 1462#endif 1463#ifdef CONFIG_TRACE_IRQFLAGS 1464 unsigned int irq_events; 1465 unsigned long hardirq_enable_ip; 1466 unsigned long hardirq_disable_ip; 1467 unsigned int hardirq_enable_event; 1468 unsigned int hardirq_disable_event; 1469 int hardirqs_enabled; 1470 int hardirq_context; 1471 unsigned long softirq_disable_ip; 1472 unsigned long softirq_enable_ip; 1473 unsigned int softirq_disable_event; 1474 unsigned int softirq_enable_event; 1475 int softirqs_enabled; 1476 int softirq_context; 1477#endif 1478#ifdef CONFIG_LOCKDEP 1479# define MAX_LOCK_DEPTH 48UL 1480 u64 curr_chain_key; 1481 int lockdep_depth; 1482 unsigned int 
lockdep_recursion; 1483 struct held_lock held_locks[MAX_LOCK_DEPTH]; 1484 gfp_t lockdep_reclaim_gfp; 1485#endif 1486 1487/* journalling filesystem info */ 1488 void *journal_info; 1489 1490/* stacked block device info */ 1491 struct bio_list *bio_list; 1492 1493#ifdef CONFIG_BLOCK 1494/* stack plugging */ 1495 struct blk_plug *plug; 1496#endif 1497 1498/* VM state */ 1499 struct reclaim_state *reclaim_state; 1500 1501 struct backing_dev_info *backing_dev_info; 1502 1503 struct io_context *io_context; 1504 1505 unsigned long ptrace_message; 1506 siginfo_t *last_siginfo; /* For ptrace use. */ 1507 struct task_io_accounting ioac; 1508#if defined(CONFIG_TASK_XACCT) 1509 u64 acct_rss_mem1; /* accumulated rss usage */ 1510 u64 acct_vm_mem1; /* accumulated virtual memory usage */ 1511 cputime_t acct_timexpd; /* stime + utime since last update */ 1512#endif 1513#ifdef CONFIG_CPUSETS 1514 nodemask_t mems_allowed; /* Protected by alloc_lock */ 1515 seqcount_t mems_allowed_seq; /* Seqence no to catch updates */ 1516 int cpuset_mem_spread_rotor; 1517 int cpuset_slab_spread_rotor; 1518#endif 1519#ifdef CONFIG_CGROUPS 1520 /* Control Group info protected by css_set_lock */ 1521 struct css_set __rcu *cgroups; 1522 /* cg_list protected by css_set_lock and tsk->alloc_lock */ 1523 struct list_head cg_list; 1524#endif 1525#ifdef CONFIG_FUTEX 1526 struct robust_list_head __user *robust_list; 1527#ifdef CONFIG_COMPAT 1528 struct compat_robust_list_head __user *compat_robust_list; 1529#endif 1530 struct list_head pi_state_list; 1531 struct futex_pi_state *pi_state_cache; 1532#endif 1533#ifdef CONFIG_PERF_EVENTS 1534 struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; 1535 struct mutex perf_event_mutex; 1536 struct list_head perf_event_list; 1537#endif 1538#ifdef CONFIG_DEBUG_PREEMPT 1539 unsigned long preempt_disable_ip; 1540#endif 1541#ifdef CONFIG_NUMA 1542 struct mempolicy *mempolicy; /* Protected by alloc_lock */ 1543 short il_next; 1544 short pref_node_fork; 
1545#endif 1546#ifdef CONFIG_NUMA_BALANCING 1547 int numa_scan_seq; 1548 unsigned int numa_scan_period; 1549 unsigned int numa_scan_period_max; 1550 int numa_preferred_nid; 1551 unsigned long numa_migrate_retry; 1552 u64 node_stamp; /* migration stamp */ 1553 u64 last_task_numa_placement; 1554 u64 last_sum_exec_runtime; 1555 struct callback_head numa_work; 1556 1557 struct list_head numa_entry; 1558 struct numa_group *numa_group; 1559 1560 /* 1561 * Exponential decaying average of faults on a per-node basis. 1562 * Scheduling placement decisions are made based on the these counts. 1563 * The values remain static for the duration of a PTE scan 1564 */ 1565 unsigned long *numa_faults_memory; 1566 unsigned long total_numa_faults; 1567 1568 /* 1569 * numa_faults_buffer records faults per node during the current 1570 * scan window. When the scan completes, the counts in 1571 * numa_faults_memory decay and these values are copied. 1572 */ 1573 unsigned long *numa_faults_buffer_memory; 1574 1575 /* 1576 * Track the nodes the process was running on when a NUMA hinting 1577 * fault was incurred. 1578 */ 1579 unsigned long *numa_faults_cpu; 1580 unsigned long *numa_faults_buffer_cpu; 1581 1582 /* 1583 * numa_faults_locality tracks if faults recorded during the last 1584 * scan window were remote/local. 
The task scan period is adapted 1585 * based on the locality of the faults with different weights 1586 * depending on whether they were shared or private faults 1587 */ 1588 unsigned long numa_faults_locality[2]; 1589 1590 unsigned long numa_pages_migrated; 1591#endif /* CONFIG_NUMA_BALANCING */ 1592 1593 struct rcu_head rcu; 1594 1595 /* 1596 * cache last used pipe for splice 1597 */ 1598 struct pipe_inode_info *splice_pipe; 1599 1600 struct page_frag task_frag; 1601 1602#ifdef CONFIG_TASK_DELAY_ACCT 1603 struct task_delay_info *delays; 1604#endif 1605#ifdef CONFIG_FAULT_INJECTION 1606 int make_it_fail; 1607#endif 1608 /* 1609 * when (nr_dirtied >= nr_dirtied_pause), it's time to call 1610 * balance_dirty_pages() for some dirty throttling pause 1611 */ 1612 int nr_dirtied; 1613 int nr_dirtied_pause; 1614 unsigned long dirty_paused_when; /* start of a write-and-pause period */ 1615 1616#ifdef CONFIG_LATENCYTOP 1617 int latency_record_count; 1618 struct latency_record latency_record[LT_SAVECOUNT]; 1619#endif 1620 /* 1621 * time slack values; these are used to round up poll() and 1622 * select() etc timeout values. These are in nanoseconds. 1623 */ 1624 unsigned long timer_slack_ns; 1625 unsigned long default_timer_slack_ns; 1626 1627#ifdef CONFIG_FUNCTION_GRAPH_TRACER 1628 /* Index of current stored address in ret_stack */ 1629 int curr_ret_stack; 1630 /* Stack of return addresses for return function tracing */ 1631 struct ftrace_ret_stack *ret_stack; 1632 /* time stamp for last schedule */ 1633 unsigned long long ftrace_timestamp; 1634 /* 1635 * Number of functions that haven't been traced 1636 * because of depth overrun. 
1637 */ 1638 atomic_t trace_overrun; 1639 /* Pause for the tracing */ 1640 atomic_t tracing_graph_pause; 1641#endif 1642#ifdef CONFIG_TRACING 1643 /* state flags for use by tracers */ 1644 unsigned long trace; 1645 /* bitmask and counter of trace recursion */ 1646 unsigned long trace_recursion; 1647#endif /* CONFIG_TRACING */ 1648#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ 1649 unsigned int memcg_kmem_skip_account; 1650 struct memcg_oom_info { 1651 struct mem_cgroup *memcg; 1652 gfp_t gfp_mask; 1653 int order; 1654 unsigned int may_oom:1; 1655 } memcg_oom; 1656#endif 1657#ifdef CONFIG_UPROBES 1658 struct uprobe_task *utask; 1659#endif 1660#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) 1661 unsigned int sequential_io; 1662 unsigned int sequential_io_avg; 1663#endif 1664};task_struct
關於task_struct的具體介紹,見
http://blog.csdn.net/npy_lp/article/details/7292563
它定義在linux-3.18.6/include/linux/sched.h文件中。
進程(Process)是系統進行資源分配和調度的基本單位,一個進程是一個程式的運行實例。而在Linux中,可以使用一個進程來創建另外一個進程。這樣的話,Linux進程的組織結構其實有點像Linux目錄樹,是個層次結構的,可以使用pstree命令來查看。在最上面是init程式的執行進程。它是所有進程的老祖宗。Linux提供了兩個函數來創建進程。
1.fork()
fork()提供了創建進程的基本操作,可以說它是Linux系統多任務的基礎。該函數在/linux-3.18.6/kernel/fork.c。
2.exec系列函數
如果只有fork(),肯定是不完美的,因為fork()只能產生一個父進程的副本。而exec系列函數則可以幫助我們建立一個全新的進程。
在Linux系統中,一個進程的PCB是一個C語言的結構體task_struct來表示,而多個PCB之間是由一個雙向鏈表組織起來的,在《Understanding the Linux Kernel》中,則是進一步描
述這個鏈表是一個雙向迴圈鏈表。
在Linux中創建一個新進程的方法是使用fork函數,fork()執行一次但有兩個返回值。
在父進程中,返回值是子進程的進程號;在子進程中,返回值為0。因此可通過返回值來判斷當前進程是父進程還是子進程。
使用fork函數得到的子進程是父進程的一個複製品,它從父進程處複製了整個進程的地址空間,包括進程上下文,進程堆棧,記憶體信息,打開的文件描述符,信號控制設定,進程優先級,進程組號,當前工作目錄,根目錄,資源限制,控制終端等。而子進程所獨有的只是它的進程號,資源使用和計時器等。可以看出,使用fork函數的代價是很大的,它複製了父進程中的代碼段,數據段和堆棧段里的大部分內容,使得fork函數的執行速度並不快。(註:現代Linux內核的fork採用寫時複製(copy-on-write)技術,父子進程起初共享物理頁,只有在其中一方寫入時才真正複製該頁,因此實際開銷比上述描述小得多。)
創建一個進程,至少涉及的函數:
sys_clone, do_fork, dup_task_struct, copy_process, copy_thread, ret_from_fork
這只是圖中fork的一個分支
學習筆記
進程的描述
1.進程描述符task_struct數據結構(一)
為了管理進程,內核必須對每個進程進行清晰的描述,進程描述符提供了內核所需瞭解的進程信息。
- struct task_struct數據結構很龐大
- Linux進程的狀態與操作系統原理中描述的進程狀態似乎有所不同,比如就緒狀態和運行狀態都是TASK_RUNNING,為什麼呢?
- 進程的標識pid
- 所有進程鏈表struct list_head tasks; 內核的雙向迴圈鏈表的實現方法 - 一個更簡略的雙向迴圈鏈表
- 程式創建的進程具有父子關係,在編程時往往需要引用這樣的父子關係。進程描述符中有幾個域用來表示這樣的關係
- Linux為每個進程分配一個8KB大小的記憶體區域,用於存放該進程兩個不同的數據結構:thread_info和進程的內核堆棧
進程處於內核態時使用,不同於用戶態堆棧,即PCB中指定了內核棧,那為什麼PCB中沒有用戶態堆棧?用戶態堆棧是怎麼設定的?
內核控制路徑所用的堆棧很少,因此對棧和thread_info來說,8KB足夠了
- struct thread_struct thread; //CPU-specific state of this task
- 文件系統和文件描述符
- 記憶體管理——進程的地址空間
進程狀態的切換過程和原因大致如下圖:
雙向迴圈鏈表圖如下:
進程的父子關係直觀圖:
進程的創建
1.進程的創建概覽及fork一個進程的用戶態代碼
(1)進程的起源再回顧
- 道生一(start_kernel...cpu_idle)
- 一生二(kernel_init和kthreadd)
- 二生三(即前面的0、1、2三個進程)
- 三生萬物(1號進程是所有用戶態進程的祖先,2號進程是所有內核線程的祖先)
(2)0號進程手工寫,1號進程複製、載入init程式
(3)shell命令行是如何啟動進程的
fork一個子進程的代碼:
1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <unistd.h> 4 int main(int argc, char * argv[]) 5 { 6 int pid; 7 /* fork another process */ 8 pid = fork(); 9 if (pid < 0) 出錯處理 10 { 11 /* error occurred */ 12 fprintf(stderr,"Fork Failed!"); 13 exit(-1); 14 } 15 else if (pid == 0) 16 { 17 /* child process */ 子進程 pid=0時 if和else都會執行 fork系統調用在父進程和子進程各返回一次 18 printf("This is Child Process!\n"); 19 } 20 else 21 { 22 /* parent process */ 23 printf("This is Parent Process!\n");