static int
exec_mach_imgact(struct image_params *imgp)
{
struct mach_header *mach_header = (struct mach_header *)imgp->ip_vdata;
proc_t p = vfs_context_proc(imgp->ip_vfs_context);
int error = 0;
task_t task;
task_t new_task = NULL; /* protected by vfexec */
thread_t thread;
struct uthread *uthread;
vm_map_t old_map = VM_MAP_NULL;
vm_map_t map;
load_return_t lret;
load_result_t load_result;
struct _posix_spawnattr *psa = NULL;
int spawn = (imgp->ip_flags & IMGPF_SPAWN);
int vfexec = (imgp->ip_flags & IMGPF_VFORK_EXEC);
int p_name_len;
/*
* make sure it's a Mach-O 1.0 or Mach-O 2.0 binary; the difference
* is a reserved field on the end, so for the most part, we can
* treat them as if they were identical. Reverse-endian Mach-O
* binaries are recognized but not compatible.
*/
// Check the magic number in the header to confirm this is a Mach-O binary
if ((mach_header->magic == MH_CIGAM) ||
(mach_header->magic == MH_CIGAM_64)) {
error = EBADARCH;
goto bad;
}
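// A magic of MH_MAGIC or MH_MAGIC_64 is required from here on. Note the
// error = -1 below: it is not an errno; it tells the exec machinery this
// activator does not claim the image, so the next one (e.g. the
// interpreter-script handler) gets a chance.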
if ((mach_header->magic != MH_MAGIC) &&
(mach_header->magic != MH_MAGIC_64)) {
error = -1;
goto bad;
}
// Check the file type in the header; it must be an executable (MH_EXECUTE)
if (mach_header->filetype != MH_EXECUTE) {
error = -1;
goto bad;
}
// Check the Mach-O file's CPU type and subtype
if (imgp->ip_origcputype != 0) {
/* Fat header previously had an idea about this thin file */
if (imgp->ip_origcputype != mach_header->cputype ||
imgp->ip_origcpusubtype != mach_header->cpusubtype) {
error = EBADARCH;
goto bad;
}
} else {
imgp->ip_origcputype = mach_header->cputype;
imgp->ip_origcpusubtype = mach_header->cpusubtype;
}
task = current_task();
thread = current_thread();
uthread = get_bsdthread_info(thread);
if ((mach_header->cputype & CPU_ARCH_ABI64) == CPU_ARCH_ABI64)
imgp->ip_flags |= IMGPF_IS_64BIT;
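// CPU_ARCH_ABI64 in the cputype marks a 64-bit slice (e.g. CPU_TYPE_X86_64);
// IMGPF_IS_64BIT later drives task_set_64bit() and the P_LP64 process flag.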
/* If posix_spawn binprefs exist, respect those prefs. */
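// Binprefs are set with posix_spawnattr_setbinpref_np(); the first entry
// matching this image's cputype (or CPU_TYPE_ANY) wins.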
psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
if (psa != NULL && psa->psa_binprefs[0] != 0) {
int pr = 0;
for (pr = 0; pr < NBINPREFS; pr++) {
cpu_type_t pref = psa->psa_binprefs[pr];
if (pref == 0) {
/* No suitable arch in the pref list */
error = EBADARCH;
goto bad;
}
if (pref == CPU_TYPE_ANY) {
/* Jump to regular grading */
goto grade;
}
if (pref == imgp->ip_origcputype) {
/* We have a match! */
goto grade;
}
}
error = EBADARCH;
goto bad;
}
grade:
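// grade_binary() scores the (cputype, cpusubtype) pair against the running
// CPU; a grade of 0 means this slice cannot execute on this machine. The
// capability bits (CPU_SUBTYPE_MASK) are stripped before grading.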
if (!grade_binary(imgp->ip_origcputype, imgp->ip_origcpusubtype & ~CPU_SUBTYPE_MASK)) {
error = EBADARCH;
goto bad;
}
/* Copy in arguments/environment from the old process */
error = exec_extract_strings(imgp);
if (error)
goto bad;
error = exec_add_apple_strings(imgp);
if (error)
goto bad;
AUDIT_ARG(argv, imgp->ip_startargv, imgp->ip_argc,
imgp->ip_endargv - imgp->ip_startargv);
AUDIT_ARG(envv, imgp->ip_endargv, imgp->ip_envc,
imgp->ip_endenvv - imgp->ip_endargv);
/*
* We are being called to activate an image subsequent to a vfork()
* operation; in this case, we know that our task, thread, and
* uthread are actually those of our parent, and our proc, which we
* obtained indirectly from the image_params vfs_context_t, is the
* new child process.
*/
if (vfexec || spawn) {
if (vfexec) {
imgp->ip_new_thread = fork_create_child(task, NULL, p, FALSE, (imgp->ip_flags & IMGPF_IS_64BIT));
if (imgp->ip_new_thread == NULL) {
error = ENOMEM;
goto bad;
}
}
/* reset local idea of thread, uthread, task */
thread = imgp->ip_new_thread;
uthread = get_bsdthread_info(thread);
task = new_task = get_threadtask(thread);
map = get_task_map(task);
} else {
map = VM_MAP_NULL;
}
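// For plain exec and posix_spawn the map handed back by load_machfile() below
// is a freshly built one that is swapped onto the task later (note the
// !vfexec condition on swap_task_map()); only for vfexec is the child task's
// existing map populated in place.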
/*
* We set these flags here; this is OK, since if we fail after
* this point, we have already destroyed the parent process anyway.
*/
task_set_dyld_info(task, MACH_VM_MIN_ADDRESS, 0);
if (imgp->ip_flags & IMGPF_IS_64BIT) {
task_set_64bit(task, TRUE);
OSBitOrAtomic(P_LP64, &p->p_flag);
} else {
task_set_64bit(task, FALSE);
OSBitAndAtomic(~((uint32_t)P_LP64), &p->p_flag);
}
/*
* Load the Mach-O file.
*
* NOTE: An error after this point indicates we have potentially
* destroyed or overwritten some process state while attempting an
* execve() following a vfork(), which is an unrecoverable condition.
* We send the new process an immediate SIGKILL to avoid it executing
* any instructions in the mutated address space. For true spawns,
* this is not the case, and "too late" is still not too late to
* return an error code to the parent process.
*/
/*
* Actually load the image file we previously decided to load.
*/
// Map the Mach-O file into memory by calling load_machfile()
lret = load_machfile(imgp, mach_header, thread, &map, &load_result);
if (lret != LOAD_SUCCESS) {
error = load_return_to_errno(lret);
goto badtoolate;
}
proc_lock(p);
p->p_cputype = imgp->ip_origcputype;
p->p_cpusubtype = imgp->ip_origcpusubtype;
proc_unlock(p);
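// Carry the caller's mlock() ceiling (RLIMIT_MEMLOCK) over to the new map.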
vm_map_set_user_wire_limit(map, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
/*
* Set code-signing flags if this binary is signed, or if parent has
* requested them on exec.
*/
if (load_result.csflags & CS_VALID) {
imgp->ip_csflags |= load_result.csflags &
(CS_VALID|
CS_HARD|CS_KILL|CS_RESTRICT|CS_ENFORCEMENT|CS_REQUIRE_LV|CS_DYLD_PLATFORM|
CS_EXEC_SET_HARD|CS_EXEC_SET_KILL|CS_EXEC_SET_ENFORCEMENT);
} else {
imgp->ip_csflags &= ~CS_VALID;
}
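// A parent marked with CS_EXEC_SET_* forces the corresponding code-signing
// enforcement bits onto every image it execs.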
if (p->p_csflags & CS_EXEC_SET_HARD)
imgp->ip_csflags |= CS_HARD;
if (p->p_csflags & CS_EXEC_SET_KILL)
imgp->ip_csflags |= CS_KILL;
if (p->p_csflags & CS_EXEC_SET_ENFORCEMENT)
imgp->ip_csflags |= CS_ENFORCEMENT;
if (p->p_csflags & CS_EXEC_SET_INSTALLER)
imgp->ip_csflags |= CS_INSTALLER;
/*
* Set up the system reserved areas in the new address space.
*/
vm_map_exec(map, task, (void *)p->p_fd->fd_rdir, cpu_type());
/*
* Close file descriptors which specify close-on-exec.
*/
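// psa_flags may carry POSIX_SPAWN_CLOEXEC_DEFAULT, which treats every
// descriptor as close-on-exec unless the spawn file actions re-opened it.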
fdexec(p, psa != NULL ? psa->psa_flags : 0);
/*
* deal with set[ug]id.
*/
error = exec_handle_sugid(imgp);
if (error) {
if (spawn || !vfexec) {
vm_map_deallocate(map);
}
goto badtoolate;
}
/*
* Commit to new map.
*
* Swap the new map for the old, which consumes our new map reference
* but leaves us responsible for the old_map reference.  That lets us
* get off the pmap associated with it, and then we can release it.
*/
if (!vfexec) {
old_map = swap_task_map(task, thread, map, !spawn);
vm_map_deallocate(old_map);
}
lret = activate_thread_state(thread, &load_result);
if (lret != KERN_SUCCESS) {
goto badtoolate;
}
/*
* deal with voucher on exec-calling thread.
*/
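// Only when exec stays on the calling thread (no new thread was created)
// must that thread's adopted Mach voucher be dropped here.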
if (imgp->ip_new_thread == NULL)
thread_set_mach_voucher(current_thread(), IPC_VOUCHER_NULL);
/* Make sure we won't interrupt ourselves signalling a partial process */
if (!vfexec && !spawn && (p->p_lflag & P_LTRACED))
psignal(p, SIGTRAP);
if (load_result.unixproc &&
create_unix_stack(get_task_map(task),
&load_result,
p) != KERN_SUCCESS) {
error = load_return_to_errno(LOAD_NOSPACE);
goto badtoolate;
}
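// copyout() targets the current thread's address map; for vfexec/spawn the
// calling thread is still on the parent's map, so switch to the new task's
// map while the strings and the dyld slot are written out.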
if (vfexec || spawn) {
old_map = vm_map_switch(get_task_map(task));
}
if (load_result.unixproc) {
user_addr_t ap;
/*
* Copy the strings area out into the new process address
* space.
*/
ap = p->user_stack;
error = exec_copyout_strings(imgp, &ap);
if (error) {
if (vfexec || spawn)
vm_map_switch(old_map);
goto badtoolate;
}
/* Set the stack */
thread_setuserstack(thread, ap);
}
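// If the image needs dyld, reserve one pointer-sized slot on the stack and
// store the main executable's mach_header address there; dyld's entry code
// consumes it. all_image_info is the structure debuggers later use to
// enumerate loaded images.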
if (load_result.dynlinker) {
uint64_t ap;
int new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4;
/* Adjust the stack */
ap = thread_adjuserstack(thread, -new_ptr_size);
error = copyoutptr(load_result.mach_header, ap, new_ptr_size);
if (error) {
if (vfexec || spawn)
vm_map_switch(old_map);
goto badtoolate;
}
// dyld takes over the rest of the loading work from here
task_set_dyld_info(task, load_result.all_image_info_addr,
load_result.all_image_info_size);
}
/* Avoid immediate VM faults back into kernel */
exec_prefault_data(p, imgp, &load_result);
if (vfexec || spawn) {
vm_map_switch(old_map);
}
/* Stop profiling */
stopprofclock(p);
/*
* Reset signal state.
*/
execsigs(p, thread);
/*
* need to cancel async IO requests that can be cancelled and wait for those
* already active. MAY BLOCK!
*/
_aio_exec( p );
#if SYSV_SHM
/* FIXME: until vmspace inherit is fixed: */
if (!vfexec && p->vm_shm)
shmexec(p);
#endif
#if SYSV_SEM
/* Clean up the semaphores */
semexit(p);
#endif
/*
* Remember file name for accounting.
*/
p->p_acflag &= ~AFORK;
/*
* Set p->p_comm and p->p_name to the name passed to exec
*/
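// p_name keeps the longer form; p_comm is additionally truncated to
// MAXCOMLEN, which is what ps(1) and friends display.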
p_name_len = sizeof(p->p_name) - 1;
if (imgp->ip_ndp->ni_cnd.cn_namelen > p_name_len)
imgp->ip_ndp->ni_cnd.cn_namelen = p_name_len;
bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_name,
(unsigned)imgp->ip_ndp->ni_cnd.cn_namelen);
p->p_name[imgp->ip_ndp->ni_cnd.cn_namelen] = '\0';
if (imgp->ip_ndp->ni_cnd.cn_namelen > MAXCOMLEN)
imgp->ip_ndp->ni_cnd.cn_namelen = MAXCOMLEN;
bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_comm,
(unsigned)imgp->ip_ndp->ni_cnd.cn_namelen);
p->p_comm[imgp->ip_ndp->ni_cnd.cn_namelen] = '\0';
pal_dbg_set_task_name( p->task );
#if DEVELOPMENT || DEBUG
/*
* Update the pid and proc name for the importance base, if any
*/
task_importance_update_owner_info(p->task);
#endif
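// load_result.uuid was captured from the image's LC_UUID load command.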
memcpy(&p->p_uuid[0], &load_result.uuid[0], sizeof(p->p_uuid));
// <rdar://6598155> dtrace code cleanup needed
#if CONFIG_DTRACE
/*
* Invalidate any predicate evaluation already cached for this thread by DTrace.
* That's because we've just stored to p_comm and DTrace refers to that when it
* evaluates the "execname" special variable. uid and gid may have changed as well.
*/
dtrace_set_thread_predcache(current_thread(), 0);
/*
* Free any outstanding lazy dof entries. It is imperative we
* always call dtrace_lazy_dofs_destroy, rather than null check
* and call if !NULL. If we NULL test, during lazy dof faulting
* we can race with the faulting code and proceed from here to
* beyond the helpers cleanup. The lazy dof faulting will then
* install new helpers which no longer belong to this process!
*/
dtrace_lazy_dofs_destroy(p);
/*
* Clean up any DTrace helpers for the process.
*/
if (p->p_dtrace_helpers != NULL && dtrace_helpers_cleanup) {
(*dtrace_helpers_cleanup)(p);
}
/*
* Cleanup the DTrace provider associated with this process.
*/
proc_lock(p);
if (p->p_dtrace_probes && dtrace_fasttrap_exec_ptr) {
(*dtrace_fasttrap_exec_ptr)(p);
}
proc_unlock(p);
#endif
if (kdebug_enable) {
long dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4;
/*
* Collect the pathname for tracing
*/
kdbg_trace_string(p, &dbg_arg1, &dbg_arg2, &dbg_arg3, &dbg_arg4);
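// For vfexec/spawn the events are tagged explicitly with the new thread's
// tid, since the exec is not yet running on that thread.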
if (vfexec || spawn) {
KERNEL_DEBUG_CONSTANT1(TRACE_DATA_EXEC | DBG_FUNC_NONE,
p->p_pid, 0, 0, 0, (uintptr_t)thread_tid(thread));
KERNEL_DEBUG_CONSTANT1(TRACE_STRING_EXEC | DBG_FUNC_NONE,
dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, (uintptr_t)thread_tid(thread));
} else {
KERNEL_DEBUG_CONSTANT(TRACE_DATA_EXEC | DBG_FUNC_NONE,
p->p_pid, 0, 0, 0, 0);
KERNEL_DEBUG_CONSTANT(TRACE_STRING_EXEC | DBG_FUNC_NONE,
dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, 0);
}
}
/*
* If posix_spawned with the START_SUSPENDED flag, stop the
* process before it runs.
*/
if (imgp->ip_px_sa != NULL) {
psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
if (psa->psa_flags & POSIX_SPAWN_START_SUSPENDED) {
proc_lock(p);
p->p_stat = SSTOP;
proc_unlock(p);
(void) task_suspend_internal(p->task);
}
}
/*
* mark as execed, wakeup the process that vforked (if any) and tell
* it that it now has its own resources back
*/
OSBitOrAtomic(P_EXEC, &p->p_flag);
proc_resetregister(p);
if (p->p_pptr && (p->p_lflag & P_LPPWAIT)) {
proc_lock(p);
p->p_lflag &= ~P_LPPWAIT;
proc_unlock(p);
wakeup((caddr_t)p->p_pptr);
}
/*
* Pay for our earlier safety; deliver the delayed signals from
* the incomplete vfexec process now that it's complete.
*/
if (vfexec && (p->p_lflag & P_LTRACED)) {
psignal_vfork(p, new_task, thread, SIGTRAP);
}
goto done;
badtoolate:
/* Don't allow child process to execute any instructions */
if (!spawn) {
if (vfexec) {
psignal_vfork(p, new_task, thread, SIGKILL);
} else {
psignal(p, SIGKILL);
}
/* We can't stop this system call at this point, so just pretend we succeeded */
error = 0;
}
done:
if (!spawn) {
/* notify only if it has not failed due to FP Key error */
if ((p->p_lflag & P_LTERM_DECRYPTFAIL) == 0)
proc_knote(p, NOTE_EXEC);
}
/* Drop extra references for cases where we don't expect the caller to clean up */
if (vfexec || (spawn && error == 0)) {
task_deallocate(new_task);
thread_deallocate(thread);
}
if (load_result.threadstate) {
kfree(load_result.threadstate, load_result.threadstate_sz);
load_result.threadstate = NULL;
}
bad:
return(error);
}