Diffstat (limited to 'vm')
-rw-r--r--  vm/memory_object.c            | 1090
-rw-r--r--  vm/memory_object.h            |   39
-rw-r--r--  vm/memory_object_default.cli  |   28
-rw-r--r--  vm/memory_object_proxy.c      |  228
-rw-r--r--  vm/memory_object_proxy.h      |   39
-rw-r--r--  vm/memory_object_user.cli     |   28
-rw-r--r--  vm/pmap.h                     |  241
-rw-r--r--  vm/vm_debug.c                 |  548
-rw-r--r--  vm/vm_external.c              |  151
-rw-r--r--  vm/vm_external.h              |   95
-rw-r--r--  vm/vm_fault.c                 | 2136
-rw-r--r--  vm/vm_fault.h                 |   81
-rw-r--r--  vm/vm_init.c                  |   88
-rw-r--r--  vm/vm_init.h                  |   25
-rw-r--r--  vm/vm_kern.c                  | 1099
-rw-r--r--  vm/vm_kern.h                  |  100
-rw-r--r--  vm/vm_map.c                   | 5237
-rw-r--r--  vm/vm_map.h                   |  585
-rw-r--r--  vm/vm_object.c                | 2994
-rw-r--r--  vm/vm_object.h                |  415
-rw-r--r--  vm/vm_page.c                  | 2164
-rw-r--r--  vm/vm_page.h                  |  567
-rw-r--r--  vm/vm_pageout.c               |  515
-rw-r--r--  vm/vm_pageout.h               |   53
-rw-r--r--  vm/vm_print.h                 |   41
-rw-r--r--  vm/vm_resident.c              | 1116
-rw-r--r--  vm/vm_resident.h              |   45
-rw-r--r--  vm/vm_types.h                 |   42
-rw-r--r--  vm/vm_user.c                  |  803
-rw-r--r--  vm/vm_user.h                  |   60
30 files changed, 20653 insertions, 0 deletions
diff --git a/vm/memory_object.c b/vm/memory_object.c new file mode 100644 index 0000000..1ea5956 --- /dev/null +++ b/vm/memory_object.c @@ -0,0 +1,1090 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	File:	vm/memory_object.c + *	Author:	Michael Wayne Young + * + *	External memory management interface control functions. + */ + +/* + *	Interface dependencies: + */ + +#include <mach/std_types.h>	/* For pointer_t */ +#include <mach/mach_types.h> + +#include <mach/kern_return.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <mach/memory_object.h> +#include <mach/boolean.h> +#include <mach/vm_prot.h> +#include <mach/message.h> + +#include <vm/memory_object_user.user.h> +#include <vm/memory_object_default.user.h> + +/* + *	Implementation dependencies: + */ +#include <vm/memory_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> +#include <vm/pmap.h>		/* For copy_to_phys, pmap_clear_modify */ +#include <kern/debug.h>		/* For panic() */ +#include <kern/thread.h>		/* For current_thread() */ +#include <kern/host.h> +#include <kern/mach.server.h>		/* For rpc prototypes */ +#include <vm/vm_kern.h>		/* For kernel_map, vm_move */ +#include <vm/vm_map.h>		/* For vm_map_pageable */ +#include <ipc/ipc_port.h> + +#if	MACH_PAGEMAP +#include <vm/vm_external.h> +#endif	/* MACH_PAGEMAP */ + +typedef	int		memory_object_lock_result_t; /* moved from below */ + + +ipc_port_t	memory_manager_default = IP_NULL; +def_simple_lock_data(static,memory_manager_default_lock) + +/* + *	Important note: + *		All of these routines gain a reference to the + *		object (first argument) as part of the automatic + *		argument conversion. Explicit deallocation is necessary. 
+ */ + +kern_return_t memory_object_data_supply( +       vm_object_t		object, +	vm_offset_t		offset, +	vm_offset_t		vm_data_copy, +	unsigned int		data_cnt, +	vm_prot_t		lock_value, +	boolean_t		precious, +	ipc_port_t		reply_to, +	mach_msg_type_name_t	reply_to_type) +{ +	kern_return_t	result = KERN_SUCCESS; +	vm_offset_t	error_offset = 0; +	vm_page_t	m; +	vm_page_t	data_m; +	vm_size_t	original_length; +	vm_offset_t	original_offset; +	vm_page_t	*page_list; +	boolean_t	was_absent; +	vm_map_copy_t data_copy = (vm_map_copy_t)vm_data_copy; +	vm_map_copy_t	orig_copy = data_copy; + +	/* +	 *	Look for bogus arguments +	 */ + +	if (object == VM_OBJECT_NULL) { +		return(KERN_INVALID_ARGUMENT); +	} + +	if (lock_value & ~VM_PROT_ALL) { +		vm_object_deallocate(object); +		return(KERN_INVALID_ARGUMENT); +	} + +	if ((data_cnt % PAGE_SIZE) != 0) { +	    vm_object_deallocate(object); +	    return(KERN_INVALID_ARGUMENT); +	} + +	/* +	 *	Adjust the offset from the memory object to the offset +	 *	within the vm_object. +	 */ + +	original_length = data_cnt; +	original_offset = offset; + +	assert(data_copy->type == VM_MAP_COPY_PAGE_LIST); +	page_list = &data_copy->cpy_page_list[0]; + +	vm_object_lock(object); +	vm_object_paging_begin(object); +	offset -= object->paging_offset; + +	/* +	 *	Loop over copy stealing pages for pagein. +	 */ + +	for (; data_cnt > 0 ; data_cnt -= PAGE_SIZE, offset += PAGE_SIZE) { + +		assert(data_copy->cpy_npages > 0); +		data_m = *page_list; + +		if (data_m == VM_PAGE_NULL || data_m->tabled || +		    data_m->error || data_m->absent || data_m->fictitious) { + +			panic("Data_supply: bad page"); +		      } + +		/* +		 *	Look up target page and check its state. +		 */ + +retry_lookup: +		m = vm_page_lookup(object,offset); +		if (m == VM_PAGE_NULL) { +		    was_absent = FALSE; +		} +		else { +		    if (m->absent && m->busy) { + +			/* +			 *	Page was requested.  Free the busy +			 *	page waiting for it.  Insertion +			 *	of new page happens below. +			 */ + +			VM_PAGE_FREE(m); +			was_absent = TRUE; +		    } +		    else { + +			/* +			 *	Have to wait for page that is busy and +			 *	not absent.  This is probably going to +			 *	be an error, but go back and check. +			 */ +			if (m->busy) { +				PAGE_ASSERT_WAIT(m, FALSE); +				vm_object_unlock(object); +				thread_block((void (*)()) 0); +				vm_object_lock(object); +				goto retry_lookup; +			} + +			/* +			 *	Page already present; error. +			 *	This is an error if data is precious. +			 */ +			result = KERN_MEMORY_PRESENT; +			error_offset = offset + object->paging_offset; + +			break; +		    } +		} + +		/* +		 *	Ok to pagein page.  Target object now has no page +		 *	at offset.  Set the page parameters, then drop +		 *	in new page and set up pageout state.  Object is +		 *	still locked here. +		 * +		 *	Must clear busy bit in page before inserting it. +		 *	Ok to skip wakeup logic because nobody else +		 *	can possibly know about this page. +		 */ + +		data_m->busy = FALSE; +		data_m->dirty = FALSE; +		pmap_clear_modify(data_m->phys_addr); + +		data_m->page_lock = lock_value; +		data_m->unlock_request = VM_PROT_NONE; +		data_m->precious = precious; + +		vm_page_lock_queues(); +		vm_page_insert(data_m, object, offset); + +		if (was_absent) +			vm_page_activate(data_m); +		else +			vm_page_deactivate(data_m); + +		vm_page_unlock_queues(); + +		/* +		 *	Null out this page list entry, and advance to next +		 *	page. 
+		 */ + +		*page_list++ = VM_PAGE_NULL; + +		if (--(data_copy->cpy_npages) == 0 && +		    vm_map_copy_has_cont(data_copy)) { +			vm_map_copy_t	new_copy; + +			vm_object_unlock(object); + +			vm_map_copy_invoke_cont(data_copy, &new_copy, &result); + +			if (result == KERN_SUCCESS) { + +			    /* +			     *	Consume on success requires that +			     *	we keep the original vm_map_copy +			     *	around in case something fails. +			     *	Free the old copy if it's not the original +			     */ +			    if (data_copy != orig_copy) { +				vm_map_copy_discard(data_copy); +			    } + +			    if ((data_copy = new_copy) != VM_MAP_COPY_NULL) +				page_list = &data_copy->cpy_page_list[0]; + +			    vm_object_lock(object); +			} +			else { +			    vm_object_lock(object); +			    error_offset = offset + object->paging_offset + +						PAGE_SIZE; +			    break; +			} +		} +	} + +	/* +	 *	Send reply if one was requested. +	 */ +	vm_object_paging_end(object); +	vm_object_unlock(object); + +	if (vm_map_copy_has_cont(data_copy)) +		vm_map_copy_abort_cont(data_copy); + +	if (IP_VALID(reply_to)) { +		memory_object_supply_completed( +				reply_to, reply_to_type, +				object->pager_request, +				original_offset, +				original_length, +				result, +				error_offset); +	} + +	vm_object_deallocate(object); + +	/* +	 *	Consume on success:  The final data copy must be +	 *	be discarded if it is not the original.  The original +	 *	gets discarded only if this routine succeeds. +	 */ +	if (data_copy != orig_copy) +		vm_map_copy_discard(data_copy); +	if (result == KERN_SUCCESS) +		vm_map_copy_discard(orig_copy); + + +	return(result); +} + +kern_return_t memory_object_data_error( +	vm_object_t	object, +	vm_offset_t	offset, +	vm_size_t	size, +	kern_return_t	error_value) +{ +	if (object == VM_OBJECT_NULL) +		return(KERN_INVALID_ARGUMENT); + +	if (size != round_page(size)) +		return(KERN_INVALID_ARGUMENT); + +	vm_object_lock(object); +	offset -= object->paging_offset; + +	while (size != 0) { +		vm_page_t m; + +		m = vm_page_lookup(object, offset); +		if ((m != VM_PAGE_NULL) && m->busy && m->absent) { +			m->error = TRUE; +			m->absent = FALSE; +			vm_object_absent_release(object); + +			PAGE_WAKEUP_DONE(m); + +			vm_page_lock_queues(); +			vm_page_activate(m); +			vm_page_unlock_queues(); +		} + +		size -= PAGE_SIZE; +		offset += PAGE_SIZE; +	 } +	vm_object_unlock(object); + +	vm_object_deallocate(object); +	return(KERN_SUCCESS); +} + +kern_return_t memory_object_data_unavailable( +	vm_object_t	object, +	vm_offset_t	offset, +	vm_size_t	size) +{ +#if	MACH_PAGEMAP +	vm_external_t	existence_info = VM_EXTERNAL_NULL; +#endif	/* MACH_PAGEMAP */ + +	if (object == VM_OBJECT_NULL) +		return(KERN_INVALID_ARGUMENT); + +	if (size != round_page(size)) +		return(KERN_INVALID_ARGUMENT); + +#if	MACH_PAGEMAP +	if ((offset == 0) && (size > VM_EXTERNAL_LARGE_SIZE) && +	    (object->existence_info == VM_EXTERNAL_NULL)) { +		existence_info = vm_external_create(VM_EXTERNAL_SMALL_SIZE); +	} +#endif	/* MACH_PAGEMAP */ + +	vm_object_lock(object); +#if	MACH_PAGEMAP + 	if (existence_info != VM_EXTERNAL_NULL) { +		object->existence_info = existence_info; +	} +	if ((offset == 0) && (size > VM_EXTERNAL_LARGE_SIZE)) { +		vm_object_unlock(object); +		vm_object_deallocate(object); +		return(KERN_SUCCESS); +	} +#endif	/* MACH_PAGEMAP */ +	offset -= object->paging_offset; + +	while (size != 0) { +		vm_page_t m; + +		/* +		 *	We're looking for pages that are both busy and +		 *	absent (waiting to be filled), converting them +		 *	to just absent. 
+		 * +		 *	Pages that are just busy can be ignored entirely. +		 */ + +		m = vm_page_lookup(object, offset); +		if ((m != VM_PAGE_NULL) && m->busy && m->absent) { +			PAGE_WAKEUP_DONE(m); + +			vm_page_lock_queues(); +			vm_page_activate(m); +			vm_page_unlock_queues(); +		} +		size -= PAGE_SIZE; +		offset += PAGE_SIZE; +	} + +	vm_object_unlock(object); + +	vm_object_deallocate(object); +	return(KERN_SUCCESS); +} + +/* + *	Routine:	memory_object_lock_page + * + *	Description: + *		Perform the appropriate lock operations on the + *		given page.  See the description of + *		"memory_object_lock_request" for the meanings + *		of the arguments. + * + *		Returns an indication that the operation + *		completed, blocked, or that the page must + *		be cleaned. + */ + +#define	MEMORY_OBJECT_LOCK_RESULT_DONE		0 +#define	MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK	1 +#define	MEMORY_OBJECT_LOCK_RESULT_MUST_CLEAN	2 +#define	MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN	3 + +static memory_object_lock_result_t memory_object_lock_page( +	vm_page_t		m, +	memory_object_return_t	should_return, +	boolean_t		should_flush, +	vm_prot_t		prot) +{ +	/* +	 *	Don't worry about pages for which the kernel +	 *	does not have any data. +	 */ + +	if (m->absent) +		return(MEMORY_OBJECT_LOCK_RESULT_DONE); + +	/* +	 *	If we cannot change access to the page, +	 *	either because a mapping is in progress +	 *	(busy page) or because a mapping has been +	 *	wired, then give up. +	 */ + +	if (m->busy) +		return(MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK); + +	assert(!m->fictitious); + +	if (m->wire_count != 0) { +		/* +		 *	If no change would take place +		 *	anyway, return successfully. +		 * +		 *	No change means: +		 *		Not flushing AND +		 *		No change to page lock [2 checks]  AND +		 *		Don't need to send page to manager +		 * +		 *	Don't need to send page to manager means: +		 *		No clean or return request OR ( +		 *		    Page is not dirty [2 checks] AND ( +		 *		        Page is not precious OR +		 *			No request to return precious pages )) +		 * +		 *	Now isn't that straightforward and obvious ?? ;-) +		 * +		 * XXX	This doesn't handle sending a copy of a wired +		 * XXX	page to the pager, but that will require some +		 * XXX	significant surgery. +		 */ + +		if (!should_flush && +		    ((m->page_lock == prot) || (prot == VM_PROT_NO_CHANGE)) && +		    ((should_return == MEMORY_OBJECT_RETURN_NONE) || +		     (!m->dirty && !pmap_is_modified(m->phys_addr) && +		      (!m->precious || +		       should_return != MEMORY_OBJECT_RETURN_ALL)))) { +			/* +			 *	Restart page unlock requests, +			 *	even though no change took place. +			 *	[Memory managers may be expecting +			 *	to see new requests.] +			 */ +			m->unlock_request = VM_PROT_NONE; +			PAGE_WAKEUP(m); + +			return(MEMORY_OBJECT_LOCK_RESULT_DONE); +		} + +		return(MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK); +	} + +	/* +	 *	If the page is to be flushed, allow +	 *	that to be done as part of the protection. +	 */ + +	if (should_flush) +		prot = VM_PROT_ALL; + +	/* +	 *	Set the page lock. +	 * +	 *	If we are decreasing permission, do it now; +	 *	let the fault handler take care of increases +	 *	(pmap_page_protect may not increase protection). +	 */ + +	if (prot != VM_PROT_NO_CHANGE) { +		if ((m->page_lock ^ prot) & prot) { +			pmap_page_protect(m->phys_addr, VM_PROT_ALL & ~prot); +		} +		m->page_lock = prot; + +		/* +		 *	Restart any past unlock requests, even if no +		 *	change resulted.  
If the manager explicitly +		 *	requested no protection change, then it is assumed +		 *	to be remembering past requests. +		 */ + +		m->unlock_request = VM_PROT_NONE; +		PAGE_WAKEUP(m); +	} + +	/* +	 *	Handle cleaning. +	 */ + +	if (should_return != MEMORY_OBJECT_RETURN_NONE) { +		/* +		 *	Check whether the page is dirty.  If +		 *	write permission has not been removed, +		 *	this may have unpredictable results. +		 */ + +		if (!m->dirty) +			m->dirty = pmap_is_modified(m->phys_addr); + +		if (m->dirty || (m->precious && +				 should_return == MEMORY_OBJECT_RETURN_ALL)) { +			/* +			 *	If we weren't planning +			 *	to flush the page anyway, +			 *	we may need to remove the +			 *	page from the pageout +			 *	system and from physical +			 *	maps now. +			 */ + +			vm_page_lock_queues(); +			VM_PAGE_QUEUES_REMOVE(m); +			vm_page_unlock_queues(); + +			if (!should_flush) +				pmap_page_protect(m->phys_addr, +						VM_PROT_NONE); + +			/* +			 *	Cleaning a page will cause +			 *	it to be flushed. +			 */ + +			if (m->dirty) +				return(MEMORY_OBJECT_LOCK_RESULT_MUST_CLEAN); +			else +				return(MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN); +		} +	} + +	/* +	 *	Handle flushing +	 */ + +	if (should_flush) { +		VM_PAGE_FREE(m); +	} else { +		extern boolean_t vm_page_deactivate_hint; + +		/* +		 *	XXX Make clean but not flush a paging hint, +		 *	and deactivate the pages.  This is a hack +		 *	because it overloads flush/clean with +		 *	implementation-dependent meaning.  This only +		 *	happens to pages that are already clean. +		 */ + +		if (vm_page_deactivate_hint && +		    (should_return != MEMORY_OBJECT_RETURN_NONE)) { +			vm_page_lock_queues(); +			vm_page_deactivate(m); +			vm_page_unlock_queues(); +		} +	} + +	return(MEMORY_OBJECT_LOCK_RESULT_DONE); +} + +/* + *	Routine:	memory_object_lock_request [user interface] + * + *	Description: + *		Control use of the data associated with the given + *		memory object.  For each page in the given range, + *		perform the following operations, in order: + *			1)  restrict access to the page (disallow + *			    forms specified by "prot"); + *			2)  return data to the manager (if "should_return" + *			    is RETURN_DIRTY and the page is dirty, or + * 			    "should_return" is RETURN_ALL and the page + *			    is either dirty or precious); and, + *			3)  flush the cached copy (if "should_flush" + *			    is asserted). + *		The set of pages is defined by a starting offset + *		("offset") and size ("size").  Only pages with the + *		same page alignment as the starting offset are + *		considered. + * + *		A single acknowledgement is sent (to the "reply_to" + *		port) when these actions are complete.  If successful, + *		the naked send right for reply_to is consumed. + */ + +kern_return_t +memory_object_lock_request( +	vm_object_t		object, +	vm_offset_t		offset, +	vm_size_t		size, +	memory_object_return_t	should_return, +	boolean_t		should_flush, +	vm_prot_t		prot, +	ipc_port_t		reply_to, +	mach_msg_type_name_t	reply_to_type) +{ +	vm_page_t		m; +	vm_offset_t		original_offset = offset; +	vm_size_t		original_size = size; +	vm_offset_t		paging_offset = 0; +	vm_object_t		new_object = VM_OBJECT_NULL; +	vm_offset_t		new_offset = 0; +	vm_offset_t		last_offset = offset; +	int			page_lock_result; +	int			pageout_action = 0; /* '=0' to quiet lint */ + +#define	DATA_WRITE_MAX	32 +	vm_page_t		holding_pages[DATA_WRITE_MAX]; + +	/* +	 *	Check for bogus arguments. 
+	 */ +	if (object == VM_OBJECT_NULL || +		((prot & ~VM_PROT_ALL) != 0 && prot != VM_PROT_NO_CHANGE)) +	    return (KERN_INVALID_ARGUMENT); + +	size = round_page(size); + +	/* +	 *	Lock the object, and acquire a paging reference to +	 *	prevent the memory_object and control ports from +	 *	being destroyed. +	 */ + +	vm_object_lock(object); +	vm_object_paging_begin(object); +	offset -= object->paging_offset; + +	/* +	 *	To avoid blocking while scanning for pages, save +	 *	dirty pages to be cleaned all at once. +	 * +	 *	XXXO A similar strategy could be used to limit the +	 *	number of times that a scan must be restarted for +	 *	other reasons.  Those pages that would require blocking +	 *	could be temporarily collected in another list, or +	 *	their offsets could be recorded in a small array. +	 */ + +	/* +	 * XXX	NOTE: May want to consider converting this to a page list +	 * XXX	vm_map_copy interface.  Need to understand object +	 * XXX	coalescing implications before doing so. +	 */ + +#define	PAGEOUT_PAGES							\ +MACRO_BEGIN								\ +	vm_map_copy_t		copy;					\ +	unsigned		i;					\ +	vm_page_t		hp;					\ +									\ +	vm_object_unlock(object);					\ +									\ +	(void) vm_map_copyin_object(new_object, 0, new_offset, ©);	\ +									\ +	(void) memory_object_data_return(				\ +		object->pager,						\ +		object->pager_request,					\ +		paging_offset,						\ +		(pointer_t) copy,					\ +		new_offset,						\ +	     (pageout_action == MEMORY_OBJECT_LOCK_RESULT_MUST_CLEAN),	\ +		!should_flush);						\ +									\ +	vm_object_lock(object);						\ +									\ +	for (i = 0; i < atop(new_offset); i++) {			\ +	    hp = holding_pages[i];					\ +	    if (hp != VM_PAGE_NULL)					\ +		VM_PAGE_FREE(hp);					\ +	}								\ +									\ +	new_object = VM_OBJECT_NULL;					\ +MACRO_END + +	for (; +	     size != 0; +	     size -= PAGE_SIZE, offset += PAGE_SIZE) +	{ +	    /* +	     *	Limit the number of pages to be cleaned at once. +	     */ +	    if (new_object != VM_OBJECT_NULL && +		    new_offset >= PAGE_SIZE * DATA_WRITE_MAX) +	    { +		PAGEOUT_PAGES; +	    } + +	    while ((m = vm_page_lookup(object, offset)) != VM_PAGE_NULL) { +		switch ((page_lock_result = memory_object_lock_page(m, +					should_return, +					should_flush, +					prot))) +		{ +		    case MEMORY_OBJECT_LOCK_RESULT_DONE: +			/* +			 *	End of a cluster of dirty pages. +			 */ +			if (new_object != VM_OBJECT_NULL) { +			    PAGEOUT_PAGES; +			    continue; +			} +			break; + +		    case MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK: +			/* +			 *	Since it is necessary to block, +			 *	clean any dirty pages now. +			 */ +			if (new_object != VM_OBJECT_NULL) { +			    PAGEOUT_PAGES; +			    continue; +			} + +			PAGE_ASSERT_WAIT(m, FALSE); +			vm_object_unlock(object); +			thread_block((void (*)()) 0); +			vm_object_lock(object); +			continue; + +		    case MEMORY_OBJECT_LOCK_RESULT_MUST_CLEAN: +		    case MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN: +			/* +			 * The clean and return cases are similar. +			 * +			 * Mark the page busy since we unlock the +			 * object below. +			 */ +			m->busy = TRUE; + +			/* +			 * if this would form a discontiguous block, +			 * clean the old pages and start anew. +			 * +			 * NOTE: The first time through here, new_object +			 * is null, hiding the fact that pageout_action +			 * is not initialized. 
+			 */ +			if (new_object != VM_OBJECT_NULL && +			    (last_offset != offset || +			     pageout_action != page_lock_result)) { +			        PAGEOUT_PAGES; +			} + +			vm_object_unlock(object); + +			/* +			 *	If we have not already allocated an object +			 *	for a range of pages to be written, do so +			 *	now. +			 */ +			if (new_object == VM_OBJECT_NULL) { +			    new_object = vm_object_allocate(original_size); +			    new_offset = 0; +			    paging_offset = m->offset + +					object->paging_offset; +			    pageout_action = page_lock_result; +			} + +			/* +			 *	Move or copy the dirty page into the +			 *	new object. +			 */ +			m = vm_pageout_setup(m, +					m->offset + object->paging_offset, +					new_object, +					new_offset, +					should_flush); + +			/* +			 *	Save the holding page if there is one. +			 */ +			holding_pages[atop(new_offset)] = m; +			new_offset += PAGE_SIZE; +			last_offset = offset + PAGE_SIZE; + +			vm_object_lock(object); +			break; +		} +		break; +	    } +	} + +	/* +	 *	We have completed the scan for applicable pages. +	 *	Clean any pages that have been saved. +	 */ +	if (new_object != VM_OBJECT_NULL) { +	    PAGEOUT_PAGES; +	} + +	if (IP_VALID(reply_to)) { +		vm_object_unlock(object); + +		/* consumes our naked send-once/send right for reply_to */ +		(void) memory_object_lock_completed(reply_to, reply_to_type, +			object->pager_request, original_offset, original_size); + +		vm_object_lock(object); +	} + +	vm_object_paging_end(object); +	vm_object_unlock(object); +	vm_object_deallocate(object); + +	return (KERN_SUCCESS); +} + +static kern_return_t +memory_object_set_attributes_common( +	vm_object_t	object, +	boolean_t	may_cache, +	memory_object_copy_strategy_t copy_strategy) +{ +	if (object == VM_OBJECT_NULL) +		return(KERN_INVALID_ARGUMENT); + +	/* +	 *	Verify the attributes of importance +	 */ + +	switch(copy_strategy) { +		case MEMORY_OBJECT_COPY_NONE: +		case MEMORY_OBJECT_COPY_CALL: +		case MEMORY_OBJECT_COPY_DELAY: +		case MEMORY_OBJECT_COPY_TEMPORARY: +			break; +		default: +			vm_object_deallocate(object); +			return(KERN_INVALID_ARGUMENT); +	} + +	if (may_cache) +		may_cache = TRUE; + +	vm_object_lock(object); + +	/* +	 *	Wake up anyone waiting for the ready attribute +	 *	to become asserted. +	 */ + +	if (!object->pager_ready) { +		vm_object_wakeup(object, VM_OBJECT_EVENT_PAGER_READY); +	} + +	/* +	 *	Copy the attributes +	 */ + +	object->can_persist = may_cache; +	object->pager_ready = TRUE; +	if (copy_strategy == MEMORY_OBJECT_COPY_TEMPORARY) { +		object->temporary = TRUE; +	} else { +		object->copy_strategy = copy_strategy; +	} + +	vm_object_unlock(object); + +	vm_object_deallocate(object); + +	return(KERN_SUCCESS); +} + +/* + * XXX	rpd claims that reply_to could be obviated in favor of a client + * XXX	stub that made change_attributes an RPC.  Need investigation. + */ + +kern_return_t	memory_object_change_attributes( +	vm_object_t		object, +	boolean_t		may_cache, +	memory_object_copy_strategy_t copy_strategy, +	ipc_port_t		reply_to, +	mach_msg_type_name_t	reply_to_type) +{ +	kern_return_t	result; + +	/* +	 *	Do the work and throw away our object reference.  It +	 *	is important that the object reference be deallocated +	 *	BEFORE sending the reply.  The whole point of the reply +	 *	is that it shows up after the terminate message that +	 *	may be generated by setting the object uncacheable. +	 * +	 * XXX	may_cache may become a tri-valued variable to handle +	 * XXX	uncache if not in use. 
+	 */ +	result = memory_object_set_attributes_common(object, may_cache, +						     copy_strategy); + +	if (IP_VALID(reply_to)) { + +		/* consumes our naked send-once/send right for reply_to */ +		(void) memory_object_change_completed(reply_to, reply_to_type, +			may_cache, copy_strategy); + +	} + +	return(result); +} + +kern_return_t	memory_object_ready( +	vm_object_t	object, +	boolean_t	may_cache, +	memory_object_copy_strategy_t copy_strategy) +{ +	return memory_object_set_attributes_common(object, may_cache, +						   copy_strategy); +} + +kern_return_t	memory_object_get_attributes( +	vm_object_t	object, +	boolean_t	*object_ready, +	boolean_t	*may_cache, +	memory_object_copy_strategy_t *copy_strategy) +{ +	if (object == VM_OBJECT_NULL) +		return(KERN_INVALID_ARGUMENT); + +	vm_object_lock(object); +	*may_cache = object->can_persist; +	*object_ready = object->pager_ready; +	*copy_strategy = object->copy_strategy; +	vm_object_unlock(object); + +	vm_object_deallocate(object); + +	return(KERN_SUCCESS); +} + +/* + *	If successful, consumes the supplied naked send right. + */ +kern_return_t	vm_set_default_memory_manager( +		const host_t host, +		ipc_port_t *default_manager) +{ +	ipc_port_t current_manager; +	ipc_port_t new_manager; +	ipc_port_t returned_manager; + +	if (host == HOST_NULL) +		return(KERN_INVALID_HOST); + +	new_manager = *default_manager; +	simple_lock(&memory_manager_default_lock); +	current_manager = memory_manager_default; + +	if (new_manager == IP_NULL) { +		/* +		 *	Retrieve the current value. +		 */ + +		returned_manager = ipc_port_copy_send(current_manager); +	} else { +		/* +		 *	Retrieve the current value, +		 *	and replace it with the supplied value. +		 *	We consume the supplied naked send right. +		 */ + +		returned_manager = current_manager; +		memory_manager_default = new_manager; + +		/* +		 *	In case anyone's been waiting for a memory +		 *	manager to be established, wake them up. +		 */ + +		thread_wakeup((event_t) &memory_manager_default); +	} + +	simple_unlock(&memory_manager_default_lock); + +	*default_manager = returned_manager; +	return(KERN_SUCCESS); +} + +/* + *	Routine:	memory_manager_default_reference + *	Purpose: + *		Returns a naked send right for the default + *		memory manager.  The returned right is always + *		valid (not IP_NULL or IP_DEAD). + */ + +ipc_port_t	memory_manager_default_reference(void) +{ +	ipc_port_t current_manager; + +	simple_lock(&memory_manager_default_lock); + +	while (current_manager = ipc_port_copy_send(memory_manager_default), +	       !IP_VALID(current_manager)) { +		thread_sleep((event_t) &memory_manager_default, +			     simple_lock_addr(memory_manager_default_lock), +			     FALSE); +		simple_lock(&memory_manager_default_lock); +	} + +	simple_unlock(&memory_manager_default_lock); + +	return current_manager; +} + +/* + *	Routine:	memory_manager_default_port + *	Purpose: + *		Returns true if the receiver for the port + *		is the default memory manager. + * + *		This is a hack to let ds_read_done + *		know when it should keep memory wired. + */ + +boolean_t	memory_manager_default_port(const ipc_port_t port) +{ +	ipc_port_t current; +	boolean_t result; + +	simple_lock(&memory_manager_default_lock); +	current = memory_manager_default; +	if (IP_VALID(current)) { +		/* +		 *	There is no point in bothering to lock +		 *	both ports, which would be painful to do. +		 *	If the receive rights are moving around, +		 *	we might be inaccurate. 
+		 */ + +		result = port->ip_receiver == current->ip_receiver; +	} else +		result = FALSE; +	simple_unlock(&memory_manager_default_lock); + +	return result; +} + +void		memory_manager_default_init(void) +{ +	memory_manager_default = IP_NULL; +	simple_lock_init(&memory_manager_default_lock); +} diff --git a/vm/memory_object.h b/vm/memory_object.h new file mode 100644 index 0000000..ee0c963 --- /dev/null +++ b/vm/memory_object.h @@ -0,0 +1,39 @@ +/* + * Mach Operating System + * Copyright (c) 1991 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS + * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +#ifndef	_VM_MEMORY_OBJECT_H_ +#define	_VM_MEMORY_OBJECT_H_ + +#include <mach/boolean.h> +#include <ipc/ipc_types.h> + +extern ipc_port_t memory_manager_default_reference(void); +extern boolean_t memory_manager_default_port(ipc_port_t); +extern void memory_manager_default_init(void); + +extern ipc_port_t memory_manager_default; + +#endif	/* _VM_MEMORY_OBJECT_H_ */ diff --git a/vm/memory_object_default.cli b/vm/memory_object_default.cli new file mode 100644 index 0000000..998a986 --- /dev/null +++ b/vm/memory_object_default.cli @@ -0,0 +1,28 @@ +/*  + * Copyright (c) 1994 The University of Utah and + * the Computer Systems Laboratory at the University of Utah (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software is hereby + * granted provided that (1) source code retains these copyright, permission, + * and disclaimer notices, and (2) redistributions including binaries + * reproduce the notices in supporting documentation, and (3) all advertising + * materials mentioning features or use of this software display the following + * acknowledgement: ``This product includes software developed by the + * Computer Systems Laboratory at the University of Utah.'' + * + * THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS + * IS" CONDITION.  THE UNIVERSITY OF UTAH AND CSL DISCLAIM ANY LIABILITY OF + * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * CSL requests users of this software to return to csl-dist@cs.utah.edu any + * improvements that they make and grant CSL redistribution rights. + * + *      Author: Bryan Ford, University of Utah CSL + */ +/* This is a client presentation file.  */ + +#define KERNEL_USER 1 +#define SEQNOS 1 + +#include <mach/memory_object_default.defs> diff --git a/vm/memory_object_proxy.c b/vm/memory_object_proxy.c new file mode 100644 index 0000000..5724349 --- /dev/null +++ b/vm/memory_object_proxy.c @@ -0,0 +1,228 @@ +/* memory_object_proxy.c - Proxy memory objects for Mach. 
+   Copyright (C) 2005 Free Software Foundation, Inc. +   Written by Marcus Brinkmann. + +   This file is part of GNU Mach. + +   GNU Mach is free software; you can redistribute it and/or modify it +   under the terms of the GNU General Public License as published by +   the Free Software Foundation; either version 2, or (at your option) +   any later version. + +   GNU Mach is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program; if not, write to the Free Software +   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA. */ + +/* A proxy memory object is a kernel port that can be used like a real +   memory object in a vm_map call, except that the current and maximum +   protection are restricted to the proxy object's maximum protection +   at the time the mapping is established.  The kernel port will hold +   a reference to the real memory object for the life time of the +   proxy object. + +   Note that we don't need to do any reference counting on the proxy +   object.  Our caller will hold a reference to the proxy object when +   looking it up, and is expected to acquire its own reference to the +   real memory object if needed before releasing the reference to the +   proxy object. + +   The user provided real memory object and the maximum protection are +   not checked for validity.  The maximum protection is only used as a +   mask, and the memory object is validated at the time the mapping is +   established.  */ + +#include <mach/port.h> +#include <mach/kern_return.h> +#include <mach/notify.h> +#include <mach/vm_prot.h> +#include <kern/printf.h> +#include <kern/slab.h> +#include <kern/mach4.server.h> +#include <ipc/ipc_port.h> +#include <ipc/ipc_space.h> + +#include <vm/memory_object_proxy.h> + +/* The cache which holds our proxy memory objects.  */ +static struct kmem_cache memory_object_proxy_cache; + +struct memory_object_proxy +{ +  struct ipc_port *port; + +  ipc_port_t object; +  ipc_port_t notify; +  vm_prot_t max_protection; +  vm_offset_t start; +  vm_offset_t len; +}; +typedef struct memory_object_proxy *memory_object_proxy_t; + + +void +memory_object_proxy_init (void) +{ +  kmem_cache_init (&memory_object_proxy_cache, "memory_object_proxy", +		   sizeof (struct memory_object_proxy), 0, NULL, 0); +} + +/* Lookup a proxy memory object by its port.  */ +static memory_object_proxy_t +memory_object_proxy_port_lookup (ipc_port_t port) +{ +  memory_object_proxy_t proxy; + +  if (!IP_VALID(port)) +    return 0; + +  ip_lock (port); +  if (ip_active (port) && (ip_kotype (port) == IKOT_PAGER_PROXY)) +    proxy = (memory_object_proxy_t) port->ip_kobject; +  else +    proxy = 0; +  ip_unlock (port); +  return proxy; +} + + +/* Process a no-sender notification for the proxy memory object +   port.  
*/ +boolean_t +memory_object_proxy_notify (mach_msg_header_t *msg) +{ +  if (msg->msgh_id == MACH_NOTIFY_NO_SENDERS) +    { +      memory_object_proxy_t proxy; +      mach_no_senders_notification_t *ns; + +      ns = (mach_no_senders_notification_t *) msg; + +      proxy = (memory_object_proxy_t) +	      ((ipc_port_t) ns->not_header.msgh_remote_port)->ip_kobject; +      if (!proxy) +	return FALSE; +      if ((ipc_port_t) ns->not_header.msgh_remote_port != proxy->notify) +	return FALSE; + +      ipc_port_release_send (proxy->object); + +      ipc_kobject_set (proxy->port, IKO_NULL, IKOT_NONE); +      ipc_port_dealloc_kernel (proxy->port); +      ipc_kobject_set (proxy->notify, IKO_NULL, IKOT_NONE); +      ipc_port_dealloc_kernel (proxy->notify); + +      kmem_cache_free (&memory_object_proxy_cache, (vm_offset_t) proxy); + +      return TRUE; +    } + +  printf ("memory_object_proxy_notify: strange notification %d\n", +	  msg->msgh_id); +  return FALSE; +} + + +/* Create a new proxy memory object from [START;START+LEN) in the +   given OBJECT at OFFSET in the new object with the maximum +   protection MAX_PROTECTION and return it in *PORT.  */ +kern_return_t +memory_object_create_proxy (ipc_space_t space, vm_prot_t max_protection, +			    ipc_port_t *object, natural_t object_count, +			    rpc_vm_offset_t *offset, natural_t offset_count, +			    rpc_vm_offset_t *start, natural_t start_count, +			    rpc_vm_size_t *len, natural_t len_count, +			    ipc_port_t *port) +{ +  memory_object_proxy_t proxy; +  ipc_port_t notify; + +  if (space == IS_NULL) +    return KERN_INVALID_TASK; + +  if (offset_count != object_count || start_count != object_count +      || len_count != object_count) +    return KERN_INVALID_ARGUMENT; + +  /* FIXME: Support more than one memory object.  */ +  if (object_count != 1) +    return KERN_INVALID_ARGUMENT; + +  if (!IP_VALID(object[0])) +    return KERN_INVALID_NAME; + +  /* FIXME: Support a different offset from 0.  */ +  if (offset[0] != 0) +    return KERN_INVALID_ARGUMENT; + +  if (start[0] + len[0] < start[0]) +    return KERN_INVALID_ARGUMENT; + +  proxy = (memory_object_proxy_t) kmem_cache_alloc (&memory_object_proxy_cache); + +  /* Allocate port, keeping a reference for it.  */ +  proxy->port = ipc_port_alloc_kernel (); +  if (proxy->port == IP_NULL) +    { +      kmem_cache_free (&memory_object_proxy_cache, (vm_offset_t) proxy); +      return KERN_RESOURCE_SHORTAGE; +    } +  /* Associate the port with the proxy memory object.  */ +  ipc_kobject_set (proxy->port, (ipc_kobject_t) proxy, IKOT_PAGER_PROXY); + +  /* Request no-senders notifications on the port.  */ +  proxy->notify = ipc_port_alloc_kernel (); +  ipc_kobject_set (proxy->notify, (ipc_kobject_t) proxy, IKOT_PAGER_PROXY); +  notify = ipc_port_make_sonce (proxy->notify); +  ip_lock (proxy->port); +  ipc_port_nsrequest (proxy->port, 1, notify, ¬ify); +  assert (notify == IP_NULL); + +  /* Consumes the port right */ +  proxy->object = object[0]; +  proxy->max_protection = max_protection; +  proxy->start = start[0]; +  proxy->len = len[0]; + +  *port = ipc_port_make_send (proxy->port); +  return KERN_SUCCESS; +} + +/* Lookup the real memory object and maximum protection for the proxy +   memory object port PORT, for which the caller holds a reference. +   *OBJECT is only guaranteed to be valid as long as the caller holds +   the reference to PORT (unless the caller acquires its own reference +   to it).  If PORT is not a proxy memory object, return +   KERN_INVALID_ARGUMENT.  
*/ +kern_return_t +memory_object_proxy_lookup (ipc_port_t port, ipc_port_t *object, +			    vm_prot_t *max_protection, vm_offset_t *start, +			    vm_offset_t *len) +{ +  memory_object_proxy_t proxy; + +  proxy = memory_object_proxy_port_lookup (port); +  if (!proxy) +    return KERN_INVALID_ARGUMENT; + +  *max_protection = proxy->max_protection; +  *start = 0; +  *len = (vm_offset_t) ~0; + +  do +    { +      *object = proxy->object; +      if (proxy->len <= *start) +	*len = 0; +      else +	*len = MIN(*len, proxy->len - *start); +      *start += proxy->start; +    } +  while ((proxy = memory_object_proxy_port_lookup (proxy->object))); + +  return KERN_SUCCESS; +} diff --git a/vm/memory_object_proxy.h b/vm/memory_object_proxy.h new file mode 100644 index 0000000..8b3f202 --- /dev/null +++ b/vm/memory_object_proxy.h @@ -0,0 +1,39 @@ +/* memory_object_proxy.h - Proxy memory objects for Mach. +   Copyright (C) 2005, 2011 Free Software Foundation, Inc. +   Written by Marcus Brinkmann. + +   This file is part of GNU Mach. + +   GNU Mach is free software; you can redistribute it and/or modify it +   under the terms of the GNU General Public License as published by +   the Free Software Foundation; either version 2, or (at your option) +   any later version. + +   GNU Mach is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program; if not, write to the Free Software +   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA. */ + +#ifndef _VM_MEMORY_OBJECT_PROXY_H_ +#define _VM_MEMORY_OBJECT_PROXY_H_ + +#include <ipc/ipc_types.h> +#include <mach/boolean.h> +#include <mach/machine/kern_return.h> +#include <mach/machine/vm_types.h> +#include <mach/message.h> +#include <mach/vm_prot.h> + +extern void memory_object_proxy_init (void); +extern boolean_t memory_object_proxy_notify (mach_msg_header_t *msg); +extern kern_return_t memory_object_proxy_lookup (ipc_port_t port, +                                                 ipc_port_t *object, +                                                 vm_prot_t *max_protection, +                                                 vm_offset_t *start, +                                                 vm_offset_t *len); + +#endif /* _VM_MEMORY_OBJECT_PROXY_H_ */ diff --git a/vm/memory_object_user.cli b/vm/memory_object_user.cli new file mode 100644 index 0000000..2bba41f --- /dev/null +++ b/vm/memory_object_user.cli @@ -0,0 +1,28 @@ +/*  + * Copyright (c) 1994 The University of Utah and + * the Computer Systems Laboratory at the University of Utah (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software is hereby + * granted provided that (1) source code retains these copyright, permission, + * and disclaimer notices, and (2) redistributions including binaries + * reproduce the notices in supporting documentation, and (3) all advertising + * materials mentioning features or use of this software display the following + * acknowledgement: ``This product includes software developed by the + * Computer Systems Laboratory at the University of Utah.'' + * + * THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS + * IS" CONDITION.  
THE UNIVERSITY OF UTAH AND CSL DISCLAIM ANY LIABILITY OF + * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * CSL requests users of this software to return to csl-dist@cs.utah.edu any + * improvements that they make and grant CSL redistribution rights. + * + *      Author: Bryan Ford, University of Utah CSL + */ +/* This is a client presentation file.  */ + +#define KERNEL_USER 1 +#define SEQNOS 1 + +#include <mach/memory_object.defs> diff --git a/vm/pmap.h b/vm/pmap.h new file mode 100644 index 0000000..aca9ada --- /dev/null +++ b/vm/pmap.h @@ -0,0 +1,241 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	File:	vm/pmap.h + *	Author:	Avadis Tevanian, Jr. + *	Date:	1985 + * + *	Machine address mapping definitions -- machine-independent + *	section.  [For machine-dependent section, see "machine/pmap.h".] + */ + +#ifndef	_VM_PMAP_H_ +#define _VM_PMAP_H_ + +#include <machine/pmap.h> +#include <mach/machine/vm_types.h> +#include <mach/vm_prot.h> +#include <mach/boolean.h> +#include <kern/thread.h> + +/* + *	The following is a description of the interface to the + *	machine-dependent "physical map" data structure.  The module + *	must provide a "pmap_t" data type that represents the + *	set of valid virtual-to-physical addresses for one user + *	address space.  [The kernel address space is represented + *	by a distinguished "pmap_t".]  The routines described manage + *	this type, install and update virtual-to-physical mappings, + *	and perform operations on physical addresses common to + *	many address spaces. + */ + +/* + *	Routines used for initialization. + *	There is traditionally also a pmap_bootstrap, + *	used very early by machine-dependent code, + *	but it is not part of the interface. + */ + +/* During VM initialization, steal a chunk of memory.  */ +extern vm_offset_t	pmap_steal_memory(vm_size_t); +/* Initialization, after kernel runs in virtual memory.  */ +extern void		pmap_init(void); + +#ifndef	MACHINE_PAGES +/* + *	If machine/pmap.h defines MACHINE_PAGES, it must implement + *	the above functions.  The pmap module has complete control. + *	Otherwise, it must implement + *		pmap_virtual_space + *		pmap_init + *	and vm/vm_resident.c implements pmap_steal_memory using + *	pmap_virtual_space and pmap_enter. + */ + +/* During VM initialization, report virtual space available for the kernel.  
*/ +extern void		pmap_virtual_space(vm_offset_t *, vm_offset_t *); +#endif	/* MACHINE_PAGES */ + +/* + *	Routines to manage the physical map data structure. + */ + +/* Create a pmap_t. */ +pmap_t pmap_create(vm_size_t size); + +/* Return the kernel's pmap_t. */ +#ifndef pmap_kernel +extern pmap_t pmap_kernel(void); +#endif /* pmap_kernel */ + +/* Gain and release a reference. */ +extern void pmap_reference(pmap_t pmap); +extern void pmap_destroy(pmap_t pmap); + +/* Enter a mapping */ +extern void pmap_enter(pmap_t pmap, vm_offset_t va, phys_addr_t pa, +		       vm_prot_t prot, boolean_t wired); + + +/* + *	Routines that operate on ranges of virtual addresses. + */ + +/* Remove mappings. */ +void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); + +/* Change protections. */ +void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot); + +/* + *	Routines to set up hardware state for physical maps to be used. + */ +/* Prepare pmap_t to run on a given processor.  */ +extern void		pmap_activate(pmap_t, thread_t, int); +/* Release pmap_t from use on processor.  */ +extern void		pmap_deactivate(pmap_t, thread_t, int); + + +/* + *	Routines that operate on physical addresses. + */ + +/* Restrict access to page. */ +void pmap_page_protect(phys_addr_t pa, vm_prot_t prot); + +/* + *	Routines to manage reference/modify bits based on + *	physical addresses, simulating them if not provided + *	by the hardware. + */ + +/* Clear reference bit */ +void pmap_clear_reference(phys_addr_t pa); + +/* Return reference bit */ +#ifndef pmap_is_referenced +boolean_t pmap_is_referenced(phys_addr_t pa); +#endif /* pmap_is_referenced */ + +/* Clear modify bit */ +void pmap_clear_modify(phys_addr_t pa); + +/* Return modify bit */ +boolean_t pmap_is_modified(phys_addr_t pa); + +/* + *	Sundry required routines + */ +/* Return a virtual-to-physical mapping, if possible.  */ +extern phys_addr_t	pmap_extract(pmap_t, vm_offset_t); +/* Perform garbage collection, if any.  */ +extern void		pmap_collect(pmap_t); + +/* Lookup an address.  */ +int pmap_whatis(pmap_t, vm_offset_t); + +/* Specify pageability.  */ +extern void		pmap_change_wiring(pmap_t, vm_offset_t, boolean_t); + +/* + *	Optional routines + */ +#ifndef	pmap_copy +/* Copy range of mappings, if desired.  */ +extern void		pmap_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, +				  vm_offset_t); +#endif	/* pmap_copy */ +#ifndef pmap_attribute +/* Get/Set special memory attributes.  */ +extern kern_return_t	pmap_attribute(void); +#endif	/* pmap_attribute */ + +/* + *	Grab a physical page: + *	the standard memory allocation mechanism + *	during system initialization. + */ +extern vm_offset_t pmap_grab_page (void); + +/* + *      Make the specified pages (by pmap, offset) + *      pageable (or not) as requested. + */ +extern void pmap_pageable( +    pmap_t      pmap, +    vm_offset_t start, +    vm_offset_t end, +    boolean_t   pageable); + +/* + *      Back-door routine for mapping kernel VM at initialization. + *      Useful for mapping memory outside the range of direct mapped + *      physical memory (i.e., devices). + */ +extern vm_offset_t pmap_map_bd( +        vm_offset_t virt, +        phys_addr_t start, +        phys_addr_t end, +        vm_prot_t prot); + +/* + * Routines defined as macros. 
+ */ +#ifndef	PMAP_ACTIVATE_USER +#define	PMAP_ACTIVATE_USER(pmap, thread, cpu) {		\ +	if ((pmap) != kernel_pmap)			\ +	    PMAP_ACTIVATE(pmap, thread, cpu);		\ +} +#endif	/* PMAP_ACTIVATE_USER */ + +#ifndef	PMAP_DEACTIVATE_USER +#define	PMAP_DEACTIVATE_USER(pmap, thread, cpu) {	\ +	if ((pmap) != kernel_pmap)			\ +	    PMAP_DEACTIVATE(pmap, thread, cpu);		\ +} +#endif	/* PMAP_DEACTIVATE_USER */ + +#ifndef	PMAP_ACTIVATE_KERNEL +#define	PMAP_ACTIVATE_KERNEL(cpu)			\ +		PMAP_ACTIVATE(kernel_pmap, THREAD_NULL, cpu) +#endif	/* PMAP_ACTIVATE_KERNEL */ + +#ifndef	PMAP_DEACTIVATE_KERNEL +#define	PMAP_DEACTIVATE_KERNEL(cpu)			\ +		PMAP_DEACTIVATE(kernel_pmap, THREAD_NULL, cpu) +#endif	/* PMAP_DEACTIVATE_KERNEL */ + +/* + *	Exported data structures + */ + +extern pmap_t	kernel_pmap;			/* The kernel's map */ + +#endif	/* _VM_PMAP_H_ */ diff --git a/vm/vm_debug.c b/vm/vm_debug.c new file mode 100644 index 0000000..b0dace8 --- /dev/null +++ b/vm/vm_debug.c @@ -0,0 +1,548 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	File:	vm/vm_debug.c. + *	Author:	Rich Draves + *	Date:	March, 1990 + * + *	Exported kernel calls.  See mach_debug/mach_debug.defs. + */ + +#include <string.h> + +#include <kern/debug.h> +#include <kern/thread.h> +#include <mach/kern_return.h> +#include <mach/machine/vm_types.h> +#include <mach/memory_object.h> +#include <mach/vm_prot.h> +#include <mach/vm_inherit.h> +#include <mach/vm_param.h> +#include <mach_debug/vm_info.h> +#include <mach_debug/hash_info.h> +#include <vm/vm_map.h> +#include <vm/vm_kern.h> +#include <vm/vm_object.h> +#include <kern/mach_debug.server.h> +#include <kern/task.h> +#include <kern/host.h> +#include <kern/printf.h> +#include <ipc/ipc_port.h> + + +#if MACH_VM_DEBUG + +/* + *	Routine:	vm_object_real_name + *	Purpose: + *		Convert a VM object to a name port. + *	Conditions: + *		Takes object and port locks. + *	Returns: + *		A naked send right for the object's name port, + *		or IP_NULL if the object or its name port is null. + */ + +static ipc_port_t +vm_object_real_name(vm_object_t object) +{ +	ipc_port_t port = IP_NULL; + +	if (object != VM_OBJECT_NULL) { +		vm_object_lock(object); +		if (object->pager_name != IP_NULL) +			port = ipc_port_make_send(object->pager_name); +		vm_object_unlock(object); +	} + +	return port; +} + +/* + *	Routine:	mach_vm_region_info [kernel call] + *	Purpose: + *		Retrieve information about a VM region, + *		including info about the object chain. + *	Conditions: + *		Nothing locked. 
+ *	Returns: + *		KERN_SUCCESS		Retrieve region/object info. + *		KERN_INVALID_TASK	The map is null. + *		KERN_NO_SPACE		There is no entry at/after the address. + */ + +kern_return_t +mach_vm_region_info( +	vm_map_t 		map, +	vm_offset_t 		address, +	vm_region_info_t 	*regionp, +	ipc_port_t 		*portp) +{ +	vm_map_t cmap;		/* current map in traversal */ +	vm_map_t nmap;		/* next map to look at */ +	vm_map_entry_t entry;	/* entry in current map */ +	vm_object_t object; + +	if (map == VM_MAP_NULL) +		return KERN_INVALID_TASK; + +	/* find the entry containing (or following) the address */ + +	vm_map_lock_read(map); +	for (cmap = map;;) { +		/* cmap is read-locked */ + +		if (!vm_map_lookup_entry(cmap, address, &entry)) { +			entry = entry->vme_next; +			if (entry == vm_map_to_entry(cmap)) { +				if (map == cmap) { +					vm_map_unlock_read(cmap); +					return KERN_NO_SPACE; +				} + +				/* back out to top-level & skip this submap */ + +				address = vm_map_max(cmap); +				vm_map_unlock_read(cmap); +				vm_map_lock_read(map); +				cmap = map; +				continue; +			} +		} + +		if (entry->is_sub_map) { +			/* move down to the sub map */ + +			nmap = entry->object.sub_map; +			vm_map_lock_read(nmap); +			vm_map_unlock_read(cmap); +			cmap = nmap; +			continue; +		} else { +			break; +		} +		/*NOTREACHED*/ +	} + + +	assert(entry->vme_start < entry->vme_end); + +	regionp->vri_start = entry->vme_start; +	regionp->vri_end = entry->vme_end; + +	/* attributes from the real entry */ + +	regionp->vri_protection = entry->protection; +	regionp->vri_max_protection = entry->max_protection; +	regionp->vri_inheritance = entry->inheritance; +	regionp->vri_wired_count = !!entry->wired_count; /* Doesn't stack */ +	regionp->vri_user_wired_count = regionp->vri_wired_count; /* Obsolete */ + +	object = entry->object.vm_object; +	*portp = vm_object_real_name(object); +	regionp->vri_object = (vm_offset_t) object; +	regionp->vri_offset = entry->offset; +	regionp->vri_needs_copy = entry->needs_copy; + +	regionp->vri_sharing = entry->is_shared; + +	vm_map_unlock_read(cmap); +	return KERN_SUCCESS; +} + +/* + *	Routine:	mach_vm_object_info [kernel call] + *	Purpose: + *		Retrieve information about a VM object. + *	Conditions: + *		Nothing locked. + *	Returns: + *		KERN_SUCCESS		Retrieved object info. + *		KERN_INVALID_ARGUMENT	The object is null. + */ + +kern_return_t +mach_vm_object_info( +	vm_object_t 		object, +	vm_object_info_t 	*infop, +	ipc_port_t 		*shadowp, +	ipc_port_t 		*copyp) +{ +	vm_object_info_t info; +	vm_object_info_state_t state; +	ipc_port_t shadow, copy; + +	if (object == VM_OBJECT_NULL) +		return KERN_INVALID_ARGUMENT; + +	/* +	 *	Because of lock-ordering/deadlock considerations, +	 *	we can't use vm_object_real_name for the copy object. 
+	 */ + +    retry: +	vm_object_lock(object); +	copy = IP_NULL; +	if (object->copy != VM_OBJECT_NULL) { +		if (!vm_object_lock_try(object->copy)) { +			vm_object_unlock(object); +			simple_lock_pause();	/* wait a bit */ +			goto retry; +		} + +		if (object->copy->pager_name != IP_NULL) +			copy = ipc_port_make_send(object->copy->pager_name); +		vm_object_unlock(object->copy); +	} +	shadow = vm_object_real_name(object->shadow); + +	info.voi_object = (vm_offset_t) object; +	info.voi_pagesize = PAGE_SIZE; +	info.voi_size = object->size; +	info.voi_ref_count = object->ref_count; +	info.voi_resident_page_count = object->resident_page_count; +	info.voi_absent_count = object->absent_count; +	info.voi_copy = (vm_offset_t) object->copy; +	info.voi_shadow = (vm_offset_t) object->shadow; +	info.voi_shadow_offset = object->shadow_offset; +	info.voi_paging_offset = object->paging_offset; +	info.voi_copy_strategy = object->copy_strategy; +	info.voi_last_alloc = object->last_alloc; +	info.voi_paging_in_progress = object->paging_in_progress; + +	state = 0; +	if (object->pager_created) +		state |= VOI_STATE_PAGER_CREATED; +	if (object->pager_initialized) +		state |= VOI_STATE_PAGER_INITIALIZED; +	if (object->pager_ready) +		state |= VOI_STATE_PAGER_READY; +	if (object->can_persist) +		state |= VOI_STATE_CAN_PERSIST; +	if (object->internal) +		state |= VOI_STATE_INTERNAL; +	if (object->temporary) +		state |= VOI_STATE_TEMPORARY; +	if (object->alive) +		state |= VOI_STATE_ALIVE; +	if (object->lock_in_progress) +		state |= VOI_STATE_LOCK_IN_PROGRESS; +	if (object->lock_restart) +		state |= VOI_STATE_LOCK_RESTART; +	info.voi_state = state; +	vm_object_unlock(object); + +	*infop = info; +	*shadowp = shadow; +	*copyp = copy; +	return KERN_SUCCESS; +} + +#define VPI_STATE_NODATA	(VPI_STATE_BUSY|VPI_STATE_FICTITIOUS| \ +				 VPI_STATE_PRIVATE|VPI_STATE_ABSENT) + +/* + *	Routine:	mach_vm_object_pages/mach_vm_object_pages_phys/ [kernel call] + *	Purpose: + *		Retrieve information about the pages in a VM object. + *	Conditions: + *		Nothing locked.  Obeys CountInOut protocol. + *	Returns: + *		KERN_SUCCESS		Retrieved object info. + *		KERN_INVALID_ARGUMENT	The object is null. + *		KERN_RESOURCE_SHORTAGE	Couldn't allocate memory. 
+ */ + +static kern_return_t +_mach_vm_object_pages( +	vm_object_t 		object, +	void*		 	*pagesp, +	natural_t 		*countp, +	int			phys) +{ +	vm_size_t size; +	vm_offset_t addr; +	void *pages; +	unsigned int potential, actual, count; +	vm_page_t p; +	kern_return_t kr; + +	if (object == VM_OBJECT_NULL) +		return KERN_INVALID_ARGUMENT; + +	/* start with in-line memory */ + +	pages = *pagesp; +	potential = *countp; + +	for (size = 0;;) { +		vm_object_lock(object); +		actual = object->resident_page_count; +		if (actual <= potential) +			break; +		vm_object_unlock(object); + +		if (pages != *pagesp) +			kmem_free(ipc_kernel_map, addr, size); + +		if (phys) +			size = round_page(actual * sizeof(vm_page_phys_info_t)); +		else +			size = round_page(actual * sizeof(vm_page_info_t)); +		kr = kmem_alloc(ipc_kernel_map, &addr, size); +		if (kr != KERN_SUCCESS) +			return kr; + +		pages = (void *) addr; +		if (phys) +			potential = size / sizeof(vm_page_phys_info_t); +		else +			potential = size / sizeof(vm_page_info_t); +	} +	/* object is locked, we have enough wired memory */ + +	count = 0; +	queue_iterate(&object->memq, p, vm_page_t, listq) { +		vm_page_info_t *info = NULL; +		vm_page_phys_info_t *info_phys = NULL; + +		if (phys) +			info_phys = pages + count * sizeof(*info_phys); +		else +			info = pages + count * sizeof(*info); +		count++; + +		vm_page_info_state_t state = 0; + +		if (phys) { +			info_phys->vpi_offset = p->offset; +			if (p->phys_addr != (typeof(info_phys->vpi_phys_addr)) p->phys_addr) +				printf("warning: physical address overflow in mach_vm_object_pages!!\n"); +			info_phys->vpi_phys_addr = p->phys_addr; +			info_phys->vpi_wire_count = p->wire_count; +			info_phys->vpi_page_lock = p->page_lock; +			info_phys->vpi_unlock_request = p->unlock_request; +		} else { +			info->vpi_offset = p->offset; +			if (p->phys_addr != (typeof(info->vpi_phys_addr)) p->phys_addr) +				printf("warning: physical address overflow in mach_vm_object_pages!!\n"); +			info->vpi_phys_addr = p->phys_addr; +			info->vpi_wire_count = p->wire_count; +			info->vpi_page_lock = p->page_lock; +			info->vpi_unlock_request = p->unlock_request; +		} + +		if (p->busy) +			state |= VPI_STATE_BUSY; +		if (p->wanted) +			state |= VPI_STATE_WANTED; +		if (p->tabled) +			state |= VPI_STATE_TABLED; +		if (p->fictitious) +			state |= VPI_STATE_FICTITIOUS; +		if (p->private) +			state |= VPI_STATE_PRIVATE; +		if (p->absent) +			state |= VPI_STATE_ABSENT; +		if (p->error) +			state |= VPI_STATE_ERROR; +		if (p->dirty) +			state |= VPI_STATE_DIRTY; +		if (p->precious) +			state |= VPI_STATE_PRECIOUS; +		if (p->overwriting) +			state |= VPI_STATE_OVERWRITING; + +		if (((state & (VPI_STATE_NODATA|VPI_STATE_DIRTY)) == 0) && +		    pmap_is_modified(p->phys_addr)) { +			state |= VPI_STATE_DIRTY; +			p->dirty = TRUE; +		} + +		vm_page_lock_queues(); +		if (p->inactive) +			state |= VPI_STATE_INACTIVE; +		if (p->active) +			state |= VPI_STATE_ACTIVE; +		if (p->laundry) +			state |= VPI_STATE_LAUNDRY; +		if (p->free) +			state |= VPI_STATE_FREE; +		if (p->reference) +			state |= VPI_STATE_REFERENCE; + +		if (((state & (VPI_STATE_NODATA|VPI_STATE_REFERENCE)) == 0) && +		    pmap_is_referenced(p->phys_addr)) { +			state |= VPI_STATE_REFERENCE; +			p->reference = TRUE; +		} +		vm_page_unlock_queues(); + +		if (phys) +			info_phys->vpi_state = state; +		else +			info->vpi_state = state; +	} + +	if (object->resident_page_count != count) +		panic("mach_vm_object_pages"); +	vm_object_unlock(object); + +	if (pages == *pagesp) { +		/* data fit 
in-line; nothing to deallocate */ + +		*countp = actual; +	} else if (actual == 0) { +		kmem_free(ipc_kernel_map, addr, size); + +		*countp = 0; +	} else { +		vm_size_t size_used, rsize_used; +		vm_map_copy_t copy; + +		/* kmem_alloc doesn't zero memory */ + +		if (phys) +			size_used = actual * sizeof(vm_page_phys_info_t); +		else +			size_used = actual * sizeof(vm_page_info_t); +		rsize_used = round_page(size_used); + +		if (rsize_used != size) +			kmem_free(ipc_kernel_map, +				  addr + rsize_used, size - rsize_used); + +		if (size_used != rsize_used) +			memset((void *) (addr + size_used), 0, +			       rsize_used - size_used); + +		kr = vm_map_copyin(ipc_kernel_map, addr, rsize_used, +				   TRUE, ©); +		assert(kr == KERN_SUCCESS); + +		*pagesp = (void *) copy; +		*countp = actual; +	} + +	return KERN_SUCCESS; +} + +kern_return_t +mach_vm_object_pages( +	vm_object_t 		object, +	vm_page_info_array_t 	*pagesp, +	natural_t 		*countp) +{ +	return _mach_vm_object_pages(object, (void**) pagesp, countp, 0); +} + +kern_return_t +mach_vm_object_pages_phys( +	vm_object_t 			object, +	vm_page_phys_info_array_t 	*pagesp, +	natural_t 			*countp) +{ +	return _mach_vm_object_pages(object, (void**) pagesp, countp, 1); +} + +#endif	/* MACH_VM_DEBUG */ + +/* + *	Routine:	host_virtual_physical_table_info + *	Purpose: + *		Return information about the VP table. + *	Conditions: + *		Nothing locked.  Obeys CountInOut protocol. + *	Returns: + *		KERN_SUCCESS		Returned information. + *		KERN_INVALID_HOST	The host is null. + *		KERN_RESOURCE_SHORTAGE	Couldn't allocate memory. + */ + +kern_return_t +host_virtual_physical_table_info(const host_t host, +		hash_info_bucket_array_t *infop, natural_t *countp) +{ +	vm_offset_t addr; +	vm_size_t size = 0;/* '=0' to quiet gcc warnings */ +	hash_info_bucket_t *info; +	unsigned int potential, actual; +	kern_return_t kr; + +	if (host == HOST_NULL) +		return KERN_INVALID_HOST; + +	/* start with in-line data */ + +	info = *infop; +	potential = *countp; + +	for (;;) { +		actual = vm_page_info(info, potential); +		if (actual <= potential) +			break; + +		/* allocate more memory */ + +		if (info != *infop) +			kmem_free(ipc_kernel_map, addr, size); + +		size = round_page(actual * sizeof *info); +		kr = kmem_alloc_pageable(ipc_kernel_map, &addr, size); +		if (kr != KERN_SUCCESS) +			return KERN_RESOURCE_SHORTAGE; + +		info = (hash_info_bucket_t *) addr; +		potential = size/sizeof *info; +	} + +	if (info == *infop) { +		/* data fit in-line; nothing to deallocate */ + +		*countp = actual; +	} else if (actual == 0) { +		kmem_free(ipc_kernel_map, addr, size); + +		*countp = 0; +	} else { +		vm_map_copy_t copy; +		vm_size_t used; + +		used = round_page(actual * sizeof *info); + +		if (used != size) +			kmem_free(ipc_kernel_map, addr + used, size - used); + +		kr = vm_map_copyin(ipc_kernel_map, addr, used, +				   TRUE, ©); +		assert(kr == KERN_SUCCESS); + +		*infop = (hash_info_bucket_t *) copy; +		*countp = actual; +	} + +	return KERN_SUCCESS; +} diff --git a/vm/vm_external.c b/vm/vm_external.c new file mode 100644 index 0000000..99f4b9c --- /dev/null +++ b/vm/vm_external.c @@ -0,0 +1,151 @@ +/*  + * Mach Operating System + * Copyright (c) 1991,1990,1989 Carnegie Mellon University + * All Rights Reserved. 
+ *  + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + *  + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + *  + * Carnegie Mellon requests users of this software to return to + *  + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + *  + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	This module maintains information about the presence of + *	pages not in memory.  Since an external memory object + *	must maintain a complete knowledge of its contents, this + *	information takes the form of hints. + */ + +#include <mach/boolean.h> +#include <kern/slab.h> +#include <vm/vm_external.h> +#include <mach/vm_param.h> +#include <kern/assert.h> +#include <string.h> + + + +boolean_t	vm_external_unsafe = FALSE; + +struct kmem_cache	vm_external_cache; + +/* + *	The implementation uses bit arrays to record whether + *	a page has been written to external storage.  For + *	convenience, these bit arrays come in two sizes + *	(measured in bytes). + */ + +#define		SMALL_SIZE	(VM_EXTERNAL_SMALL_SIZE/8) +#define		LARGE_SIZE	(VM_EXTERNAL_LARGE_SIZE/8) + +struct kmem_cache	vm_object_small_existence_map_cache; +struct kmem_cache	vm_object_large_existence_map_cache; + + +vm_external_t	vm_external_create(vm_offset_t size) +{ +	vm_external_t	result; +	vm_size_t	bytes; +	 +	result = (vm_external_t) kmem_cache_alloc(&vm_external_cache); +	result->existence_map = (char *) 0; + +	bytes = (atop(size) + 07) >> 3; +	if (bytes <= SMALL_SIZE) { +		result->existence_map = +		 (char *) kmem_cache_alloc(&vm_object_small_existence_map_cache); +		result->existence_size = SMALL_SIZE; +	} else { +		result->existence_map = +		 (char *) kmem_cache_alloc(&vm_object_large_existence_map_cache); +		result->existence_size = LARGE_SIZE; +	} +	memset (result->existence_map, 0, result->existence_size); +	return(result); +} + +void		vm_external_destroy(vm_external_t e) +{ +	if (e == VM_EXTERNAL_NULL) +		return; + +	if (e->existence_map != (char *) 0) { +		if (e->existence_size <= SMALL_SIZE) { +			kmem_cache_free(&vm_object_small_existence_map_cache, +				(vm_offset_t) e->existence_map); +		} else { +			kmem_cache_free(&vm_object_large_existence_map_cache, +				(vm_offset_t) e->existence_map); +		} +	} +	kmem_cache_free(&vm_external_cache, (vm_offset_t) e); +} + +vm_external_state_t _vm_external_state_get(const vm_external_t	e, +	vm_offset_t		offset) +{ +	unsigned +	int		bit, byte; + +	if (vm_external_unsafe || +	    (e == VM_EXTERNAL_NULL) || +	    (e->existence_map == (char *) 0)) +		return(VM_EXTERNAL_STATE_UNKNOWN); + +	bit = atop(offset); +	byte = bit >> 3; +	if (byte >= e->existence_size) return (VM_EXTERNAL_STATE_UNKNOWN); +	return( (e->existence_map[byte] & (1 << (bit & 07))) ? 
+		VM_EXTERNAL_STATE_EXISTS : VM_EXTERNAL_STATE_ABSENT ); +} + +void		vm_external_state_set( +	vm_external_t		e, +	vm_offset_t		offset, +	vm_external_state_t 	state) +{ +	unsigned +	int		bit, byte; + +	if ((e == VM_EXTERNAL_NULL) || (e->existence_map == (char *) 0)) +		return; + +	if (state != VM_EXTERNAL_STATE_EXISTS) +		return; + +	bit = atop(offset); +	byte = bit >> 3; +	if (byte >= e->existence_size) return; +	e->existence_map[byte] |= (1 << (bit & 07)); +} + +void		vm_external_module_initialize(void) +{ +	vm_size_t	size = (vm_size_t) sizeof(struct vm_external); + +	kmem_cache_init(&vm_external_cache, "vm_external", size, 0, +			NULL, 0); + +	kmem_cache_init(&vm_object_small_existence_map_cache, +			"small_existence_map", SMALL_SIZE, 0, +			NULL, 0); + +	kmem_cache_init(&vm_object_large_existence_map_cache, +			"large_existence_map", LARGE_SIZE, 0, +			NULL, 0); +} diff --git a/vm/vm_external.h b/vm/vm_external.h new file mode 100644 index 0000000..4e44ddf --- /dev/null +++ b/vm/vm_external.h @@ -0,0 +1,95 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ + +#ifndef	_VM_VM_EXTERNAL_H_ +#define _VM_VM_EXTERNAL_H_ + +/* + *	External page management hint technology + * + *	The data structure exported by this module maintains + *	a (potentially incomplete) map of the pages written + *	to external storage for a range of virtual memory. + */ + +/* + *	The data structure representing the state of pages + *	on external storage. + */ + +typedef struct vm_external { +    	int		existence_size;	/* Size of the following bitmap */ +	char		*existence_map;	/* A bitmap of pages that have +					 * been written to backing +					 * storage. +					 */ +#if 0 +	/* XXX: Currently, existence_count is not used.  I guess it +	   could be useful to get rid of the map if the count drops to +	   zero.  */ +	int		existence_count;/* Number of bits turned on in +					 * existence_map. +					 */ +#endif +} *vm_external_t; + +#define	VM_EXTERNAL_NULL	((vm_external_t) 0) + +#define VM_EXTERNAL_SMALL_SIZE	128 +#define VM_EXTERNAL_LARGE_SIZE	8192 + +/* + *	The states that may be recorded for a page of external storage. + */ + +typedef int	vm_external_state_t; +#define	VM_EXTERNAL_STATE_EXISTS		1 +#define	VM_EXTERNAL_STATE_UNKNOWN		2 +#define	VM_EXTERNAL_STATE_ABSENT		3 + + +/* + *	Routines exported by this module. 
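+ *
+ *	Typical use, as an illustrative sketch (variable names are
+ *	made up; in the kernel the map usually hangs off a VM object
+ *	as its existence info):
+ *
+ *		vm_external_t	e = vm_external_create(object_size);
+ *
+ *		(when a page is pushed to backing store:)
+ *		vm_external_state_set(e, offset, VM_EXTERNAL_STATE_EXISTS);
+ *
+ *		(when deciding whether to ask the pager for a page:)
+ *		if (vm_external_state_get(e, offset) !=
+ *				VM_EXTERNAL_STATE_ABSENT)
+ *			... ask the pager ...
+ *
+ *		vm_external_destroy(e);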
+ */ + +/* Initialize the module */ +extern void		vm_external_module_initialize(void); +/* Create a vm_external_t */ +extern vm_external_t	vm_external_create(vm_offset_t); +/* Destroy one */ +extern void vm_external_destroy(vm_external_t); + +/* Set state of a page.  */ +extern void		vm_external_state_set(vm_external_t, vm_offset_t, +					      vm_external_state_t); +/* Retrieve the state for a given page, if known.  */ +#define	vm_external_state_get(e,offset)	(((e) != VM_EXTERNAL_NULL) ? \ +					  _vm_external_state_get(e, offset) : \ +					  VM_EXTERNAL_STATE_UNKNOWN) +/* HIDDEN routine */ +extern vm_external_state_t _vm_external_state_get(vm_external_t, vm_offset_t); + +#endif	/* _VM_VM_EXTERNAL_H_ */ diff --git a/vm/vm_fault.c b/vm/vm_fault.c new file mode 100644 index 0000000..c6e2800 --- /dev/null +++ b/vm/vm_fault.c @@ -0,0 +1,2136 @@ +/* + * Mach Operating System + * Copyright (c) 1994,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	File:	vm_fault.c + *	Author:	Avadis Tevanian, Jr., Michael Wayne Young + * + *	Page fault handling module. + */ + +#include <kern/printf.h> +#include <vm/vm_fault.h> +#include <mach/kern_return.h> +#include <mach/message.h>	/* for error codes */ +#include <kern/counters.h> +#include <kern/debug.h> +#include <kern/thread.h> +#include <kern/sched_prim.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/pmap.h> +#include <mach/vm_statistics.h> +#include <vm/vm_pageout.h> +#include <mach/vm_param.h> +#include <mach/memory_object.h> +#include <vm/memory_object_user.user.h> +				/* For memory_object_data_{request,unlock} */ +#include <kern/macros.h> +#include <kern/slab.h> + +#if	MACH_PCSAMPLE +#include <kern/pc_sample.h> +#endif + + + +/* + *	State needed by vm_fault_continue. + *	This is a little hefty to drop directly + *	into the thread structure. 
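+ *	It is therefore allocated from vm_fault_state_cache when a
+ *	continuation is supplied, and reached through
+ *	current_thread()->ith_other (see vm_fault and vm_fault_page).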
+ */ +typedef struct vm_fault_state { +	struct vm_map *vmf_map; +	vm_offset_t vmf_vaddr; +	vm_prot_t vmf_fault_type; +	boolean_t vmf_change_wiring; +	vm_fault_continuation_t vmf_continuation; +	vm_map_version_t vmf_version; +	boolean_t vmf_wired; +	struct vm_object *vmf_object; +	vm_offset_t vmf_offset; +	vm_prot_t vmf_prot; + +	boolean_t vmfp_backoff; +	struct vm_object *vmfp_object; +	vm_offset_t vmfp_offset; +	struct vm_page *vmfp_first_m; +	vm_prot_t vmfp_access; +} vm_fault_state_t; + +struct kmem_cache	vm_fault_state_cache; + +int		vm_object_absent_max = 50; + +boolean_t	vm_fault_dirty_handling = FALSE; +boolean_t	vm_fault_interruptible = TRUE; + +boolean_t	software_reference_bits = TRUE; + +#if	MACH_KDB +extern struct db_watchpoint *db_watchpoint_list; +#endif	/* MACH_KDB */ + +/* + *	Routine:	vm_fault_init + *	Purpose: + *		Initialize our private data structures. + */ +void vm_fault_init(void) +{ +	kmem_cache_init(&vm_fault_state_cache, "vm_fault_state", +			sizeof(vm_fault_state_t), 0, NULL, 0); +} + +/* + *	Routine:	vm_fault_cleanup + *	Purpose: + *		Clean up the result of vm_fault_page. + *	Results: + *		The paging reference for "object" is released. + *		"object" is unlocked. + *		If "top_page" is not null,  "top_page" is + *		freed and the paging reference for the object + *		containing it is released. + * + *	In/out conditions: + *		"object" must be locked. + */ +void +vm_fault_cleanup( +	vm_object_t	object, +	vm_page_t	top_page) +{ +	vm_object_paging_end(object); +	vm_object_unlock(object); + +	if (top_page != VM_PAGE_NULL) { +	    object = top_page->object; +	    vm_object_lock(object); +	    VM_PAGE_FREE(top_page); +	    vm_object_paging_end(object); +	    vm_object_unlock(object); +	} +} + + +#if	MACH_PCSAMPLE +/* + *	Do PC sampling on current thread, assuming + *	that it is the thread taking this page fault. + * + *	Must check for THREAD_NULL, since faults + *	can occur before threads are running. + */ + +#define	vm_stat_sample(flavor) \ +    MACRO_BEGIN \ +      thread_t _thread_ = current_thread(); \ + \ +      if (_thread_ != THREAD_NULL) \ +	  take_pc_sample_macro(_thread_, (flavor), 1, 0); \ +    MACRO_END + +#else +#define	vm_stat_sample(x) +#endif	/* MACH_PCSAMPLE */ + + + +/* + *	Routine:	vm_fault_page + *	Purpose: + *		Find the resident page for the virtual memory + *		specified by the given virtual memory object + *		and offset. + *	Additional arguments: + *		The required permissions for the page is given + *		in "fault_type".  Desired permissions are included + *		in "protection". + * + *		If the desired page is known to be resident (for + *		example, because it was previously wired down), asserting + *		the "unwiring" parameter will speed the search. + * + *		If the operation can be interrupted (by thread_abort + *		or thread_terminate), then the "interruptible" + *		parameter should be asserted. + * + *	Results: + *		The page containing the proper data is returned + *		in "result_page". + * + *	In/out conditions: + *		The source object must be locked and referenced, + *		and must donate one paging reference.  The reference + *		is not affected.  The paging reference and lock are + *		consumed. + * + *		If the call succeeds, the object in which "result_page" + *		resides is left locked and holding a paging reference. + *		If this is not the original object, a busy page in the + *		original object is returned in "top_page", to prevent other + *		callers from pursuing this same data, along with a paging + *		reference for the original object.  
The "top_page" should + *		be destroyed when this guarantee is no longer required. + *		The "result_page" is also left busy.  It is not removed + *		from the pageout queues. + */ +vm_fault_return_t vm_fault_page( + /* Arguments: */ +	vm_object_t	first_object,	/* Object to begin search */ +	vm_offset_t	first_offset,	/* Offset into object */ +	vm_prot_t	fault_type,	/* What access is requested */ +	boolean_t	must_be_resident,/* Must page be resident? */ +	boolean_t	interruptible,	/* May fault be interrupted? */ + /* Modifies in place: */ +	vm_prot_t	*protection,	/* Protection for mapping */ + /* Returns: */ +	vm_page_t	*result_page,	/* Page found, if successful */ +	vm_page_t	*top_page,	/* Page in top object, if +					 * not result_page. +					 */ + /* More arguments: */ +	boolean_t	resume,		/* We are restarting. */ +	continuation_t	continuation) 	/* Continuation for blocking. */ +{ +	vm_page_t	m; +	vm_object_t	object; +	vm_offset_t	offset; +	vm_page_t	first_m; +	vm_object_t	next_object; +	vm_object_t	copy_object; +	boolean_t	look_for_page; +	vm_prot_t	access_required; + +	if (resume) { +		vm_fault_state_t *state = +			(vm_fault_state_t *) current_thread()->ith_other; + +		if (state->vmfp_backoff) +			goto after_block_and_backoff; + +		object = state->vmfp_object; +		offset = state->vmfp_offset; +		first_m = state->vmfp_first_m; +		access_required = state->vmfp_access; +		goto after_thread_block; +	} + +	vm_stat_sample(SAMPLED_PC_VM_FAULTS_ANY); +	vm_stat.faults++;		/* needs lock XXX */ +	current_task()->faults++; + +/* + *	Recovery actions + */ +#define RELEASE_PAGE(m)					\ +	MACRO_BEGIN					\ +	PAGE_WAKEUP_DONE(m);				\ +	vm_page_lock_queues();				\ +	if (!m->active && !m->inactive)			\ +		vm_page_activate(m);			\ +	vm_page_unlock_queues();			\ +	MACRO_END + +	if (vm_fault_dirty_handling +#if	MACH_KDB +		/* +		 *	If there are watchpoints set, then +		 *	we don't want to give away write permission +		 *	on a read fault.  Make the task write fault, +		 *	so that the watchpoint code notices the access. +		 */ +	    || db_watchpoint_list +#endif	/* MACH_KDB */ +	    ) { +		/* +		 *	If we aren't asking for write permission, +		 *	then don't give it away.  We're using write +		 *	faults to set the dirty bit. +		 */ +		if (!(fault_type & VM_PROT_WRITE)) +			*protection &= ~VM_PROT_WRITE; +	} + +	if (!vm_fault_interruptible) +		interruptible = FALSE; + +	/* +	 *	INVARIANTS (through entire routine): +	 * +	 *	1)	At all times, we must either have the object +	 *		lock or a busy page in some object to prevent +	 *		some other thread from trying to bring in +	 *		the same page. +	 * +	 *		Note that we cannot hold any locks during the +	 *		pager access or when waiting for memory, so +	 *		we use a busy page then. +	 * +	 *		Note also that we aren't as concerned about more than +	 *		one thread attempting to memory_object_data_unlock +	 *		the same page at once, so we don't hold the page +	 *		as busy then, but do record the highest unlock +	 *		value so far.  [Unlock requests may also be delivered +	 *		out of order.] +	 * +	 *	2)	To prevent another thread from racing us down the +	 *		shadow chain and entering a new page in the top +	 *		object before we do, we must keep a busy page in +	 *		the top object while following the shadow chain. +	 * +	 *	3)	We must increment paging_in_progress on any object +	 *		for which we have a busy page, to prevent +	 *		vm_object_collapse from removing the busy page +	 *		without our noticing. +	 * +	 *	4)	We leave busy pages on the pageout queues. 
+	 *		If the pageout daemon comes across a busy page, +	 *		it will remove the page from the pageout queues. +	 */ + +	/* +	 *	Search for the page at object/offset. +	 */ + +	object = first_object; +	offset = first_offset; +	first_m = VM_PAGE_NULL; +	access_required = fault_type; + +	/* +	 *	See whether this page is resident +	 */ + +	while (TRUE) { +		m = vm_page_lookup(object, offset); +		if (m != VM_PAGE_NULL) { +			/* +			 *	If the page is being brought in, +			 *	wait for it and then retry. +			 * +			 *	A possible optimization: if the page +			 *	is known to be resident, we can ignore +			 *	pages that are absent (regardless of +			 *	whether they're busy). +			 */ + +			if (m->busy) { +				kern_return_t	wait_result; + +				PAGE_ASSERT_WAIT(m, interruptible); +				vm_object_unlock(object); +				if (continuation != thread_no_continuation) { +					vm_fault_state_t *state = +						(vm_fault_state_t *) current_thread()->ith_other; + +					/* +					 *	Save variables in case +					 *	thread_block discards +					 *	our kernel stack. +					 */ + +					state->vmfp_backoff = FALSE; +					state->vmfp_object = object; +					state->vmfp_offset = offset; +					state->vmfp_first_m = first_m; +					state->vmfp_access = +						access_required; +					state->vmf_prot = *protection; + +					counter(c_vm_fault_page_block_busy_user++); +					thread_block(continuation); +				} else +				{ +					counter(c_vm_fault_page_block_busy_kernel++); +					thread_block((void (*)()) 0); +				} +			    after_thread_block: +				wait_result = current_thread()->wait_result; +				vm_object_lock(object); +				if (wait_result != THREAD_AWAKENED) { +					vm_fault_cleanup(object, first_m); +					if (wait_result == THREAD_RESTART) +						return(VM_FAULT_RETRY); +					else +						return(VM_FAULT_INTERRUPTED); +				} +				continue; +			} + +			/* +			 *	If the page is in error, give up now. +			 */ + +			if (m->error) { +				VM_PAGE_FREE(m); +				vm_fault_cleanup(object, first_m); +				return(VM_FAULT_MEMORY_ERROR); +			} + +			/* +			 *	If the page isn't busy, but is absent, +			 *	then it was deemed "unavailable". +			 */ + +			if (m->absent) { +				/* +				 * Remove the non-existent page (unless it's +				 * in the top object) and move on down to the +				 * next object (if there is one). +				 */ + +				offset += object->shadow_offset; +				access_required = VM_PROT_READ; +				next_object = object->shadow; +				if (next_object == VM_OBJECT_NULL) { +					vm_page_t real_m; + +					assert(!must_be_resident); + +					/* +					 * Absent page at bottom of shadow +					 * chain; zero fill the page we left +					 * busy in the first object, and flush +					 * the absent page.  But first we +					 * need to allocate a real page. +					 */ + +					real_m = vm_page_grab(VM_PAGE_HIGHMEM); +					if (real_m == VM_PAGE_NULL) { +						vm_fault_cleanup(object, first_m); +						return(VM_FAULT_MEMORY_SHORTAGE); +					} + +					if (object != first_object) { +						VM_PAGE_FREE(m); +						vm_object_paging_end(object); +						vm_object_unlock(object); +						object = first_object; +						offset = first_offset; +						m = first_m; +						first_m = VM_PAGE_NULL; +						vm_object_lock(object); +					} + +					VM_PAGE_FREE(m); +					assert(real_m->busy); +					vm_page_lock_queues(); +					vm_page_insert(real_m, object, offset); +					vm_page_unlock_queues(); +					m = real_m; + +					/* +					 *  Drop the lock while zero filling +					 *  page.  Then break because this +					 *  is the page we wanted.  
Checking +					 *  the page lock is a waste of time; +					 *  this page was either absent or +					 *  newly allocated -- in both cases +					 *  it can't be page locked by a pager. +					 */ +					vm_object_unlock(object); + +					vm_page_zero_fill(m); + +					vm_stat_sample(SAMPLED_PC_VM_ZFILL_FAULTS); + +					vm_stat.zero_fill_count++; +					current_task()->zero_fills++; +					vm_object_lock(object); +					pmap_clear_modify(m->phys_addr); +					break; +				} else { +					if (must_be_resident) { +						vm_object_paging_end(object); +					} else if (object != first_object) { +						vm_object_paging_end(object); +						VM_PAGE_FREE(m); +					} else { +						first_m = m; +						m->absent = FALSE; +						vm_object_absent_release(object); +						m->busy = TRUE; + +						vm_page_lock_queues(); +						VM_PAGE_QUEUES_REMOVE(m); +						vm_page_unlock_queues(); +					} +					vm_object_lock(next_object); +					vm_object_unlock(object); +					object = next_object; +					vm_object_paging_begin(object); +					continue; +				} +			} + +			/* +			 *	If the desired access to this page has +			 *	been locked out, request that it be unlocked. +			 */ + +			if (access_required & m->page_lock) { +				if ((access_required & m->unlock_request) != access_required) { +					vm_prot_t	new_unlock_request; +					kern_return_t	rc; + +					if (!object->pager_ready) { +						vm_object_assert_wait(object, +							VM_OBJECT_EVENT_PAGER_READY, +							interruptible); +						goto block_and_backoff; +					} + +					new_unlock_request = m->unlock_request = +						(access_required | m->unlock_request); +					vm_object_unlock(object); +					if ((rc = memory_object_data_unlock( +						object->pager, +						object->pager_request, +						offset + object->paging_offset, +						PAGE_SIZE, +						new_unlock_request)) +					     != KERN_SUCCESS) { +					     	printf("vm_fault: memory_object_data_unlock failed\n"); +						vm_object_lock(object); +						vm_fault_cleanup(object, first_m); +						return((rc == MACH_SEND_INTERRUPTED) ? +							VM_FAULT_INTERRUPTED : +							VM_FAULT_MEMORY_ERROR); +					} +					vm_object_lock(object); +					continue; +				} + +				PAGE_ASSERT_WAIT(m, interruptible); +				goto block_and_backoff; +			} + +			/* +			 *	We mark the page busy and leave it on +			 *	the pageout queues.  If the pageout +			 *	deamon comes across it, then it will +			 *	remove the page. +			 */ + +			if (!software_reference_bits) { +				vm_page_lock_queues(); +				if (m->inactive)  { +				    	vm_stat_sample(SAMPLED_PC_VM_REACTIVATION_FAULTS); +					vm_stat.reactivations++; +					current_task()->reactivations++; +				} + +				VM_PAGE_QUEUES_REMOVE(m); +				vm_page_unlock_queues(); +			} + +			assert(!m->busy); +			m->busy = TRUE; +			assert(!m->absent); +			break; +		} + +		look_for_page = +			(object->pager_created) +#if	MACH_PAGEMAP +			&& (vm_external_state_get(object->existence_info, offset + object->paging_offset) != +			 VM_EXTERNAL_STATE_ABSENT) +#endif	/* MACH_PAGEMAP */ +			 ; + +		if ((look_for_page || (object == first_object)) +				 && !must_be_resident) { +			/* +			 *	Allocate a new page for this object/offset +			 *	pair. 
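+			 *	A fictitious page (one with no physical
+			 *	memory behind it) is enough here: it only
+			 *	marks the offset busy/absent while the
+			 *	pager is consulted, and is converted to a
+			 *	real page later if data must actually be
+			 *	placed there (see vm_page_convert below).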
+			 */ + +			m = vm_page_grab_fictitious(); +			if (m == VM_PAGE_NULL) { +				vm_fault_cleanup(object, first_m); +				return(VM_FAULT_FICTITIOUS_SHORTAGE); +			} + +			vm_page_lock_queues(); +			vm_page_insert(m, object, offset); +			vm_page_unlock_queues(); +		} + +		if (look_for_page && !must_be_resident) { +			kern_return_t	rc; + +			/* +			 *	If the memory manager is not ready, we +			 *	cannot make requests. +			 */ +			if (!object->pager_ready) { +				vm_object_assert_wait(object, +					VM_OBJECT_EVENT_PAGER_READY, +					interruptible); +				VM_PAGE_FREE(m); +				goto block_and_backoff; +			} + +			if (object->internal) { +				/* +				 *	Requests to the default pager +				 *	must reserve a real page in advance, +				 *	because the pager's data-provided +				 *	won't block for pages. +				 */ + +				if (m->fictitious && !vm_page_convert(&m)) { +					VM_PAGE_FREE(m); +					vm_fault_cleanup(object, first_m); +					return(VM_FAULT_MEMORY_SHORTAGE); +				} +			} else if (object->absent_count > +						vm_object_absent_max) { +				/* +				 *	If there are too many outstanding page +				 *	requests pending on this object, we +				 *	wait for them to be resolved now. +				 */ + +				vm_object_absent_assert_wait(object, interruptible); +				VM_PAGE_FREE(m); +				goto block_and_backoff; +			} + +			/* +			 *	Indicate that the page is waiting for data +			 *	from the memory manager. +			 */ + +			m->absent = TRUE; +			object->absent_count++; + +			/* +			 *	We have a busy page, so we can +			 *	release the object lock. +			 */ +			vm_object_unlock(object); + +			/* +			 *	Call the memory manager to retrieve the data. +			 */ + +			vm_stat.pageins++; +		    	vm_stat_sample(SAMPLED_PC_VM_PAGEIN_FAULTS); +			current_task()->pageins++; + +			if ((rc = memory_object_data_request(object->pager, +				object->pager_request, +				m->offset + object->paging_offset, +				PAGE_SIZE, access_required)) != KERN_SUCCESS) { +				if (object->pager && rc != MACH_SEND_INTERRUPTED) +					printf("%s(0x%p, 0x%p, 0x%zx, 0x%x, 0x%x) failed, %x\n", +						"memory_object_data_request", +						object->pager, +						object->pager_request, +						m->offset + object->paging_offset, +						PAGE_SIZE, access_required, rc); +				/* +				 *	Don't want to leave a busy page around, +				 *	but the data request may have blocked, +				 *	so check if it's still there and busy. +				 */ +				vm_object_lock(object); +				if (m == vm_page_lookup(object,offset) && +				    m->absent && m->busy) +					VM_PAGE_FREE(m); +				vm_fault_cleanup(object, first_m); +				return((rc == MACH_SEND_INTERRUPTED) ? +					VM_FAULT_INTERRUPTED : +					VM_FAULT_MEMORY_ERROR); +			} + +			/* +			 * Retry with same object/offset, since new data may +			 * be in a different page (i.e., m is meaningless at +			 * this point). +			 */ +			vm_object_lock(object); +			continue; +		} + +		/* +		 * For the XP system, the only case in which we get here is if +		 * object has no pager (or unwiring).  If the pager doesn't +		 * have the page this is handled in the m->absent case above +		 * (and if you change things here you should look above). +		 */ +		if (object == first_object) +			first_m = m; +		else +		{ +			assert(m == VM_PAGE_NULL); +		} + +		/* +		 *	Move on to the next object.  Lock the next +		 *	object before unlocking the current one. 
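+		 *	Taking the locks hand-over-hand keeps the
+		 *	shadow chain from being collapsed or rewritten
+		 *	between the two objects while we step down it.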
+		 */ +		access_required = VM_PROT_READ; + +		offset += object->shadow_offset; +		next_object = object->shadow; +		if (next_object == VM_OBJECT_NULL) { +			assert(!must_be_resident); + +			/* +			 *	If there's no object left, fill the page +			 *	in the top object with zeros.  But first we +			 *	need to allocate a real page. +			 */ + +			if (object != first_object) { +				vm_object_paging_end(object); +				vm_object_unlock(object); + +				object = first_object; +				offset = first_offset; +				vm_object_lock(object); +			} + +			m = first_m; +			assert(m->object == object); +			first_m = VM_PAGE_NULL; + +			if (m->fictitious && !vm_page_convert(&m)) { +				VM_PAGE_FREE(m); +				vm_fault_cleanup(object, VM_PAGE_NULL); +				return(VM_FAULT_MEMORY_SHORTAGE); +			} + +			vm_object_unlock(object); +			vm_page_zero_fill(m); +			vm_stat_sample(SAMPLED_PC_VM_ZFILL_FAULTS); +			vm_stat.zero_fill_count++; +			current_task()->zero_fills++; +			vm_object_lock(object); +			pmap_clear_modify(m->phys_addr); +			break; +		} +		else { +			vm_object_lock(next_object); +			if ((object != first_object) || must_be_resident) +				vm_object_paging_end(object); +			vm_object_unlock(object); +			object = next_object; +			vm_object_paging_begin(object); +		} +	} + +	/* +	 *	PAGE HAS BEEN FOUND. +	 * +	 *	This page (m) is: +	 *		busy, so that we can play with it; +	 *		not absent, so that nobody else will fill it; +	 *		possibly eligible for pageout; +	 * +	 *	The top-level page (first_m) is: +	 *		VM_PAGE_NULL if the page was found in the +	 *		 top-level object; +	 *		busy, not absent, and ineligible for pageout. +	 * +	 *	The current object (object) is locked.  A paging +	 *	reference is held for the current and top-level +	 *	objects. +	 */ + +	assert(m->busy && !m->absent); +	assert((first_m == VM_PAGE_NULL) || +		(first_m->busy && !first_m->absent && +		 !first_m->active && !first_m->inactive)); + +	/* +	 *	If the page is being written, but isn't +	 *	already owned by the top-level object, +	 *	we have to copy it into a new page owned +	 *	by the top-level object. +	 */ + +	if (object != first_object) { +	    	/* +		 *	We only really need to copy if we +		 *	want to write it. +		 */ + +	    	if (fault_type & VM_PROT_WRITE) { +			vm_page_t copy_m; + +			assert(!must_be_resident); + +			/* +			 *	If we try to collapse first_object at this +			 *	point, we may deadlock when we try to get +			 *	the lock on an intermediate object (since we +			 *	have the bottom object locked).  We can't +			 *	unlock the bottom object, because the page +			 *	we found may move (by collapse) if we do. +			 * +			 *	Instead, we first copy the page.  Then, when +			 *	we have no more use for the bottom object, +			 *	we unlock it and try to collapse. +			 * +			 *	Note that we copy the page even if we didn't +			 *	need to... that's the breaks. +			 */ + +			/* +			 *	Allocate a page for the copy +			 */ +			copy_m = vm_page_grab(VM_PAGE_HIGHMEM); +			if (copy_m == VM_PAGE_NULL) { +				RELEASE_PAGE(m); +				vm_fault_cleanup(object, first_m); +				return(VM_FAULT_MEMORY_SHORTAGE); +			} + +			vm_object_unlock(object); +			vm_page_copy(m, copy_m); +			vm_object_lock(object); + +			/* +			 *	If another map is truly sharing this +			 *	page with us, we have to flush all +			 *	uses of the original page, since we +			 *	can't distinguish those which want the +			 *	original from those which need the +			 *	new copy. 
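+			 *	pmap_page_protect(..., VM_PROT_NONE) below
+			 *	removes every hardware mapping of the
+			 *	physical page, so each such user refaults
+			 *	and is then given whichever page is
+			 *	appropriate for it.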
+			 * +			 *	XXXO If we know that only one map has +			 *	access to this page, then we could +			 *	avoid the pmap_page_protect() call. +			 */ + +			vm_page_lock_queues(); +			vm_page_deactivate(m); +			pmap_page_protect(m->phys_addr, VM_PROT_NONE); +			vm_page_unlock_queues(); + +			/* +			 *	We no longer need the old page or object. +			 */ + +			PAGE_WAKEUP_DONE(m); +			vm_object_paging_end(object); +			vm_object_unlock(object); + +			vm_stat.cow_faults++; +			vm_stat_sample(SAMPLED_PC_VM_COW_FAULTS); +			current_task()->cow_faults++; +			object = first_object; +			offset = first_offset; + +			vm_object_lock(object); +			VM_PAGE_FREE(first_m); +			first_m = VM_PAGE_NULL; +			assert(copy_m->busy); +			vm_page_lock_queues(); +			vm_page_insert(copy_m, object, offset); +			vm_page_unlock_queues(); +			m = copy_m; + +			/* +			 *	Now that we've gotten the copy out of the +			 *	way, let's try to collapse the top object. +			 *	But we have to play ugly games with +			 *	paging_in_progress to do that... +			 */ + +			vm_object_paging_end(object); +			vm_object_collapse(object); +			vm_object_paging_begin(object); +		} +		else { +		    	*protection &= (~VM_PROT_WRITE); +		} +	} + +	/* +	 *	Now check whether the page needs to be pushed into the +	 *	copy object.  The use of asymmetric copy on write for +	 *	shared temporary objects means that we may do two copies to +	 *	satisfy the fault; one above to get the page from a +	 *	shadowed object, and one here to push it into the copy. +	 */ + +	while ((copy_object = first_object->copy) != VM_OBJECT_NULL) { +		vm_offset_t	copy_offset; +		vm_page_t	copy_m; + +		/* +		 *	If the page is being written, but hasn't been +		 *	copied to the copy-object, we have to copy it there. +		 */ + +		if ((fault_type & VM_PROT_WRITE) == 0) { +			*protection &= ~VM_PROT_WRITE; +			break; +		} + +		/* +		 *	If the page was guaranteed to be resident, +		 *	we must have already performed the copy. +		 */ + +		if (must_be_resident) +			break; + +		/* +		 *	Try to get the lock on the copy_object. +		 */ +		if (!vm_object_lock_try(copy_object)) { +			vm_object_unlock(object); + +			simple_lock_pause();	/* wait a bit */ + +			vm_object_lock(object); +			continue; +		} + +		/* +		 *	Make another reference to the copy-object, +		 *	to keep it from disappearing during the +		 *	copy. +		 */ +		assert(copy_object->ref_count > 0); +		copy_object->ref_count++; + +		/* +		 *	Does the page exist in the copy? +		 */ +		copy_offset = first_offset - copy_object->shadow_offset; +		copy_m = vm_page_lookup(copy_object, copy_offset); +		if (copy_m != VM_PAGE_NULL) { +			if (copy_m->busy) { +				/* +				 *	If the page is being brought +				 *	in, wait for it and then retry. +				 */ +				PAGE_ASSERT_WAIT(copy_m, interruptible); +				RELEASE_PAGE(m); +				copy_object->ref_count--; +				assert(copy_object->ref_count > 0); +				vm_object_unlock(copy_object); +				goto block_and_backoff; +			} +		} +		else { +			/* +			 *	Allocate a page for the copy +			 */ +			copy_m = vm_page_alloc(copy_object, copy_offset); +			if (copy_m == VM_PAGE_NULL) { +				RELEASE_PAGE(m); +				copy_object->ref_count--; +				assert(copy_object->ref_count > 0); +				vm_object_unlock(copy_object); +				vm_fault_cleanup(object, first_m); +				return(VM_FAULT_MEMORY_SHORTAGE); +			} + +			/* +			 *	Must copy page into copy-object. +			 */ + +			vm_page_copy(m, copy_m); + +			/* +			 *	If the old page was in use by any users +			 *	of the copy-object, it must be removed +			 *	from all pmaps.  
(We can't know which +			 *	pmaps use it.) +			 */ + +			vm_page_lock_queues(); +			pmap_page_protect(m->phys_addr, VM_PROT_NONE); +			copy_m->dirty = TRUE; +			vm_page_unlock_queues(); + +			/* +			 *	If there's a pager, then immediately +			 *	page out this page, using the "initialize" +			 *	option.  Else, we use the copy. +			 */ + +		 	if (!copy_object->pager_created) { +				vm_page_lock_queues(); +				vm_page_activate(copy_m); +				vm_page_unlock_queues(); +				PAGE_WAKEUP_DONE(copy_m); +			} else { +				/* +				 *	The page is already ready for pageout: +				 *	not on pageout queues and busy. +				 *	Unlock everything except the +				 *	copy_object itself. +				 */ + +				vm_object_unlock(object); + +				/* +				 *	Write the page to the copy-object, +				 *	flushing it from the kernel. +				 */ + +				vm_pageout_page(copy_m, TRUE, TRUE); + +				/* +				 *	Since the pageout may have +				 *	temporarily dropped the +				 *	copy_object's lock, we +				 *	check whether we'll have +				 *	to deallocate the hard way. +				 */ + +				if ((copy_object->shadow != object) || +				    (copy_object->ref_count == 1)) { +					vm_object_unlock(copy_object); +					vm_object_deallocate(copy_object); +					vm_object_lock(object); +					continue; +				} + +				/* +				 *	Pick back up the old object's +				 *	lock.  [It is safe to do so, +				 *	since it must be deeper in the +				 *	object tree.] +				 */ + +				vm_object_lock(object); +			} + +			/* +			 *	Because we're pushing a page upward +			 *	in the object tree, we must restart +			 *	any faults that are waiting here. +			 *	[Note that this is an expansion of +			 *	PAGE_WAKEUP that uses the THREAD_RESTART +			 *	wait result].  Can't turn off the page's +			 *	busy bit because we're not done with it. +			 */ + +			if (m->wanted) { +				m->wanted = FALSE; +				thread_wakeup_with_result((event_t) m, +					THREAD_RESTART); +			} +		} + +		/* +		 *	The reference count on copy_object must be +		 *	at least 2: one for our extra reference, +		 *	and at least one from the outside world +		 *	(we checked that when we last locked +		 *	copy_object). +		 */ +		copy_object->ref_count--; +		assert(copy_object->ref_count > 0); +		vm_object_unlock(copy_object); + +		break; +	} + +	*result_page = m; +	*top_page = first_m; + +	/* +	 *	If the page can be written, assume that it will be. +	 *	[Earlier, we restrict the permission to allow write +	 *	access only if the fault so required, so we don't +	 *	mark read-only data as dirty.] +	 */ + +	if (vm_fault_dirty_handling && (*protection & VM_PROT_WRITE)) +		m->dirty = TRUE; + +	return(VM_FAULT_SUCCESS); + +    block_and_backoff: +	vm_fault_cleanup(object, first_m); + +	if (continuation != thread_no_continuation) { +		vm_fault_state_t *state = +			(vm_fault_state_t *) current_thread()->ith_other; + +		/* +		 *	Save variables in case we must restart. +		 */ + +		state->vmfp_backoff = TRUE; +		state->vmf_prot = *protection; + +		counter(c_vm_fault_page_block_backoff_user++); +		thread_block(continuation); +	} else +	{ +		counter(c_vm_fault_page_block_backoff_kernel++); +		thread_block((void (*)()) 0); +	} +    after_block_and_backoff: +	if (current_thread()->wait_result == THREAD_AWAKENED) +		return VM_FAULT_RETRY; +	else +		return VM_FAULT_INTERRUPTED; + +#undef	RELEASE_PAGE +} + +/* + *	Routine:	vm_fault + *	Purpose: + *		Handle page faults, including pseudo-faults + *		used to change the wiring status of pages. 
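+ *		Typically reached from the machine-dependent trap
+ *		handler with a page-aligned address; an illustrative
+ *		(not literal) invocation would be
+ *
+ *		    kr = vm_fault(map, trunc_page(addr), VM_PROT_READ,
+ *				  FALSE, FALSE, vm_fault_no_continuation);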
+ *	Returns: + *		If an explicit (expression) continuation is supplied, + *		then we call the continuation instead of returning. + *	Implementation: + *		Explicit continuations make this a little icky, + *		because it hasn't been rewritten to embrace CPS. + *		Instead, we have resume arguments for vm_fault and + *		vm_fault_page, to let continue the fault computation. + * + *		vm_fault and vm_fault_page save mucho state + *		in the moral equivalent of a closure.  The state + *		structure is allocated when first entering vm_fault + *		and deallocated when leaving vm_fault. + */ + +static void +vm_fault_continue(void) +{ +	vm_fault_state_t *state = +		(vm_fault_state_t *) current_thread()->ith_other; + +	(void) vm_fault(state->vmf_map, +			state->vmf_vaddr, +			state->vmf_fault_type, +			state->vmf_change_wiring, +			TRUE, state->vmf_continuation); +	/*NOTREACHED*/ +} + +kern_return_t vm_fault( +	vm_map_t	map, +	vm_offset_t	vaddr, +	vm_prot_t	fault_type, +	boolean_t	change_wiring, +	boolean_t	resume, +	vm_fault_continuation_t	continuation) +{ +	vm_map_version_t	version;	/* Map version for verificiation */ +	boolean_t		wired;		/* Should mapping be wired down? */ +	vm_object_t		object;		/* Top-level object */ +	vm_offset_t		offset;		/* Top-level offset */ +	vm_prot_t		prot;		/* Protection for mapping */ +	vm_object_t		old_copy_object; /* Saved copy object */ +	vm_page_t		result_page;	/* Result of vm_fault_page */ +	vm_page_t		top_page;	/* Placeholder page */ +	kern_return_t		kr; + +	vm_page_t		m;	/* Fast access to result_page */ + +	if (resume) { +		vm_fault_state_t *state = +			(vm_fault_state_t *) current_thread()->ith_other; + +		/* +		 *	Retrieve cached variables and +		 *	continue vm_fault_page. +		 */ + +		object = state->vmf_object; +		if (object == VM_OBJECT_NULL) +			goto RetryFault; +		version = state->vmf_version; +		wired = state->vmf_wired; +		offset = state->vmf_offset; +		prot = state->vmf_prot; + +		kr = vm_fault_page(object, offset, fault_type, +				(change_wiring && !wired), !change_wiring, +				&prot, &result_page, &top_page, +				TRUE, vm_fault_continue); +		goto after_vm_fault_page; +	} + +	if (continuation != vm_fault_no_continuation) { +		/* +		 *	We will probably need to save state. +		 */ + +		char *	state; + +		/* +		 * if this assignment stmt is written as +		 * 'active_threads[cpu_number()] = kmem_cache_alloc()', +		 * cpu_number may be evaluated before kmem_cache_alloc; +		 * if kmem_cache_alloc blocks, cpu_number will be wrong +		 */ + +		state = (char *) kmem_cache_alloc(&vm_fault_state_cache); +		current_thread()->ith_other = state; + +	} + +    RetryFault: ; + +	/* +	 *	Find the backing store object and offset into +	 *	it to begin the search. +	 */ + +	if ((kr = vm_map_lookup(&map, vaddr, fault_type, &version, +				&object, &offset, +				&prot, &wired)) != KERN_SUCCESS) { +		goto done; +	} + +	/* +	 *	If the page is wired, we must fault for the current protection +	 *	value, to avoid further faults. +	 */ + +	if (wired) +		fault_type = prot; + +   	/* +	 *	Make a reference to this object to +	 *	prevent its disposal while we are messing with +	 *	it.  Once we have the reference, the map is free +	 *	to be diddled.  Since objects reference their +	 *	shadows (and copies), they will stay around as well. 
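+	 *	The reference is dropped again with vm_object_deallocate
+	 *	once vm_fault_page has returned: directly if the fault
+	 *	did not succeed, through UNLOCK_AND_DEALLOCATE below if
+	 *	it did.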
+	 */ + +	assert(object->ref_count > 0); +	object->ref_count++; +	vm_object_paging_begin(object); + +	if (continuation != vm_fault_no_continuation) { +		vm_fault_state_t *state = +			(vm_fault_state_t *) current_thread()->ith_other; + +		/* +		 *	Save variables, in case vm_fault_page discards +		 *	our kernel stack and we have to restart. +		 */ + +		state->vmf_map = map; +		state->vmf_vaddr = vaddr; +		state->vmf_fault_type = fault_type; +		state->vmf_change_wiring = change_wiring; +		state->vmf_continuation = continuation; + +		state->vmf_version = version; +		state->vmf_wired = wired; +		state->vmf_object = object; +		state->vmf_offset = offset; +		state->vmf_prot = prot; + +		kr = vm_fault_page(object, offset, fault_type, +				   (change_wiring && !wired), !change_wiring, +				   &prot, &result_page, &top_page, +				   FALSE, vm_fault_continue); +	} else +	{ +		kr = vm_fault_page(object, offset, fault_type, +				   (change_wiring && !wired), !change_wiring, +				   &prot, &result_page, &top_page, +				   FALSE, (void (*)()) 0); +	} +    after_vm_fault_page: + +	/* +	 *	If we didn't succeed, lose the object reference immediately. +	 */ + +	if (kr != VM_FAULT_SUCCESS) +		vm_object_deallocate(object); + +	/* +	 *	See why we failed, and take corrective action. +	 */ + +	switch (kr) { +		case VM_FAULT_SUCCESS: +			break; +		case VM_FAULT_RETRY: +			goto RetryFault; +		case VM_FAULT_INTERRUPTED: +			kr = KERN_SUCCESS; +			goto done; +		case VM_FAULT_MEMORY_SHORTAGE: +			if (continuation != vm_fault_no_continuation) { +				vm_fault_state_t *state = +					(vm_fault_state_t *) current_thread()->ith_other; + +				/* +				 *	Save variables in case VM_PAGE_WAIT +				 *	discards our kernel stack. +				 */ + +				state->vmf_map = map; +				state->vmf_vaddr = vaddr; +				state->vmf_fault_type = fault_type; +				state->vmf_change_wiring = change_wiring; +				state->vmf_continuation = continuation; +				state->vmf_object = VM_OBJECT_NULL; + +				VM_PAGE_WAIT(vm_fault_continue); +			} else +				VM_PAGE_WAIT((void (*)()) 0); +			goto RetryFault; +		case VM_FAULT_FICTITIOUS_SHORTAGE: +			vm_page_more_fictitious(); +			goto RetryFault; +		case VM_FAULT_MEMORY_ERROR: +			kr = KERN_MEMORY_ERROR; +			goto done; +	} + +	m = result_page; + +	assert((change_wiring && !wired) ? +	       (top_page == VM_PAGE_NULL) : +	       ((top_page == VM_PAGE_NULL) == (m->object == object))); + +	/* +	 *	How to clean up the result of vm_fault_page.  This +	 *	happens whether the mapping is entered or not. +	 */ + +#define UNLOCK_AND_DEALLOCATE				\ +	MACRO_BEGIN					\ +	vm_fault_cleanup(m->object, top_page);		\ +	vm_object_deallocate(object);			\ +	MACRO_END + +	/* +	 *	What to do with the resulting page from vm_fault_page +	 *	if it doesn't get entered into the physical map: +	 */ + +#define RELEASE_PAGE(m)					\ +	MACRO_BEGIN					\ +	PAGE_WAKEUP_DONE(m);				\ +	vm_page_lock_queues();				\ +	if (!m->active && !m->inactive)			\ +		vm_page_activate(m);			\ +	vm_page_unlock_queues();			\ +	MACRO_END + +	/* +	 *	We must verify that the maps have not changed +	 *	since our last lookup. +	 */ + +	old_copy_object = m->object->copy; + +	vm_object_unlock(m->object); +	while (!vm_map_verify(map, &version)) { +		vm_object_t	retry_object; +		vm_offset_t	retry_offset; +		vm_prot_t	retry_prot; + +		/* +		 *	To avoid trying to write_lock the map while another +		 *	thread has it read_locked (in vm_map_pageable), we +		 *	do not try for write permission.  If the page is +		 *	still writable, we will get write permission.  
If it +		 *	is not, or has been marked needs_copy, we enter the +		 *	mapping without write permission, and will merely +		 *	take another fault. +		 */ +		kr = vm_map_lookup(&map, vaddr, +				   fault_type & ~VM_PROT_WRITE, &version, +				   &retry_object, &retry_offset, &retry_prot, +				   &wired); + +		if (kr != KERN_SUCCESS) { +			vm_object_lock(m->object); +			RELEASE_PAGE(m); +			UNLOCK_AND_DEALLOCATE; +			goto done; +		} + +		vm_object_unlock(retry_object); +		vm_object_lock(m->object); + +		if ((retry_object != object) || +		    (retry_offset != offset)) { +			RELEASE_PAGE(m); +			UNLOCK_AND_DEALLOCATE; +			goto RetryFault; +		} + +		/* +		 *	Check whether the protection has changed or the object +		 *	has been copied while we left the map unlocked. +		 */ +		prot &= retry_prot; +		vm_object_unlock(m->object); +	} +	vm_object_lock(m->object); + +	/* +	 *	If the copy object changed while the top-level object +	 *	was unlocked, then we must take away write permission. +	 */ + +	if (m->object->copy != old_copy_object) +		prot &= ~VM_PROT_WRITE; + +	/* +	 *	If we want to wire down this page, but no longer have +	 *	adequate permissions, we must start all over. +	 */ + +	if (wired && (prot != fault_type)) { +		vm_map_verify_done(map, &version); +		RELEASE_PAGE(m); +		UNLOCK_AND_DEALLOCATE; +		goto RetryFault; +	} + +	/* +	 *	It's critically important that a wired-down page be faulted +	 *	only once in each map for which it is wired. +	 */ + +	vm_object_unlock(m->object); + +	/* +	 *	Put this page into the physical map. +	 *	We had to do the unlock above because pmap_enter +	 *	may cause other faults.  The page may be on +	 *	the pageout queues.  If the pageout daemon comes +	 *	across the page, it will remove it from the queues. +	 */ + +	PMAP_ENTER(map->pmap, vaddr, m, prot, wired); + +	/* +	 *	If the page is not wired down and isn't already +	 *	on a pageout queue, then put it where the +	 *	pageout daemon can find it. +	 */ +	vm_object_lock(m->object); +	vm_page_lock_queues(); +	if (change_wiring) { +		if (wired) +			vm_page_wire(m); +		else +			vm_page_unwire(m); +	} else if (software_reference_bits) { +		if (!m->active && !m->inactive) +			vm_page_activate(m); +		m->reference = TRUE; +	} else { +		vm_page_activate(m); +	} +	vm_page_unlock_queues(); + +	/* +	 *	Unlock everything, and return +	 */ + +	vm_map_verify_done(map, &version); +	PAGE_WAKEUP_DONE(m); +	kr = KERN_SUCCESS; +	UNLOCK_AND_DEALLOCATE; + +#undef	UNLOCK_AND_DEALLOCATE +#undef	RELEASE_PAGE + +    done: +	if (continuation != vm_fault_no_continuation) { +		vm_fault_state_t *state = +			(vm_fault_state_t *) current_thread()->ith_other; + +		kmem_cache_free(&vm_fault_state_cache, (vm_offset_t) state); +		(*continuation)(kr); +		/*NOTREACHED*/ +	} + +	return(kr); +} + +/* + *	vm_fault_wire: + * + *	Wire down a range of virtual addresses in a map. + */ +void vm_fault_wire( +	vm_map_t	map, +	vm_map_entry_t	entry) +{ + +	vm_offset_t	va; +	pmap_t		pmap; +	vm_offset_t	end_addr = entry->vme_end; + +	pmap = vm_map_pmap(map); + +	/* +	 *	Inform the physical mapping system that the +	 *	range of addresses may not fault, so that +	 *	page tables and such can be locked down as well. +	 */ + +	pmap_pageable(pmap, entry->vme_start, end_addr, FALSE); + +	/* +	 *	We simulate a fault to get the page and enter it +	 *	in the physical map. 
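+	 *	vm_fault_wire_fast handles the common case (the page is
+	 *	already resident in the top-level object); anything else
+	 *	falls back to a full vm_fault with change_wiring set.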
+	 */ + +	for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) { +		if (vm_fault_wire_fast(map, va, entry) != KERN_SUCCESS) +			(void) vm_fault(map, va, VM_PROT_NONE, TRUE, +					FALSE, (void (*)()) 0); +	} +} + +/* + *	vm_fault_unwire: + * + *	Unwire a range of virtual addresses in a map. + */ +void vm_fault_unwire( +	vm_map_t	map, +	vm_map_entry_t	entry) +{ +	vm_offset_t	va; +	pmap_t		pmap; +	vm_offset_t	end_addr = entry->vme_end; +	vm_object_t	object; + +	pmap = vm_map_pmap(map); + +	object = (entry->is_sub_map) +			? VM_OBJECT_NULL : entry->object.vm_object; + +	/* +	 *	Since the pages are wired down, we must be able to +	 *	get their mappings from the physical map system. +	 */ + +	for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) { +		pmap_change_wiring(pmap, va, FALSE); + +		if (object == VM_OBJECT_NULL) { +			vm_map_lock_set_recursive(map); +			(void) vm_fault(map, va, VM_PROT_NONE, TRUE, +					FALSE, (void (*)()) 0); +			vm_map_lock_clear_recursive(map); +		} else { +		 	vm_prot_t	prot; +			vm_page_t	result_page; +			vm_page_t	top_page; +			vm_fault_return_t result; + +			do { +				prot = VM_PROT_NONE; + +				vm_object_lock(object); +				vm_object_paging_begin(object); +			 	result = vm_fault_page(object, +						entry->offset + +						  (va - entry->vme_start), +						VM_PROT_NONE, TRUE, +						FALSE, &prot, +						&result_page, +						&top_page, +						FALSE, (void (*)()) 0); +			} while (result == VM_FAULT_RETRY); + +			if (result != VM_FAULT_SUCCESS) +				panic("vm_fault_unwire: failure"); + +			vm_page_lock_queues(); +			vm_page_unwire(result_page); +			vm_page_unlock_queues(); +			PAGE_WAKEUP_DONE(result_page); + +			vm_fault_cleanup(result_page->object, top_page); +		} +	} + +	/* +	 *	Inform the physical mapping system that the range +	 *	of addresses may fault, so that page tables and +	 *	such may be unwired themselves. +	 */ + +	pmap_pageable(pmap, entry->vme_start, end_addr, TRUE); +} + +/* + *	vm_fault_wire_fast: + * + *	Handle common case of a wire down page fault at the given address. + *	If successful, the page is inserted into the associated physical map. + *	The map entry is passed in to avoid the overhead of a map lookup. + * + *	NOTE: the given address should be truncated to the + *	proper page address. + * + *	KERN_SUCCESS is returned if the page fault is handled; otherwise, + *	a standard error specifying why the fault is fatal is returned. + * + *	The map in question must be referenced, and remains so. + *	Caller has a read lock on the map. + * + *	This is a stripped version of vm_fault() for wiring pages.  Anything + *	other than the common case will return KERN_FAILURE, and the caller + *	is expected to call vm_fault(). + */ +kern_return_t vm_fault_wire_fast( +	vm_map_t	map, +	vm_offset_t	va, +	vm_map_entry_t	entry) +{ +	vm_object_t		object; +	vm_offset_t		offset; +	vm_page_t		m; +	vm_prot_t		prot; + +	vm_stat.faults++;		/* needs lock XXX */ +	current_task()->faults++; +/* + *	Recovery actions + */ + +#undef	RELEASE_PAGE +#define RELEASE_PAGE(m)	{				\ +	PAGE_WAKEUP_DONE(m);				\ +	vm_page_lock_queues();				\ +	vm_page_unwire(m);				\ +	vm_page_unlock_queues();			\ +} + + +#undef	UNLOCK_THINGS +#define UNLOCK_THINGS	{				\ +	object->paging_in_progress--;			\ +	vm_object_unlock(object);			\ +} + +#undef	UNLOCK_AND_DEALLOCATE +#define UNLOCK_AND_DEALLOCATE	{			\ +	UNLOCK_THINGS;					\ +	vm_object_deallocate(object);			\ +} +/* + *	Give up and have caller do things the hard way. 
+ */ + +#define GIVE_UP {					\ +	UNLOCK_AND_DEALLOCATE;				\ +	return(KERN_FAILURE);				\ +} + + +	/* +	 *	If this entry is not directly to a vm_object, bail out. +	 */ +	if (entry->is_sub_map) +		return(KERN_FAILURE); + +	/* +	 *	Find the backing store object and offset into it. +	 */ + +	object = entry->object.vm_object; +	offset = (va - entry->vme_start) + entry->offset; +	prot = entry->protection; + +   	/* +	 *	Make a reference to this object to prevent its +	 *	disposal while we are messing with it. +	 */ + +	vm_object_lock(object); +	assert(object->ref_count > 0); +	object->ref_count++; +	object->paging_in_progress++; + +	/* +	 *	INVARIANTS (through entire routine): +	 * +	 *	1)	At all times, we must either have the object +	 *		lock or a busy page in some object to prevent +	 *		some other thread from trying to bring in +	 *		the same page. +	 * +	 *	2)	Once we have a busy page, we must remove it from +	 *		the pageout queues, so that the pageout daemon +	 *		will not grab it away. +	 * +	 */ + +	/* +	 *	Look for page in top-level object.  If it's not there or +	 *	there's something going on, give up. +	 */ +	m = vm_page_lookup(object, offset); +	if ((m == VM_PAGE_NULL) || (m->error) || +	    (m->busy) || (m->absent) || (prot & m->page_lock)) { +		GIVE_UP; +	} + +	/* +	 *	Wire the page down now.  All bail outs beyond this +	 *	point must unwire the page. +	 */ + +	vm_page_lock_queues(); +	vm_page_wire(m); +	vm_page_unlock_queues(); + +	/* +	 *	Mark page busy for other threads. +	 */ +	assert(!m->busy); +	m->busy = TRUE; +	assert(!m->absent); + +	/* +	 *	Give up if the page is being written and there's a copy object +	 */ +	if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) { +		RELEASE_PAGE(m); +		GIVE_UP; +	} + +	/* +	 *	Put this page into the physical map. +	 *	We have to unlock the object because pmap_enter +	 *	may cause other faults. +	 */ +	vm_object_unlock(object); + +	PMAP_ENTER(map->pmap, va, m, prot, TRUE); + +	/* +	 *	Must relock object so that paging_in_progress can be cleared. +	 */ +	vm_object_lock(object); + +	/* +	 *	Unlock everything, and return +	 */ + +	PAGE_WAKEUP_DONE(m); +	UNLOCK_AND_DEALLOCATE; + +	return(KERN_SUCCESS); + +} + +/* + *	Routine:	vm_fault_copy_cleanup + *	Purpose: + *		Release a page used by vm_fault_copy. + */ + +static void vm_fault_copy_cleanup( +	vm_page_t	page, +	vm_page_t	top_page) +{ +	vm_object_t	object = page->object; + +	vm_object_lock(object); +	PAGE_WAKEUP_DONE(page); +	vm_page_lock_queues(); +	if (!page->active && !page->inactive) +		vm_page_activate(page); +	vm_page_unlock_queues(); +	vm_fault_cleanup(object, top_page); +} + +/* + *	Routine:	vm_fault_copy + * + *	Purpose: + *		Copy pages from one virtual memory object to another -- + *		neither the source nor destination pages need be resident. + * + *		Before actually copying a page, the version associated with + *		the destination address map wil be verified. + * + *	In/out conditions: + *		The caller must hold a reference, but not a lock, to + *		each of the source and destination objects and to the + *		destination map. + * + *	Results: + *		Returns KERN_SUCCESS if no errors were encountered in + *		reading or writing the data.  Returns KERN_INTERRUPTED if + *		the operation was interrupted (only possible if the + *		"interruptible" argument is asserted).  Other return values + *		indicate a permanent error in copying the data. + * + *		The actual amount of data copied will be returned in the + *		"copy_size" argument.  
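vm_fault_wire_fast() above leans on local RELEASE_PAGE / UNLOCK_THINGS / UNLOCK_AND_DEALLOCATE / GIVE_UP macros so that every early exit undoes exactly the state acquired so far. A minimal stand-alone model of that bail-out idiom follows; the object struct and its lock/deallocate helpers are toys invented for illustration, not the kernel's types.

```c
/* Illustrative sketch of the macro-based bail-out idiom.
 * All types and helpers here are stand-ins. */
#include <stdio.h>

struct object { int locked; int refs; };

static void obj_unlock(struct object *o)     { o->locked = 0; }
static void obj_deallocate(struct object *o) { o->refs--; }

#define UNLOCK_THINGS(o)         { obj_unlock(o); }
#define UNLOCK_AND_DEALLOCATE(o) { UNLOCK_THINGS(o); obj_deallocate(o); }
#define GIVE_UP(o)               { UNLOCK_AND_DEALLOCATE(o); return -1; }

static int fast_path(struct object *o, int page_ok)
{
	o->refs++;            /* take a reference ... */
	o->locked = 1;        /* ... and the lock     */

	if (!page_ok)
		GIVE_UP(o);   /* every bail-out releases both */

	UNLOCK_AND_DEALLOCATE(o);
	return 0;
}

int main(void)
{
	struct object o = { 0, 1 };

	printf("ok:   %d (refs=%d)\n", fast_path(&o, 1), o.refs);
	printf("fail: %d (refs=%d)\n", fast_path(&o, 0), o.refs);
	return 0;
}
```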
In the event that the destination map + *		verification failed, this amount may be less than the amount + *		requested. + */ +kern_return_t	vm_fault_copy( +	vm_object_t	src_object, +	vm_offset_t	src_offset, +	vm_size_t	*src_size,		/* INOUT */ +	vm_object_t	dst_object, +	vm_offset_t	dst_offset, +	vm_map_t	dst_map, +	vm_map_version_t *dst_version, +	boolean_t	interruptible) +{ +	vm_page_t		result_page; +	vm_prot_t		prot; + +	vm_page_t		src_page; +	vm_page_t		src_top_page; + +	vm_page_t		dst_page; +	vm_page_t		dst_top_page; + +	vm_size_t		amount_done; +	vm_object_t		old_copy_object; + +#define	RETURN(x)					\ +	MACRO_BEGIN					\ +	*src_size = amount_done;			\ +	MACRO_RETURN(x);				\ +	MACRO_END + +	amount_done = 0; +	do { /* while (amount_done != *src_size) */ + +	    RetrySourceFault: ; + +		if (src_object == VM_OBJECT_NULL) { +			/* +			 *	No source object.  We will just +			 *	zero-fill the page in dst_object. +			 */ + +			src_page = VM_PAGE_NULL; +		} else { +			prot = VM_PROT_READ; + +			vm_object_lock(src_object); +			vm_object_paging_begin(src_object); + +			switch (vm_fault_page(src_object, src_offset, +					VM_PROT_READ, FALSE, interruptible, +					&prot, &result_page, &src_top_page, +					FALSE, (void (*)()) 0)) { + +				case VM_FAULT_SUCCESS: +					break; +				case VM_FAULT_RETRY: +					goto RetrySourceFault; +				case VM_FAULT_INTERRUPTED: +					RETURN(MACH_SEND_INTERRUPTED); +				case VM_FAULT_MEMORY_SHORTAGE: +					VM_PAGE_WAIT((void (*)()) 0); +					goto RetrySourceFault; +				case VM_FAULT_FICTITIOUS_SHORTAGE: +					vm_page_more_fictitious(); +					goto RetrySourceFault; +				case VM_FAULT_MEMORY_ERROR: +					return(KERN_MEMORY_ERROR); +			} + +			src_page = result_page; + +			assert((src_top_page == VM_PAGE_NULL) == +					(src_page->object == src_object)); + +			assert ((prot & VM_PROT_READ) != VM_PROT_NONE); + +			vm_object_unlock(src_page->object); +		} + +	    RetryDestinationFault: ; + +		prot = VM_PROT_WRITE; + +		vm_object_lock(dst_object); +		vm_object_paging_begin(dst_object); + +		switch (vm_fault_page(dst_object, dst_offset, VM_PROT_WRITE, +				FALSE, FALSE /* interruptible */, +				&prot, &result_page, &dst_top_page, +				FALSE, (void (*)()) 0)) { + +			case VM_FAULT_SUCCESS: +				break; +			case VM_FAULT_RETRY: +				goto RetryDestinationFault; +			case VM_FAULT_INTERRUPTED: +				if (src_page != VM_PAGE_NULL) +					vm_fault_copy_cleanup(src_page, +							      src_top_page); +				RETURN(MACH_SEND_INTERRUPTED); +			case VM_FAULT_MEMORY_SHORTAGE: +				VM_PAGE_WAIT((void (*)()) 0); +				goto RetryDestinationFault; +			case VM_FAULT_FICTITIOUS_SHORTAGE: +				vm_page_more_fictitious(); +				goto RetryDestinationFault; +			case VM_FAULT_MEMORY_ERROR: +				if (src_page != VM_PAGE_NULL) +					vm_fault_copy_cleanup(src_page, +							      src_top_page); +				return(KERN_MEMORY_ERROR); +		} +		assert ((prot & VM_PROT_WRITE) != VM_PROT_NONE); + +		dst_page = result_page; + +		old_copy_object = dst_page->object->copy; + +		vm_object_unlock(dst_page->object); + +		if (!vm_map_verify(dst_map, dst_version)) { + +		 BailOut: ; + +			if (src_page != VM_PAGE_NULL) +				vm_fault_copy_cleanup(src_page, src_top_page); +			vm_fault_copy_cleanup(dst_page, dst_top_page); +			break; +		} + + +		vm_object_lock(dst_page->object); +		if (dst_page->object->copy != old_copy_object) { +			vm_object_unlock(dst_page->object); +			vm_map_verify_done(dst_map, dst_version); +			goto BailOut; +		} +		vm_object_unlock(dst_page->object); + +		/* +		 *	Copy the page, and note that it is dirty +		 *	
immediately. +		 */ + +		if (src_page == VM_PAGE_NULL) +			vm_page_zero_fill(dst_page); +		else +			vm_page_copy(src_page, dst_page); +		dst_page->dirty = TRUE; + +		/* +		 *	Unlock everything, and return +		 */ + +		vm_map_verify_done(dst_map, dst_version); + +		if (src_page != VM_PAGE_NULL) +			vm_fault_copy_cleanup(src_page, src_top_page); +		vm_fault_copy_cleanup(dst_page, dst_top_page); + +		amount_done += PAGE_SIZE; +		src_offset += PAGE_SIZE; +		dst_offset += PAGE_SIZE; + +	} while (amount_done != *src_size); + +	RETURN(KERN_SUCCESS); +#undef	RETURN + +	/*NOTREACHED*/ +} + + + + + +#ifdef	notdef + +/* + *	Routine:	vm_fault_page_overwrite + * + *	Description: + *		A form of vm_fault_page that assumes that the + *		resulting page will be overwritten in its entirety, + *		making it unnecessary to obtain the correct *contents* + *		of the page. + * + *	Implementation: + *		XXX Untested.  Also unused.  Eventually, this technology + *		could be used in vm_fault_copy() to advantage. + */ +vm_fault_return_t vm_fault_page_overwrite( +	vm_object_t	dst_object, +	vm_offset_t	dst_offset, +	vm_page_t	*result_page)	/* OUT */ +{ +	vm_page_t	dst_page; + +#define	interruptible	FALSE	/* XXX */ + +	while (TRUE) { +		/* +		 *	Look for a page at this offset +		 */ + +		while ((dst_page = vm_page_lookup(dst_object, dst_offset)) +				 == VM_PAGE_NULL) { +			/* +			 *	No page, no problem... just allocate one. +			 */ + +			dst_page = vm_page_alloc(dst_object, dst_offset); +			if (dst_page == VM_PAGE_NULL) { +				vm_object_unlock(dst_object); +				VM_PAGE_WAIT((void (*)()) 0); +				vm_object_lock(dst_object); +				continue; +			} + +			/* +			 *	Pretend that the memory manager +			 *	write-protected the page. +			 * +			 *	Note that we will be asking for write +			 *	permission without asking for the data +			 *	first. +			 */ + +			dst_page->overwriting = TRUE; +			dst_page->page_lock = VM_PROT_WRITE; +			dst_page->absent = TRUE; +			dst_object->absent_count++; + +			break; + +			/* +			 *	When we bail out, we might have to throw +			 *	away the page created here. +			 */ + +#define	DISCARD_PAGE						\ +	MACRO_BEGIN						\ +	vm_object_lock(dst_object);				\ +	dst_page = vm_page_lookup(dst_object, dst_offset);	\ +	if ((dst_page != VM_PAGE_NULL) && dst_page->overwriting) \ +	   	VM_PAGE_FREE(dst_page);				\ +	vm_object_unlock(dst_object);				\ +	MACRO_END +		} + +		/* +		 *	If the page is write-protected... +		 */ + +		if (dst_page->page_lock & VM_PROT_WRITE) { +			/* +			 *	... and an unlock request hasn't been sent +			 */ + +			if ( ! (dst_page->unlock_request & VM_PROT_WRITE)) { +				vm_prot_t	u; +				kern_return_t	rc; + +				/* +				 *	... then send one now. +				 */ + +				if (!dst_object->pager_ready) { +					vm_object_assert_wait(dst_object, +						VM_OBJECT_EVENT_PAGER_READY, +						interruptible); +					vm_object_unlock(dst_object); +					thread_block((void (*)()) 0); +					if (current_thread()->wait_result != +					    THREAD_AWAKENED) { +						DISCARD_PAGE; +						return(VM_FAULT_INTERRUPTED); +					} +					continue; +				} + +				u = dst_page->unlock_request |= VM_PROT_WRITE; +				vm_object_unlock(dst_object); + +				if ((rc = memory_object_data_unlock( +						dst_object->pager, +						dst_object->pager_request, +						dst_offset + dst_object->paging_offset, +						PAGE_SIZE, +						u)) != KERN_SUCCESS) { +				     	printf("vm_object_overwrite: memory_object_data_unlock failed\n"); +					DISCARD_PAGE; +					return((rc == MACH_SEND_INTERRUPTED) ? 
+						VM_FAULT_INTERRUPTED : +						VM_FAULT_MEMORY_ERROR); +				} +				vm_object_lock(dst_object); +				continue; +			} + +			/* ... fall through to wait below */ +		} else { +			/* +			 *	If the page isn't being used for other +			 *	purposes, then we're done. +			 */ +			if ( ! (dst_page->busy || dst_page->absent || dst_page->error) ) +				break; +		} + +		PAGE_ASSERT_WAIT(dst_page, interruptible); +		vm_object_unlock(dst_object); +		thread_block((void (*)()) 0); +		if (current_thread()->wait_result != THREAD_AWAKENED) { +			DISCARD_PAGE; +			return(VM_FAULT_INTERRUPTED); +		} +	} + +	*result_page = dst_page; +	return(VM_FAULT_SUCCESS); + +#undef	interruptible +#undef	DISCARD_PAGE +} + +#endif	/* notdef */ diff --git a/vm/vm_fault.h b/vm/vm_fault.h new file mode 100644 index 0000000..ae692b1 --- /dev/null +++ b/vm/vm_fault.h @@ -0,0 +1,81 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	File:	vm/vm_fault.h + * + *	Page fault handling module declarations. + */ + +#ifndef	_VM_VM_FAULT_H_ +#define _VM_VM_FAULT_H_ + +#include <mach/kern_return.h> +#include <mach/vm_prot.h> +#include <vm/vm_map.h> +#include <vm/vm_types.h> + +/* + *	Page fault handling based on vm_object only. + */ + +typedef	kern_return_t	vm_fault_return_t; +#define VM_FAULT_SUCCESS		0 +#define VM_FAULT_RETRY			1 +#define VM_FAULT_INTERRUPTED		2 +#define VM_FAULT_MEMORY_SHORTAGE 	3 +#define VM_FAULT_FICTITIOUS_SHORTAGE 	4 +#define VM_FAULT_MEMORY_ERROR		5 + +typedef void (*vm_fault_continuation_t)(kern_return_t); +#define vm_fault_no_continuation ((vm_fault_continuation_t)0) + +extern void vm_fault_init(void); +extern vm_fault_return_t vm_fault_page(vm_object_t, vm_offset_t, vm_prot_t, +				       boolean_t, boolean_t, vm_prot_t *, +				       vm_page_t *, vm_page_t *, boolean_t, +				       continuation_t); + +extern void		vm_fault_cleanup(vm_object_t, vm_page_t); +/* + *	Page fault handling based on vm_map (or entries therein) + */ + +extern kern_return_t	vm_fault(vm_map_t, vm_offset_t, vm_prot_t, boolean_t, +				 boolean_t, vm_fault_continuation_t); +extern void		vm_fault_wire(vm_map_t, vm_map_entry_t); +extern void		vm_fault_unwire(vm_map_t, vm_map_entry_t); + +/* Copy pages from one object to another.  
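The VM_FAULT_* codes declared in this header form a small protocol: callers such as vm_fault_copy() loop, retrying on VM_FAULT_RETRY or after waiting out a memory or fictitious-page shortage, and only propagate the hard errors. The stand-alone model below mimics that caller-side loop; the enum names, fake_fault_page() and get_page() are simplified stand-ins, not the kernel's declarations.

```c
/* Stand-alone model of the caller-side retry loop around a
 * vm_fault_page()-style routine.  Everything here is a stand-in. */
#include <stdio.h>

enum fault_result {
	FAULT_SUCCESS,
	FAULT_RETRY,              /* transient: just try again       */
	FAULT_MEMORY_SHORTAGE,    /* wait for pages, then try again  */
	FAULT_MEMORY_ERROR        /* hard error: give up             */
};

static int attempts;

static enum fault_result fake_fault_page(void)
{
	/* Succeed on the third attempt to exercise the retry arms. */
	switch (attempts++) {
	case 0:  return FAULT_RETRY;
	case 1:  return FAULT_MEMORY_SHORTAGE;
	default: return FAULT_SUCCESS;
	}
}

static int get_page(void)
{
	for (;;) {
		switch (fake_fault_page()) {
		case FAULT_SUCCESS:
			return 0;
		case FAULT_RETRY:
			continue;
		case FAULT_MEMORY_SHORTAGE:
			/* the kernel would VM_PAGE_WAIT() here */
			continue;
		case FAULT_MEMORY_ERROR:
			return -1;
		}
	}
}

int main(void)
{
	printf("get_page() = %d after %d attempts\n", get_page(), attempts);
	return 0;
}
```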
*/ +extern kern_return_t	vm_fault_copy(vm_object_t, vm_offset_t, vm_size_t *, +				      vm_object_t, vm_offset_t, vm_map_t, +				      vm_map_version_t *, boolean_t); + +kern_return_t vm_fault_wire_fast( +	vm_map_t	map, +	vm_offset_t	va, +	vm_map_entry_t	entry); + +#endif	/* _VM_VM_FAULT_H_ */ diff --git a/vm/vm_init.c b/vm/vm_init.c new file mode 100644 index 0000000..593af11 --- /dev/null +++ b/vm/vm_init.c @@ -0,0 +1,88 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	File:	vm/vm_init.c + *	Author:	Avadis Tevanian, Jr., Michael Wayne Young + *	Date:	1985 + * + *	Initialize the Virtual Memory subsystem. + */ + +#include <mach/machine/vm_types.h> +#include <kern/slab.h> +#include <kern/kalloc.h> +#include <vm/vm_fault.h> +#include <vm/vm_init.h> +#include <vm/vm_object.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> +#include <vm/memory_object.h> +#include <vm/memory_object_proxy.h> + + +/* + *	vm_mem_bootstrap initializes the virtual memory system. + *	This is done only by the first cpu up. + */ + +void vm_mem_bootstrap(void) +{ +	vm_offset_t	start, end; + +	/* +	 *	Initializes resident memory structures. +	 *	From here on, all physical memory is accounted for, +	 *	and we use only virtual addresses. +	 */ + +	vm_page_bootstrap(&start, &end); + +	/* +	 *	Initialize other VM packages +	 */ + +	slab_bootstrap(); +	vm_object_bootstrap(); +	vm_map_init(); +	kmem_init(start, end); +	pmap_init(); +	slab_init(); +	kalloc_init(); +	vm_fault_init(); +	vm_page_module_init(); +	memory_manager_default_init(); +} + +void vm_mem_init(void) +{ +	vm_object_init(); +	memory_object_proxy_init(); +	vm_page_info_all(); +} diff --git a/vm/vm_init.h b/vm/vm_init.h new file mode 100644 index 0000000..42ef48b --- /dev/null +++ b/vm/vm_init.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2013 Free Software Foundation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _VM_VM_INIT_H_ +#define _VM_VM_INIT_H_ + +extern void vm_mem_init(void); +extern void vm_mem_bootstrap(void); + +#endif /* _VM_VM_INIT_H_ */ diff --git a/vm/vm_kern.c b/vm/vm_kern.c new file mode 100644 index 0000000..51223d9 --- /dev/null +++ b/vm/vm_kern.c @@ -0,0 +1,1099 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	File:	vm/vm_kern.c + *	Author:	Avadis Tevanian, Jr., Michael Wayne Young + *	Date:	1985 + * + *	Kernel memory management. + */ + +#include <string.h> + +#include <mach/kern_return.h> +#include <machine/locore.h> +#include <machine/vm_param.h> +#include <kern/assert.h> +#include <kern/debug.h> +#include <kern/lock.h> +#include <kern/slab.h> +#include <kern/thread.h> +#include <kern/printf.h> +#include <vm/pmap.h> +#include <vm/vm_fault.h> +#include <vm/vm_kern.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> + + + +/* + *	Variables exported by this module. + */ + +static struct vm_map	kernel_map_store; +vm_map_t		kernel_map = &kernel_map_store; +vm_map_t	kernel_pageable_map; + +/* + *	projected_buffer_allocate + * + *	Allocate a wired-down buffer shared between kernel and user task.   + *      Fresh, zero-filled memory is allocated. + *      If persistence is false, this buffer can only be deallocated from + *      user task using projected_buffer_deallocate, and deallocation  + *      from user task also deallocates the buffer from the kernel map. + *      projected_buffer_collect is called from vm_map_deallocate to + *      automatically deallocate projected buffers on task_deallocate. + *      Sharing with more than one user task is achieved by using  + *      projected_buffer_map for the second and subsequent tasks. + *      The user is precluded from manipulating the VM entry of this buffer + *      (i.e. changing protection, inheritance or machine attributes). 
+ */ + +kern_return_t +projected_buffer_allocate( +	vm_map_t 	map, +	vm_size_t 	size, +       int 		persistence, +	vm_offset_t 	*kernel_p, +	vm_offset_t 	*user_p, +       vm_prot_t 	protection, +       vm_inherit_t 	inheritance)  /*Currently only VM_INHERIT_NONE supported*/ +{ +	vm_object_t object; +	vm_map_entry_t u_entry, k_entry; +	vm_offset_t addr; +	phys_addr_t physical_addr; +	vm_size_t r_size; +	kern_return_t kr; + +	if (map == VM_MAP_NULL || map == kernel_map) +	  return(KERN_INVALID_ARGUMENT); + +	/* +	 *	Allocate a new object.  +	 */ + +	size = round_page(size); +	object = vm_object_allocate(size); + +	vm_map_lock(kernel_map); +	kr = vm_map_find_entry(kernel_map, &addr, size, (vm_offset_t) 0, +			       VM_OBJECT_NULL, &k_entry); +	if (kr != KERN_SUCCESS) { +	  vm_map_unlock(kernel_map); +	  vm_object_deallocate(object); +	  return kr; +	} + +	k_entry->object.vm_object = object; +	if (!persistence) +	  k_entry->projected_on = (vm_map_entry_t) -1; +              /*Mark entry so as to automatically deallocate it when +                last corresponding user entry is deallocated*/ +	vm_map_unlock(kernel_map); +	*kernel_p = addr; + +	vm_map_lock(map); +	kr = vm_map_find_entry(map, &addr, size, (vm_offset_t) 0, +			       VM_OBJECT_NULL, &u_entry); +	if (kr != KERN_SUCCESS) { +	  vm_map_unlock(map); +	  vm_map_lock(kernel_map); +	  vm_map_entry_delete(kernel_map, k_entry); +	  vm_map_unlock(kernel_map); +	  vm_object_deallocate(object); +	  return kr; +	} + +	u_entry->object.vm_object = object; +	vm_object_reference(object); +	u_entry->projected_on = k_entry; +             /*Creates coupling with kernel mapping of the buffer, and +               also guarantees that user cannot directly manipulate +               buffer VM entry*/ +	u_entry->protection = protection; +	u_entry->max_protection = protection; +	u_entry->inheritance = inheritance; +	vm_map_unlock(map); +       	*user_p = addr; + +	/* +	 *	Allocate wired-down memory in the object, +	 *	and enter it in the kernel pmap. +	 */ +	kmem_alloc_pages(object, 0, +			 *kernel_p, *kernel_p + size, +			 VM_PROT_READ | VM_PROT_WRITE); +	memset((void*) *kernel_p, 0, size);         /*Zero fill*/ + +	/* Set up physical mappings for user pmap */ + +	pmap_pageable(map->pmap, *user_p, *user_p + size, FALSE); +	for (r_size = 0; r_size < size; r_size += PAGE_SIZE) { +	  physical_addr = pmap_extract(kernel_pmap, *kernel_p + r_size); +	  pmap_enter(map->pmap, *user_p + r_size, physical_addr, +		     protection, TRUE); +	} + +	return(KERN_SUCCESS); +} + + +/* + *	projected_buffer_map + * + *	Map an area of kernel memory onto a task's address space. + *      No new memory is allocated; the area must previously exist in the + *      kernel memory map. 
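For context, a kernel-resident driver could use projected_buffer_allocate() to hand a wired, zero-filled buffer to a user task. The sketch below is a hypothetical caller, shown only to make the in/out parameters concrete: share_buffer_with_task(), task_map and BUF_SIZE are invented, error handling is omitted, and the code is only meaningful inside the kernel.

```c
/* Hypothetical in-kernel caller of projected_buffer_allocate();
 * the wrapper, task_map and BUF_SIZE are invented for illustration. */
#define BUF_SIZE  (16 * 1024)

kern_return_t share_buffer_with_task(vm_map_t task_map,
				     vm_offset_t *kaddr,
				     vm_offset_t *uaddr)
{
	/* persistence == 0: the kernel mapping is torn down when the
	 * user side deallocates it (or the task dies), via
	 * projected_buffer_deallocate/projected_buffer_collect. */
	return projected_buffer_allocate(task_map, BUF_SIZE,
					 /* persistence */ 0,
					 kaddr, uaddr,
					 VM_PROT_READ | VM_PROT_WRITE,
					 VM_INHERIT_NONE);
}
```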
+ */ + +kern_return_t +projected_buffer_map( +	vm_map_t 	map, +	vm_offset_t 	kernel_addr, +	vm_size_t 	size, +	vm_offset_t 	*user_p, +       vm_prot_t 	protection, +       vm_inherit_t 	inheritance)  /*Currently only VM_INHERIT_NONE supported*/ +{ +	vm_map_entry_t u_entry, k_entry; +	vm_offset_t user_addr; +	phys_addr_t physical_addr; +	vm_size_t r_size; +	kern_return_t kr; + +	/* +	 *	Find entry in kernel map  +	 */ + +	size = round_page(size); +	if (map == VM_MAP_NULL || map == kernel_map || +	    !vm_map_lookup_entry(kernel_map, kernel_addr, &k_entry) || +	    kernel_addr + size > k_entry->vme_end) +	  return(KERN_INVALID_ARGUMENT); + + +	/* +         *     Create entry in user task +         */ + +	vm_map_lock(map); +	kr = vm_map_find_entry(map, &user_addr, size, (vm_offset_t) 0, +			       VM_OBJECT_NULL, &u_entry); +	if (kr != KERN_SUCCESS) { +	  vm_map_unlock(map); +	  return kr; +	} + +	u_entry->object.vm_object = k_entry->object.vm_object; +	vm_object_reference(k_entry->object.vm_object); +	u_entry->offset = kernel_addr - k_entry->vme_start + k_entry->offset; +	u_entry->projected_on = k_entry; +             /*Creates coupling with kernel mapping of the buffer, and +               also guarantees that user cannot directly manipulate +               buffer VM entry*/ +	u_entry->protection = protection; +	u_entry->max_protection = protection; +	u_entry->inheritance = inheritance; +	u_entry->wired_count = k_entry->wired_count; +	vm_map_unlock(map); +       	*user_p = user_addr; + +	/* Set up physical mappings for user pmap */ + +	pmap_pageable(map->pmap, user_addr, user_addr + size, +		      !k_entry->wired_count); +	for (r_size = 0; r_size < size; r_size += PAGE_SIZE) { +	  physical_addr = pmap_extract(kernel_pmap, kernel_addr + r_size); +	  pmap_enter(map->pmap, user_addr + r_size, physical_addr, +		     protection, k_entry->wired_count); +	} + +	return(KERN_SUCCESS); +} + + +/* + *	projected_buffer_deallocate + * + *	Unmap projected buffer from task's address space. + *      May also unmap buffer from kernel map, if buffer is not + *      persistent and only the kernel reference remains. 
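Both projected_buffer_allocate() and projected_buffer_map() finish with the same page-by-page loop: look up the physical page behind each kernel virtual page and enter it into the user task's pmap at the corresponding offset. The stand-alone sketch below models only that loop shape; lookup_phys() and enter_mapping() are invented stand-ins, not the pmap API.

```c
/* Stand-alone model of the "mirror a kernel range into another pmap"
 * loop.  lookup_phys()/enter_mapping() are invented stand-ins. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096u

static uint64_t lookup_phys(uintptr_t kva)
{
	/* pretend physical memory sits at a fixed offset from the kva */
	return (uint64_t)kva + 0x100000;
}

static void enter_mapping(uintptr_t uva, uint64_t pa)
{
	printf("map user %#lx -> phys %#llx\n",
	       (unsigned long)uva, (unsigned long long)pa);
}

static void mirror_range(uintptr_t kva, uintptr_t uva, size_t size)
{
	size_t off;

	for (off = 0; off < size; off += PAGE_SIZE)
		enter_mapping(uva + off, lookup_phys(kva + off));
}

int main(void)
{
	mirror_range(0x40000000u, 0x10000000u, 3 * PAGE_SIZE);
	return 0;
}
```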
+ */ + +kern_return_t +projected_buffer_deallocate( +     vm_map_t 		map, +     vm_offset_t 	start,  +     vm_offset_t	end) +{ +	vm_map_entry_t entry, k_entry; + +	if (map == VM_MAP_NULL || map == kernel_map) +		return KERN_INVALID_ARGUMENT; + +	vm_map_lock(map); +	if (!vm_map_lookup_entry(map, start, &entry) || +	    end > entry->vme_end || +            /*Check corresponding kernel entry*/ +	    (k_entry = entry->projected_on) == 0) { +	  vm_map_unlock(map); +	  return(KERN_INVALID_ARGUMENT); +	} + +	/*Prepare for deallocation*/ +	if (entry->vme_start < start) +	  _vm_map_clip_start(&map->hdr, entry, start, 1); +	if (entry->vme_end > end) +	  _vm_map_clip_end(&map->hdr, entry, end, 1); +      	if (map->first_free == entry)   /*Adjust first_free hint*/ +	  map->first_free = entry->vme_prev; +	entry->projected_on = 0;        /*Needed to allow deletion*/ +	entry->wired_count = 0;         /*Avoid unwire fault*/ +	vm_map_entry_delete(map, entry); +	vm_map_unlock(map); + +	/*Check if the buffer is not persistent and only the  +          kernel mapping remains, and if so delete it*/ +	vm_map_lock(kernel_map); +	if (k_entry->projected_on == (vm_map_entry_t) -1 && +	    k_entry->object.vm_object->ref_count == 1) { +	  if (kernel_map->first_free == k_entry) +	    kernel_map->first_free = k_entry->vme_prev; +	  k_entry->projected_on = 0;    /*Allow unwire fault*/ +	  vm_map_entry_delete(kernel_map, k_entry); +	} +	vm_map_unlock(kernel_map); +	return(KERN_SUCCESS); +} + + +/* + *	projected_buffer_collect + * + *	Unmap all projected buffers from task's address space. + */ + +kern_return_t +projected_buffer_collect(vm_map_t map) +{ +        vm_map_entry_t entry, next; + +        if (map == VM_MAP_NULL || map == kernel_map) +	  return(KERN_INVALID_ARGUMENT); + +	for (entry = vm_map_first_entry(map); +	     entry != vm_map_to_entry(map); +	     entry = next) { +	  next = entry->vme_next; +	  if (entry->projected_on != 0) +	    projected_buffer_deallocate(map, entry->vme_start, entry->vme_end); +	} +	return(KERN_SUCCESS); +} + + +/* + *	projected_buffer_in_range + * + *	Verifies whether a projected buffer exists in the address range  + *      given. + */ + +boolean_t +projected_buffer_in_range( +       vm_map_t 	map, +       vm_offset_t 	start,  +	vm_offset_t	end) +{ +        vm_map_entry_t entry; + +        if (map == VM_MAP_NULL || map == kernel_map) +	  return(FALSE); + +	/*Find first entry*/ +	if (!vm_map_lookup_entry(map, start, &entry)) +	  entry = entry->vme_next; + +	while (entry != vm_map_to_entry(map) && entry->projected_on == 0 && +	       entry->vme_start <= end) { +	  entry = entry->vme_next; +	} +	return(entry != vm_map_to_entry(map) && entry->vme_start <= end); +} + + +/* + *	kmem_alloc: + * + *	Allocate wired-down memory in the kernel's address map + *	or a submap.  The memory is not zero-filled. + */ + +kern_return_t +kmem_alloc( +	vm_map_t 	map, +	vm_offset_t 	*addrp, +	vm_size_t 	size) +{ +	vm_object_t object; +	vm_map_entry_t entry; +	vm_offset_t addr; +	unsigned int attempts; +	kern_return_t kr; + +	/* +	 *	Allocate a new object.  We must do this before locking +	 *	the map, lest we risk deadlock with the default pager: +	 *		device_read_alloc uses kmem_alloc, +	 *		which tries to allocate an object, +	 *		which uses kmem_alloc_wired to get memory, +	 *		which blocks for pages. +	 *		then the default pager needs to read a block +	 *		to process a memory_object_data_write, +	 *		and device_read_alloc calls kmem_alloc +	 *		and deadlocks on the map lock. 
+	 */ + +	size = round_page(size); +	object = vm_object_allocate(size); + +	attempts = 0; + +retry: +	vm_map_lock(map); +	kr = vm_map_find_entry(map, &addr, size, (vm_offset_t) 0, +			       VM_OBJECT_NULL, &entry); +	if (kr != KERN_SUCCESS) { +		vm_map_unlock(map); + +		if (attempts == 0) { +			attempts++; +			slab_collect(); +			goto retry; +		} + +		printf_once("no more room for kmem_alloc in %p (%s)\n", +			    map, map->name); +		vm_object_deallocate(object); +		return kr; +	} + +	entry->object.vm_object = object; +	entry->offset = 0; + +	/* +	 *	Since we have not given out this address yet, +	 *	it is safe to unlock the map. +	 */ +	vm_map_unlock(map); + +	/* +	 *	Allocate wired-down memory in the kernel_object, +	 *	for this entry, and enter it in the kernel pmap. +	 */ +	kmem_alloc_pages(object, 0, +			 addr, addr + size, +			 VM_PROT_DEFAULT); + +	/* +	 *	Return the memory, not zeroed. +	 */ +	*addrp = addr; +	return KERN_SUCCESS; +} + +/* + *	kmem_valloc: + * + *	Allocate addressing space in the kernel's address map + *	or a submap.  The adressing space does not map anything. + */ + +kern_return_t +kmem_valloc( +	vm_map_t 	map, +	vm_offset_t 	*addrp, +	vm_size_t 	size) +{ +	vm_map_entry_t entry; +	vm_offset_t offset; +	vm_offset_t addr; +	unsigned int attempts; +	kern_return_t kr; + +	/* +	 *	Use the kernel object for wired-down kernel pages. +	 *	Assume that no region of the kernel object is +	 *	referenced more than once.  We want vm_map_find_entry +	 *	to extend an existing entry if possible. +	 */ + +	size = round_page(size); +	attempts = 0; + +retry: +	vm_map_lock(map); +	kr = vm_map_find_entry(map, &addr, size, (vm_offset_t) 0, +			       kernel_object, &entry); +	if (kr != KERN_SUCCESS) { +		vm_map_unlock(map); + +		if (attempts == 0) { +			attempts++; +			slab_collect(); +			goto retry; +		} + +		printf_once("no more room for kmem_valloc in %p (%s)\n", +			    map, map->name); +		return kr; +	} + +	/* +	 *	Since we didn't know where the new region would +	 *	start, we couldn't supply the correct offset into +	 *	the kernel object.  We only initialize the entry +	 *	if we aren't extending an existing entry. +	 */ + +	offset = addr - VM_MIN_KERNEL_ADDRESS; + +	if (entry->object.vm_object == VM_OBJECT_NULL) { +		vm_object_reference(kernel_object); + +		entry->object.vm_object = kernel_object; +		entry->offset = offset; +	} + +	/* +	 *	Since we have not given out this address yet, +	 *	it is safe to unlock the map. +	 */ +	vm_map_unlock(map); + +	/* +	 *	Return the memory, not mapped. +	 */ +	*addrp = addr; +	return KERN_SUCCESS; +} + +/* + *	kmem_alloc_wired: + * + *	Allocate wired-down memory in the kernel's address map + *	or a submap.  The memory is not zero-filled. + * + *	The memory is allocated in the kernel_object. + *	It may not be copied with vm_map_copy. + */ + +kern_return_t +kmem_alloc_wired( +	vm_map_t 	map, +	vm_offset_t 	*addrp, +	vm_size_t 	size) +{ +	vm_offset_t offset; +	vm_offset_t addr; +	kern_return_t kr; + +	kr = kmem_valloc(map, &addr, size); +	if (kr != KERN_SUCCESS) +		return kr; + +	offset = addr - VM_MIN_KERNEL_ADDRESS; + +	/* +	 *	Allocate wired-down memory in the kernel_object, +	 *	for this entry, and enter it in the kernel pmap. +	 */ +	kmem_alloc_pages(kernel_object, offset, +			 addr, addr + size, +			 VM_PROT_DEFAULT); + +	/* +	 *	Return the memory, not zeroed. +	 */ +	*addrp = addr; +	return KERN_SUCCESS; +} + +/* + *	kmem_alloc_aligned: + * + *	Like kmem_alloc_wired, except that the memory is aligned. + *	The size should be a power-of-2. 
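kmem_alloc(), kmem_valloc() and kmem_alloc_aligned() all share the same failure policy: when vm_map_find_entry() fails the first time, poke the slab reclaimer with slab_collect() and retry exactly once before reporting failure. The stand-alone model below captures that policy; the stub allocator, free_slots and collect_caches() are invented.

```c
/* Stand-alone model of the "retry once after reclaiming" policy used
 * by the kmem_alloc family.  The allocator and reclaimer are stubs. */
#include <stdio.h>

static int free_slots;
static int reclaimable = 1;

static int try_alloc(void) { return free_slots > 0 ? (free_slots--, 0) : -1; }

static void collect_caches(void)
{
	free_slots += reclaimable;   /* pretend to reclaim cached memory */
	reclaimable = 0;
}

static int alloc_with_retry(void)
{
	unsigned attempts = 0;

retry:
	if (try_alloc() == 0)
		return 0;

	if (attempts == 0) {
		attempts++;
		collect_caches();    /* slab_collect() in the kernel */
		goto retry;
	}

	return -1;                   /* still no room: report failure */
}

int main(void)
{
	printf("first call:  %d\n", alloc_with_retry());  /* reclaim helps   */
	printf("second call: %d\n", alloc_with_retry());  /* nothing left    */
	return 0;
}
```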
+ */ + +kern_return_t +kmem_alloc_aligned( +	vm_map_t 	map, +	vm_offset_t 	*addrp, +	vm_size_t 	size) +{ +	vm_map_entry_t entry; +	vm_offset_t offset; +	vm_offset_t addr; +	unsigned int attempts; +	kern_return_t kr; + +	if ((size & (size - 1)) != 0) +		panic("kmem_alloc_aligned"); + +	/* +	 *	Use the kernel object for wired-down kernel pages. +	 *	Assume that no region of the kernel object is +	 *	referenced more than once.  We want vm_map_find_entry +	 *	to extend an existing entry if possible. +	 */ + +	size = round_page(size); +	attempts = 0; + +retry: +	vm_map_lock(map); +	kr = vm_map_find_entry(map, &addr, size, size - 1, +			       kernel_object, &entry); +	if (kr != KERN_SUCCESS) { +		vm_map_unlock(map); + +		if (attempts == 0) { +			attempts++; +			slab_collect(); +			goto retry; +		} + +		printf_once("no more room for kmem_alloc_aligned in %p (%s)\n", +			    map, map->name); +		return kr; +	} + +	/* +	 *	Since we didn't know where the new region would +	 *	start, we couldn't supply the correct offset into +	 *	the kernel object.  We only initialize the entry +	 *	if we aren't extending an existing entry. +	 */ + +	offset = addr - VM_MIN_KERNEL_ADDRESS; + +	if (entry->object.vm_object == VM_OBJECT_NULL) { +		vm_object_reference(kernel_object); + +		entry->object.vm_object = kernel_object; +		entry->offset = offset; +	} + +	/* +	 *	Since we have not given out this address yet, +	 *	it is safe to unlock the map. +	 */ +	vm_map_unlock(map); + +	/* +	 *	Allocate wired-down memory in the kernel_object, +	 *	for this entry, and enter it in the kernel pmap. +	 */ +	kmem_alloc_pages(kernel_object, offset, +			 addr, addr + size, +			 VM_PROT_DEFAULT); + +	/* +	 *	Return the memory, not zeroed. +	 */ +	*addrp = addr; +	return KERN_SUCCESS; +} + +/* + * kmem_map_aligned_table: map a table or structure in a virtual memory page + * Align the table initial address with the page initial address. + * + * Parameters: + * phys_address: physical address, the start address of the table. + * size: size of the table. + * mode: access mode. VM_PROT_READ for read, VM_PROT_WRITE for write. + * + * Returns a reference to the virtual address if success, NULL if failure. + */ + +void* +kmem_map_aligned_table( +	phys_addr_t	phys_address, +	vm_size_t	size, +	int		mode) +{ +	vm_offset_t virt_addr; +	kern_return_t ret; +	phys_addr_t into_page = phys_address % PAGE_SIZE; +	phys_addr_t nearest_page = phys_address - into_page; + +	size += into_page; + +	ret = kmem_alloc_wired(kernel_map, &virt_addr, +				round_page(size)); + +	if (ret != KERN_SUCCESS) +		return NULL; + +	(void) pmap_map_bd(virt_addr, nearest_page, +				nearest_page + round_page(size), mode); + +	/* XXX remember mapping somewhere so we can free it? */ + +	return (void *) (virt_addr + into_page); +} + +/* + *	kmem_alloc_pageable: + * + *	Allocate pageable memory in the kernel's address map. 
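The arithmetic in kmem_map_aligned_table() is worth spelling out: the physical start is split into a page-aligned base plus an offset into that page, the offset is added back to the size so the mapping covers the whole table, and the caller gets the mapped virtual base plus that same offset. The stand-alone sketch below just works the numbers; the constants and the local round_page macro are illustrative, and no mapping is performed.

```c
/* Stand-alone check of the offset arithmetic in kmem_map_aligned_table().
 * No mapping is performed; we only compute the numbers. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE     4096u
#define round_page(x) ((((uint64_t)(x)) + PAGE_SIZE - 1) & ~(uint64_t)(PAGE_SIZE - 1))

int main(void)
{
	uint64_t phys = 0x000FE123;                 /* table starts mid-page */
	uint64_t size = 0x200;                      /* 512-byte table        */

	uint64_t into_page    = phys % PAGE_SIZE;   /* 0x123                 */
	uint64_t nearest_page = phys - into_page;   /* 0x000FE000            */
	uint64_t map_size     = round_page(size + into_page);  /* one page   */

	/* pretend the page-aligned mapping landed at this virtual base */
	uint64_t virt_base = 0xC0200000;
	uint64_t table_va  = virt_base + into_page;

	printf("map [%#llx, %#llx) -> table at %#llx (%llu page(s))\n",
	       (unsigned long long)nearest_page,
	       (unsigned long long)(nearest_page + map_size),
	       (unsigned long long)table_va,
	       (unsigned long long)(map_size / PAGE_SIZE));
	return 0;
}
```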
+ */ + +kern_return_t +kmem_alloc_pageable( +	vm_map_t 	map, +	vm_offset_t 	*addrp, +	vm_size_t 	size) +{ +	vm_offset_t addr; +	kern_return_t kr; + +	addr = vm_map_min(map); +	kr = vm_map_enter(map, &addr, round_page(size), +			  (vm_offset_t) 0, TRUE, +			  VM_OBJECT_NULL, (vm_offset_t) 0, FALSE, +			  VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT); +	if (kr != KERN_SUCCESS) { +		printf_once("no more room for kmem_alloc_pageable in %p (%s)\n", +			    map, map->name); +		return kr; +	} + +	*addrp = addr; +	return KERN_SUCCESS; +} + +/* + *	kmem_free: + * + *	Release a region of kernel virtual memory allocated + *	with kmem_alloc, kmem_alloc_wired, or kmem_alloc_pageable, + *	and return the physical pages associated with that region. + */ + +void +kmem_free( +	vm_map_t 	map, +	vm_offset_t 	addr, +	vm_size_t 	size) +{ +	kern_return_t kr; + +	kr = vm_map_remove(map, trunc_page(addr), round_page(addr + size)); +	if (kr != KERN_SUCCESS) +		panic("kmem_free"); +} + +/* + *	Allocate new wired pages in an object. + *	The object is assumed to be mapped into the kernel map or + *	a submap. + */ +void +kmem_alloc_pages( +	vm_object_t	object, +	vm_offset_t	offset, +	vm_offset_t	start,  +	vm_offset_t	end, +	vm_prot_t	protection) +{ +	/* +	 *	Mark the pmap region as not pageable. +	 */ +	pmap_pageable(kernel_pmap, start, end, FALSE); + +	while (start < end) { +	    vm_page_t	mem; + +	    vm_object_lock(object); + +	    /* +	     *	Allocate a page +	     */ +	    while ((mem = vm_page_alloc(object, offset)) +			 == VM_PAGE_NULL) { +		vm_object_unlock(object); +		VM_PAGE_WAIT((void (*)()) 0); +		vm_object_lock(object); +	    } + +	    /* +	     *	Wire it down +	     */ +	    vm_page_lock_queues(); +	    vm_page_wire(mem); +	    vm_page_unlock_queues(); +	    vm_object_unlock(object); + +	    /* +	     *	Enter it in the kernel pmap +	     */ +	    PMAP_ENTER(kernel_pmap, start, mem, +		       protection, TRUE); + +	    vm_object_lock(object); +	    PAGE_WAKEUP_DONE(mem); +	    vm_object_unlock(object); + +	    start += PAGE_SIZE; +	    offset += PAGE_SIZE; +	} +} + +/* + *	Remap wired pages in an object into a new region. + *	The object is assumed to be mapped into the kernel map or + *	a submap. + */ +void +kmem_remap_pages( +	vm_object_t	object, +	vm_offset_t	offset, +	vm_offset_t	start,  +	vm_offset_t	end, +	vm_prot_t	protection) +{ +	/* +	 *	Mark the pmap region as not pageable. +	 */ +	pmap_pageable(kernel_pmap, start, end, FALSE); + +	while (start < end) { +	    vm_page_t	mem; + +	    vm_object_lock(object); + +	    /* +	     *	Find a page +	     */ +	    if ((mem = vm_page_lookup(object, offset)) == VM_PAGE_NULL) +		panic("kmem_remap_pages"); + +	    /* +	     *	Wire it down (again) +	     */ +	    vm_page_lock_queues(); +	    vm_page_wire(mem); +	    vm_page_unlock_queues(); +	    vm_object_unlock(object); + +	    /* +	     *	Enter it in the kernel pmap.  The page isn't busy, +	     *	but this shouldn't be a problem because it is wired. +	     */ +	    PMAP_ENTER(kernel_pmap, start, mem, +		       protection, TRUE); + +	    start += PAGE_SIZE; +	    offset += PAGE_SIZE; +	} +} + +/* + *	kmem_submap: + * + *	Initializes a map to manage a subrange + *	of the kernel virtual address space. 
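kmem_free() hands vm_map_remove() the page-truncated start and page-rounded end, so a release always covers whole pages even when the caller's addr and size are not page aligned. A small stand-alone check of that rounding follows; the trunc_page/round_page macros here are the usual power-of-two definitions, written out locally for illustration.

```c
/* Stand-alone illustration of the trunc_page/round_page bounds that
 * kmem_free() passes to vm_map_remove(). */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE     4096u
#define trunc_page(x) ((uintptr_t)(x) & ~(uintptr_t)(PAGE_SIZE - 1))
#define round_page(x) (((uintptr_t)(x) + PAGE_SIZE - 1) & ~(uintptr_t)(PAGE_SIZE - 1))

int main(void)
{
	uintptr_t addr = 0x5123;                     /* unaligned start */
	size_t    size = 0x1800;                     /* 6 KiB           */

	uintptr_t start = trunc_page(addr);          /* 0x5000 */
	uintptr_t end   = round_page(addr + size);   /* 0x7000 */

	printf("remove [%#lx, %#lx): %lu pages\n",
	       (unsigned long)start, (unsigned long)end,
	       (unsigned long)((end - start) / PAGE_SIZE));
	return 0;
}
```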
+ * + *	Arguments are as follows: + * + *	map		Map to initialize + *	parent		Map to take range from + *	size		Size of range to find + *	min, max	Returned endpoints of map + *	pageable	Can the region be paged + */ + +void +kmem_submap( +	vm_map_t 	map,  +	vm_map_t 	parent, +	vm_offset_t 	*min,  +	vm_offset_t 	*max, +	vm_size_t 	size) +{ +	vm_offset_t addr; +	kern_return_t kr; + +	size = round_page(size); + +	/* +	 *	Need reference on submap object because it is internal +	 *	to the vm_system.  vm_object_enter will never be called +	 *	on it (usual source of reference for vm_map_enter). +	 */ +	vm_object_reference(vm_submap_object); + +	addr = vm_map_min(parent); +	kr = vm_map_enter(parent, &addr, size, +			  (vm_offset_t) 0, TRUE, +			  vm_submap_object, (vm_offset_t) 0, FALSE, +			  VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT); +	if (kr != KERN_SUCCESS) +		panic("kmem_submap"); + +	pmap_reference(vm_map_pmap(parent)); +	vm_map_setup(map, vm_map_pmap(parent), addr, addr + size); +	kr = vm_map_submap(parent, addr, addr + size, map); +	if (kr != KERN_SUCCESS) +		panic("kmem_submap"); + +	*min = addr; +	*max = addr + size; +} + +/* + *	kmem_init: + * + *	Initialize the kernel's virtual memory map, taking + *	into account all memory allocated up to this time. + */ +void kmem_init( +	vm_offset_t	start, +	vm_offset_t	end) +{ +	vm_map_setup(kernel_map, pmap_kernel(), VM_MIN_KERNEL_ADDRESS, end); + +	/* +	 *	Reserve virtual memory allocated up to this time. +	 */ +	if (start != VM_MIN_KERNEL_ADDRESS) { +		kern_return_t rc; +		vm_offset_t addr = VM_MIN_KERNEL_ADDRESS; +		rc = vm_map_enter(kernel_map, +				  &addr, start - VM_MIN_KERNEL_ADDRESS, +				  (vm_offset_t) 0, TRUE, +				  VM_OBJECT_NULL, (vm_offset_t) 0, FALSE, +				  VM_PROT_DEFAULT, VM_PROT_ALL, +				  VM_INHERIT_DEFAULT); +		if (rc) +			panic("vm_map_enter failed (%d)\n", rc); +	} +} + +/* + *	New and improved IO wiring support. + */ + +/* + *	kmem_io_map_copyout: + * + *	Establish temporary mapping in designated map for the memory + *	passed in.  Memory format must be a page_list vm_map_copy. + *	Mapping is READ-ONLY. + */ + +kern_return_t +kmem_io_map_copyout( +     vm_map_t 		map, +     vm_offset_t	*addr,  	/* actual addr of data */ +     vm_offset_t	*alloc_addr,	/* page aligned addr */ +     vm_size_t		*alloc_size,	/* size allocated */ +     vm_map_copy_t	copy, +     vm_size_t		min_size)	/* Do at least this much */ +{ +	vm_offset_t	myaddr, offset; +	vm_size_t	mysize, copy_size; +	kern_return_t	ret; +	vm_page_t	*page_list; +	vm_map_copy_t	new_copy; +	int		i; + +	assert(copy->type == VM_MAP_COPY_PAGE_LIST); +	assert(min_size != 0); + +	/* +	 *	Figure out the size in vm pages. +	 */ +	min_size += copy->offset - trunc_page(copy->offset); +	min_size = round_page(min_size); +	mysize = round_page(copy->offset + copy->size) - +		trunc_page(copy->offset); + +	/* +	 *	If total size is larger than one page list and +	 *	we don't have to do more than one page list, then +	 *	only do one page list.   +	 * +	 * XXX	Could be much smarter about this ... like trimming length +	 * XXX	if we need more than one page list but not all of them. +	 */ + +	copy_size = ptoa(copy->cpy_npages); +	if (mysize > copy_size && copy_size > min_size) +		mysize = copy_size; + +	/* +	 *	Allocate some address space in the map (must be kernel +	 *	space). 
+	 */ +	myaddr = vm_map_min(map); +	ret = vm_map_enter(map, &myaddr, mysize, +			  (vm_offset_t) 0, TRUE, +			  VM_OBJECT_NULL, (vm_offset_t) 0, FALSE, +			  VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT); + +	if (ret != KERN_SUCCESS) +		return(ret); + +	/* +	 *	Tell the pmap module that this will be wired, and +	 *	enter the mappings. +	 */ +	pmap_pageable(vm_map_pmap(map), myaddr, myaddr + mysize, TRUE); + +	*addr = myaddr + (copy->offset - trunc_page(copy->offset)); +	*alloc_addr = myaddr; +	*alloc_size = mysize; + +	offset = myaddr; +	page_list = ©->cpy_page_list[0]; +	while (TRUE) { +		for ( i = 0; i < copy->cpy_npages; i++, offset += PAGE_SIZE) { +			PMAP_ENTER(vm_map_pmap(map), offset, *page_list, +				   VM_PROT_READ, TRUE); +			page_list++; +		} + +		if (offset == (myaddr + mysize)) +			break; + +		/* +		 *	Onward to the next page_list.  The extend_cont +		 *	leaves the current page list's pages alone;  +		 *	they'll be cleaned up at discard.  Reset this +		 *	copy's continuation to discard the next one. +		 */ +		vm_map_copy_invoke_extend_cont(copy, &new_copy, &ret); + +		if (ret != KERN_SUCCESS) { +			kmem_io_map_deallocate(map, myaddr, mysize); +			return(ret); +		} +		copy->cpy_cont = vm_map_copy_discard_cont; +		copy->cpy_cont_args = (vm_map_copyin_args_t)new_copy; +		copy = new_copy; +		page_list = ©->cpy_page_list[0]; +	} + +	return(ret); +} + +/* + *	kmem_io_map_deallocate: + * + *	Get rid of the mapping established by kmem_io_map_copyout. + *	Assumes that addr and size have been rounded to page boundaries. + *	(e.g., the alloc_addr and alloc_size returned by kmem_io_map_copyout) + */ + +void +kmem_io_map_deallocate( +	vm_map_t	map, +	vm_offset_t	addr, +	vm_size_t	size) +{ +	/* +	 *	Remove the mappings.  The pmap_remove is needed. +	 */ +	 +	pmap_remove(vm_map_pmap(map), addr, addr + size); +	vm_map_remove(map, addr, addr + size); +} + +/* + *	Routine:	copyinmap + *	Purpose: + *		Like copyin, except that fromaddr is an address + *		in the specified VM map.  This implementation + *		is incomplete; it handles the current user map + *		and the kernel map/submaps. + */ + +int copyinmap( +	vm_map_t 	map, +	char 		*fromaddr,  +	char		*toaddr, +	int 		length) +{ +	if (vm_map_pmap(map) == kernel_pmap) { +		/* assume a correct copy */ +		memcpy(toaddr, fromaddr, length); +		return 0; +	} + +	if (current_map() == map) +		return copyin( fromaddr, toaddr, length); + +	return 1; +} + +/* + *	Routine:	copyoutmap + *	Purpose: + *		Like copyout, except that toaddr is an address + *		in the specified VM map.  This implementation + *		is incomplete; it handles the current user map + *		and the kernel map/submaps. + */ + +int copyoutmap( +	vm_map_t map, +	char 	*fromaddr,  +	char	*toaddr, +	int 	length) +{ +	if (vm_map_pmap(map) == kernel_pmap) { +		/* assume a correct copy */ +		memcpy(toaddr, fromaddr, length); +		return 0; +	} + +	if (current_map() == map) +		return copyout(fromaddr, toaddr, length); + +	return 1; +} diff --git a/vm/vm_kern.h b/vm/vm_kern.h new file mode 100644 index 0000000..13115ff --- /dev/null +++ b/vm/vm_kern.h @@ -0,0 +1,100 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University + * All Rights Reserved. 
+ * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	File:	vm/vm_kern.h + *	Author:	Avadis Tevanian, Jr., Michael Wayne Young + *	Date:	1985 + * + *	Kernel memory management definitions. + */ + +#ifndef	_VM_VM_KERN_H_ +#define _VM_VM_KERN_H_ + +#include <mach/kern_return.h> +#include <vm/vm_map.h> + +extern kern_return_t    projected_buffer_allocate(vm_map_t, vm_size_t, int, +						  vm_offset_t *, vm_offset_t *, +						  vm_prot_t, vm_inherit_t); +extern kern_return_t    projected_buffer_deallocate(vm_map_t, vm_offset_t, +						    vm_offset_t); +extern kern_return_t    projected_buffer_map(vm_map_t, vm_offset_t, vm_size_t, +					     vm_offset_t *, vm_prot_t, +					     vm_inherit_t); +extern kern_return_t    projected_buffer_collect(vm_map_t); + +extern void		kmem_init(vm_offset_t, vm_offset_t); + +extern kern_return_t	kmem_alloc(vm_map_t, vm_offset_t *, vm_size_t); +extern kern_return_t	kmem_alloc_pageable(vm_map_t, vm_offset_t *, +					    vm_size_t); +extern kern_return_t	kmem_valloc(vm_map_t, vm_offset_t *, vm_size_t); +extern kern_return_t	kmem_alloc_wired(vm_map_t, vm_offset_t *, vm_size_t); +extern kern_return_t	kmem_alloc_aligned(vm_map_t, vm_offset_t *, vm_size_t); +extern void*		kmem_map_aligned_table(phys_addr_t, vm_size_t, int); + +extern void		kmem_free(vm_map_t, vm_offset_t, vm_size_t); + +extern void		kmem_submap(vm_map_t, vm_map_t, vm_offset_t *, +				    vm_offset_t *, vm_size_t); + +extern kern_return_t	kmem_io_map_copyout(vm_map_t, vm_offset_t *, +					    vm_offset_t *, vm_size_t *, +					    vm_map_copy_t, vm_size_t); +extern void		kmem_io_map_deallocate(vm_map_t, vm_offset_t, +					       vm_size_t); + +extern int +copyinmap (vm_map_t map, char *fromaddr, char *toaddr, int length); + +extern int +copyoutmap (vm_map_t map, char *fromaddr, char *toaddr, int length); + +extern vm_map_t	kernel_map; +extern vm_map_t	kernel_pageable_map; +extern vm_map_t ipc_kernel_map; + +extern boolean_t projected_buffer_in_range( +        vm_map_t map, +        vm_offset_t start, +		vm_offset_t end); + +extern void kmem_alloc_pages( +	vm_object_t	object, +	vm_offset_t	offset, +	vm_offset_t	start, +	vm_offset_t	end, +	vm_prot_t	protection); + +extern void kmem_remap_pages( +	vm_object_t	object, +	vm_offset_t	offset, +	vm_offset_t	start, +	vm_offset_t	end, +	vm_prot_t	protection); + +#endif	/* _VM_VM_KERN_H_ */ diff --git a/vm/vm_map.c b/vm/vm_map.c new file mode 100644 index 0000000..e454bb2 --- /dev/null +++ b/vm/vm_map.c @@ -0,0 +1,5237 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. 
+ * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	File:	vm/vm_map.c + *	Author:	Avadis Tevanian, Jr., Michael Wayne Young + *	Date:	1985 + * + *	Virtual memory mapping module. + */ + +#include <kern/printf.h> +#include <mach/kern_return.h> +#include <mach/port.h> +#include <mach/vm_attributes.h> +#include <mach/vm_param.h> +#include <mach/vm_wire.h> +#include <kern/assert.h> +#include <kern/debug.h> +#include <kern/kalloc.h> +#include <kern/mach.server.h> +#include <kern/list.h> +#include <kern/rbtree.h> +#include <kern/slab.h> +#include <kern/mach4.server.h> +#include <vm/pmap.h> +#include <vm/vm_fault.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_resident.h> +#include <vm/vm_kern.h> +#include <vm/memory_object_proxy.h> +#include <ipc/ipc_port.h> +#include <string.h> + +#if	MACH_KDB +#include <ddb/db_output.h> +#include <vm/vm_print.h> +#endif	/* MACH_KDB */ + +/* + * Macros to copy a vm_map_entry. We must be careful to correctly + * manage the wired page count. vm_map_entry_copy() creates a new + * map entry to the same memory - the wired count in the new entry + * must be set to zero. vm_map_entry_copy_full() creates a new + * entry that is identical to the old entry.  This preserves the + * wire count; it's used for map splitting and cache changing in + * vm_map_copyout. + */ +#define vm_map_entry_copy(NEW,OLD)			\ +MACRO_BEGIN						\ +                *(NEW) = *(OLD);			\ +                (NEW)->is_shared = FALSE;		\ +                (NEW)->needs_wakeup = FALSE;		\ +                (NEW)->in_transition = FALSE;		\ +                (NEW)->wired_count = 0;			\ +                (NEW)->wired_access = VM_PROT_NONE;	\ +MACRO_END + +#define vm_map_entry_copy_full(NEW,OLD)        (*(NEW) = *(OLD)) + +/* + *	Virtual memory maps provide for the mapping, protection, + *	and sharing of virtual memory objects.  In addition, + *	this module provides for an efficient virtual copy of + *	memory from one map to another. + * + *	Synchronization is required prior to most operations. + * + *	Maps consist of an ordered doubly-linked list of simple + *	entries; a hint and a red-black tree are used to speed up lookups. + * + *	Sharing maps have been deleted from this version of Mach. + *	All shared objects are now mapped directly into the respective + *	maps.  This requires a change in the copy on write strategy; + *	the asymmetric (delayed) strategy is used for shared temporary + *	objects instead of the symmetric (shadow) strategy.  
This is + *	selected by the (new) use_shared_copy bit in the object.  See + *	vm_object_copy_temporary in vm_object.c for details.  All maps + *	are now "top level" maps (either task map, kernel map or submap + *	of the kernel map). + * + *	Since portions of maps are specified by start/end addresses, + *	which may not align with existing map entries, all + *	routines merely "clip" entries to these start/end values. + *	[That is, an entry is split into two, bordering at a + *	start or end value.]  Note that these clippings may not + *	always be necessary (as the two resulting entries are then + *	not changed); however, the clipping is done for convenience. + *	The entries can later be "glued back together" (coalesced). + * + *	The symmetric (shadow) copy strategy implements virtual copy + *	by copying VM object references from one map to + *	another, and then marking both regions as copy-on-write. + *	It is important to note that only one writeable reference + *	to a VM object region exists in any map when this strategy + *	is used -- this means that shadow object creation can be + *	delayed until a write operation occurs.  The asymmetric (delayed) + *	strategy allows multiple maps to have writeable references to + *	the same region of a vm object, and hence cannot delay creating + *	its copy objects.  See vm_object_copy_temporary() in vm_object.c. + *	Copying of permanent objects is completely different; see + *	vm_object_copy_strategically() in vm_object.c. + */ + +struct kmem_cache    vm_map_cache;		/* cache for vm_map structures */ +struct kmem_cache    vm_map_entry_cache;	/* cache for vm_map_entry structures */ +struct kmem_cache    vm_map_copy_cache; 	/* cache for vm_map_copy structures */ + +/* + *	Placeholder object for submap operations.  This object is dropped + *	into the range by a call to vm_map_find, and removed when + *	vm_map_submap creates the submap. + */ + +static struct vm_object	vm_submap_object_store; +vm_object_t		vm_submap_object = &vm_submap_object_store; + +/* + *	vm_map_init: + * + *	Initialize the vm_map module.  Must be called before + *	any other vm_map routines. + * + *	Map and entry structures are allocated from caches -- we must + *	initialize those caches. + * + *	There are two caches of interest: + * + *	vm_map_cache:		used to allocate maps. + *	vm_map_entry_cache:	used to allocate map entries. + * + *	We make sure the map entry cache allocates memory directly from the + *	physical allocator to avoid recursion with this module. + */ + +void vm_map_init(void) +{ +	kmem_cache_init(&vm_map_cache, "vm_map", sizeof(struct vm_map), 0, +			NULL, 0); +	kmem_cache_init(&vm_map_entry_cache, "vm_map_entry", +			sizeof(struct vm_map_entry), 0, NULL, +			KMEM_CACHE_NOOFFSLAB | KMEM_CACHE_PHYSMEM); +	kmem_cache_init(&vm_map_copy_cache, "vm_map_copy", +			sizeof(struct vm_map_copy), 0, NULL, 0); + +	/* +	 *	Submap object is initialized by vm_object_init. 
+	 */ +} + +void vm_map_setup( +	vm_map_t	map, +	pmap_t		pmap, +	vm_offset_t	min,  +	vm_offset_t	max) +{ +	vm_map_first_entry(map) = vm_map_to_entry(map); +	vm_map_last_entry(map)  = vm_map_to_entry(map); +	map->hdr.nentries = 0; +	rbtree_init(&map->hdr.tree); +	rbtree_init(&map->hdr.gap_tree); + +	map->size = 0; +	map->size_wired = 0; +	map->ref_count = 1; +	map->pmap = pmap; +	map->min_offset = min; +	map->max_offset = max; +	map->wiring_required = FALSE; +	map->wait_for_space = FALSE; +	map->first_free = vm_map_to_entry(map); +	map->hint = vm_map_to_entry(map); +	map->name = NULL; +	vm_map_lock_init(map); +	simple_lock_init(&map->ref_lock); +	simple_lock_init(&map->hint_lock); +} + +/* + *	vm_map_create: + * + *	Creates and returns a new empty VM map with + *	the given physical map structure, and having + *	the given lower and upper address bounds. + */ +vm_map_t vm_map_create( +	pmap_t		pmap, +	vm_offset_t	min,  +	vm_offset_t	max) +{ +	vm_map_t	result; + +	result = (vm_map_t) kmem_cache_alloc(&vm_map_cache); +	if (result == VM_MAP_NULL) +		return VM_MAP_NULL; + +	vm_map_setup(result, pmap, min, max); + +	return(result); +} + +void vm_map_lock(struct vm_map *map) +{ +	lock_write(&map->lock); + +	/* +	 *	XXX Memory allocation may occur while a map is locked, +	 *	for example when clipping entries. If the system is running +	 *	low on memory, allocating may block until pages are +	 *	available. But if a map used by the default pager is +	 *	kept locked, a deadlock occurs. +	 * +	 *	This workaround temporarily elevates the current thread +	 *	VM privileges to avoid that particular deadlock, and does +	 *	so regardless of the map for convenience, and because it's +	 *	currently impossible to predict which map the default pager +	 *	may depend on. +	 * +	 *	This workaround isn't reliable, and only makes exhaustion +	 *	less likely. In particular pageout may cause lots of data +	 *	to be passed between the kernel and the pagers, often +	 *	in the form of large copy maps. Making the minimum +	 *	number of pages depend on the total number of pages +	 *	should make exhaustion even less likely. +	 */ + +	if (current_thread()) { +		current_thread()->vm_privilege++; +		assert(current_thread()->vm_privilege != 0); +	} + +	map->timestamp++; +} + +void vm_map_unlock(struct vm_map *map) +{ +	if (current_thread()) { +		current_thread()->vm_privilege--; +	} + +	lock_write_done(&map->lock); +} + +/* + *	vm_map_entry_create:	[ internal use only ] + * + *	Allocates a VM map entry for insertion in the + *	given map (or map copy).  No fields are filled. + */ +#define	vm_map_entry_create(map) \ +	    _vm_map_entry_create(&(map)->hdr) + +#define	vm_map_copy_entry_create(copy) \ +	    _vm_map_entry_create(&(copy)->cpy_hdr) + +static vm_map_entry_t +_vm_map_entry_create(const struct vm_map_header *map_header) +{ +	vm_map_entry_t	entry; + +	entry = (vm_map_entry_t) kmem_cache_alloc(&vm_map_entry_cache); +	if (entry == VM_MAP_ENTRY_NULL) +		panic("vm_map_entry_create"); + +	return(entry); +} + +/* + *	vm_map_entry_dispose:	[ internal use only ] + * + *	Inverse of vm_map_entry_create. 
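The vm_map_lock()/vm_map_unlock() pair above does two extra things besides taking the write lock: it bumps the map timestamp, which vm_map_verify()-style callers use to detect that the map changed behind them, and it temporarily raises the calling thread's vm_privilege so that allocations made while holding a map lock cannot deadlock against the default pager. The stand-alone model below shows only that wrapper shape; the thread and map structs, and the single "current" thread, are toys.

```c
/* Stand-alone model of a lock wrapper that also bumps a change
 * timestamp and a per-thread privilege counter.  All types are toys. */
#include <assert.h>
#include <stdio.h>

struct thread { unsigned vm_privilege; };
struct map    { int locked; unsigned timestamp; };

static struct thread cur;                 /* stands in for current_thread() */

static void map_lock(struct map *m)
{
	assert(!m->locked);
	m->locked = 1;

	cur.vm_privilege++;               /* may dip into reserved pages */
	assert(cur.vm_privilege != 0);    /* guard against wrap-around   */

	m->timestamp++;                   /* writers invalidate lookups  */
}

static void map_unlock(struct map *m)
{
	cur.vm_privilege--;
	m->locked = 0;
}

int main(void)
{
	struct map m = { 0, 0 };
	unsigned seen = m.timestamp;      /* as a verify-style caller would */

	map_lock(&m);
	map_unlock(&m);

	printf("map changed since lookup: %s\n",
	       seen == m.timestamp ? "no" : "yes");
	return 0;
}
```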
+ */ +#define	vm_map_entry_dispose(map, entry) \ +	_vm_map_entry_dispose(&(map)->hdr, (entry)) + +#define	vm_map_copy_entry_dispose(map, entry) \ +	_vm_map_entry_dispose(&(copy)->cpy_hdr, (entry)) + +static void +_vm_map_entry_dispose(const struct vm_map_header *map_header, +		vm_map_entry_t entry) +{ +	(void)map_header; + +	kmem_cache_free(&vm_map_entry_cache, (vm_offset_t) entry); +} + +/* + *	Red-black tree lookup/insert comparison functions + */ +static inline int vm_map_entry_cmp_lookup(vm_offset_t addr, +                                          const struct rbtree_node *node) +{ +	struct vm_map_entry *entry; + +	entry = rbtree_entry(node, struct vm_map_entry, tree_node); + +	if (addr < entry->vme_start) +		return -1; +	else if (addr < entry->vme_end) +		return 0; +	else +		return 1; +} + +static inline int vm_map_entry_cmp_insert(const struct rbtree_node *a, +                                          const struct rbtree_node *b) +{ +	struct vm_map_entry *entry; + +	entry = rbtree_entry(a, struct vm_map_entry, tree_node); +	return vm_map_entry_cmp_lookup(entry->vme_start, b); +} + +/* + *	Gap management functions + */ +static inline int vm_map_entry_gap_cmp_lookup(vm_size_t gap_size, +					      const struct rbtree_node *node) +{ +	struct vm_map_entry *entry; + +	entry = rbtree_entry(node, struct vm_map_entry, gap_node); + +	if (gap_size < entry->gap_size) +		return -1; +	else if (gap_size == entry->gap_size) +		return 0; +	else +		return 1; +} + +static inline int vm_map_entry_gap_cmp_insert(const struct rbtree_node *a, +					      const struct rbtree_node *b) +{ +	struct vm_map_entry *entry; + +	entry = rbtree_entry(a, struct vm_map_entry, gap_node); +	return vm_map_entry_gap_cmp_lookup(entry->gap_size, b); +} + +static int +vm_map_gap_valid(struct vm_map_header *hdr, struct vm_map_entry *entry) +{ +	return entry != (struct vm_map_entry *)&hdr->links; +} + +static void +vm_map_gap_compute(struct vm_map_header *hdr, struct vm_map_entry *entry) +{ +	struct vm_map_entry *next; + +	next = entry->vme_next; + +	if (vm_map_gap_valid(hdr, next)) { +		entry->gap_size = next->vme_start - entry->vme_end; +	} else { +		entry->gap_size = hdr->vme_end - entry->vme_end; +	} +} + +static void +vm_map_gap_insert_single(struct vm_map_header *hdr, struct vm_map_entry *entry) +{ +	struct vm_map_entry *tmp; +	struct rbtree_node *node; +	unsigned long slot; + +	if (!vm_map_gap_valid(hdr, entry)) { +		return; +	} + +	vm_map_gap_compute(hdr, entry); + +	if (entry->gap_size == 0) { +		return; +	} + +	node = rbtree_lookup_slot(&hdr->gap_tree, entry->gap_size, +				  vm_map_entry_gap_cmp_lookup, slot); + +	if (node == NULL) { +		rbtree_insert_slot(&hdr->gap_tree, slot, &entry->gap_node); +		list_init(&entry->gap_list); +		entry->in_gap_tree = 1; +	} else { +		tmp = rbtree_entry(node, struct vm_map_entry, gap_node); +		list_insert_tail(&tmp->gap_list, &entry->gap_list); +		entry->in_gap_tree = 0; +	} +} + +static void +vm_map_gap_remove_single(struct vm_map_header *hdr, struct vm_map_entry *entry) +{ +	struct vm_map_entry *tmp; + +	if (!vm_map_gap_valid(hdr, entry)) { +		return; +	} + +	if (entry->gap_size == 0) { +		return; +	} + +	if (!entry->in_gap_tree) { +		list_remove(&entry->gap_list); +		return; +	} + +	rbtree_remove(&hdr->gap_tree, &entry->gap_node); + +	if (list_empty(&entry->gap_list)) { +		return; +	} + +	tmp = list_first_entry(&entry->gap_list, struct vm_map_entry, gap_list); +	assert(tmp->gap_size == entry->gap_size); +	list_remove(&tmp->gap_list); +	list_set_head(&tmp->gap_list, 
&entry->gap_list); +	assert(!tmp->in_gap_tree); +	rbtree_insert(&hdr->gap_tree, &tmp->gap_node, +		      vm_map_entry_gap_cmp_insert); +	tmp->in_gap_tree = 1; +} + +static void +vm_map_gap_update(struct vm_map_header *hdr, struct vm_map_entry *entry) +{ +	vm_map_gap_remove_single(hdr, entry); +	vm_map_gap_insert_single(hdr, entry); +} + +static void +vm_map_gap_insert(struct vm_map_header *hdr, struct vm_map_entry *entry) +{ +	vm_map_gap_remove_single(hdr, entry->vme_prev); +	vm_map_gap_insert_single(hdr, entry->vme_prev); +	vm_map_gap_insert_single(hdr, entry); +} + +static void +vm_map_gap_remove(struct vm_map_header *hdr, struct vm_map_entry *entry) +{ +	vm_map_gap_remove_single(hdr, entry); +	vm_map_gap_remove_single(hdr, entry->vme_prev); +	vm_map_gap_insert_single(hdr, entry->vme_prev); +} + +/* + *	vm_map_entry_{un,}link: + * + *	Insert/remove entries from maps (or map copies). + * + *	The start and end addresses of the entries must be properly set + *	before using these macros. + */ +#define vm_map_entry_link(map, after_where, entry)	\ +	_vm_map_entry_link(&(map)->hdr, after_where, entry, 1) + +#define vm_map_copy_entry_link(copy, after_where, entry)	\ +	_vm_map_entry_link(&(copy)->cpy_hdr, after_where, entry, 0) + +#define _vm_map_entry_link(hdr, after_where, entry, link_gap)	\ +	MACRO_BEGIN					\ +	(hdr)->nentries++;				\ +	(entry)->vme_prev = (after_where);		\ +	(entry)->vme_next = (after_where)->vme_next;	\ +	(entry)->vme_prev->vme_next =			\ +	 (entry)->vme_next->vme_prev = (entry);		\ +	rbtree_insert(&(hdr)->tree, &(entry)->tree_node,	\ +		      vm_map_entry_cmp_insert);		\ +	if (link_gap)					\ +		vm_map_gap_insert((hdr), (entry));	\ +	MACRO_END + +#define vm_map_entry_unlink(map, entry)			\ +	_vm_map_entry_unlink(&(map)->hdr, entry, 1) + +#define vm_map_copy_entry_unlink(copy, entry)			\ +	_vm_map_entry_unlink(&(copy)->cpy_hdr, entry, 0) + +#define _vm_map_entry_unlink(hdr, entry, unlink_gap)	\ +	MACRO_BEGIN					\ +	(hdr)->nentries--;				\ +	(entry)->vme_next->vme_prev = (entry)->vme_prev; \ +	(entry)->vme_prev->vme_next = (entry)->vme_next; \ +	rbtree_remove(&(hdr)->tree, &(entry)->tree_node);	\ +	if (unlink_gap)					\ +		vm_map_gap_remove((hdr), (entry));	\ +	MACRO_END + +/* + *	vm_map_reference: + * + *	Creates another valid reference to the given map. + * + */ +void vm_map_reference(vm_map_t map) +{ +	if (map == VM_MAP_NULL) +		return; + +	simple_lock(&map->ref_lock); +	map->ref_count++; +	simple_unlock(&map->ref_lock); +} + +/* + *	vm_map_deallocate: + * + *	Removes a reference from the specified map, + *	destroying it if no references remain. + *	The map should not be locked. + */ +void vm_map_deallocate(vm_map_t map) +{ +	int		c; + +	if (map == VM_MAP_NULL) +		return; + +	simple_lock(&map->ref_lock); +	c = --map->ref_count; +	simple_unlock(&map->ref_lock); + +	if (c > 0) { +		return; +	} + +	projected_buffer_collect(map); +	(void) vm_map_delete(map, map->min_offset, map->max_offset); + +	pmap_destroy(map->pmap); + +	kmem_cache_free(&vm_map_cache, (vm_offset_t) map); +} + +/* + *	SAVE_HINT: + * + *	Saves the specified entry as the hint for + *	future lookups.  Performs necessary interlocks. + */ +#define	SAVE_HINT(map,value) \ +		simple_lock(&(map)->hint_lock); \ +		(map)->hint = (value); \ +		simple_unlock(&(map)->hint_lock); + +/* + *	vm_map_lookup_entry:	[ internal use only ] + * + *	Finds the map entry containing (or + *	immediately preceding) the specified address + *	in the given map; the entry is returned + *	in the "entry" parameter.  
The boolean + *	result indicates whether the address is + *	actually contained in the map. + */ +boolean_t vm_map_lookup_entry( +	vm_map_t	map, +	vm_offset_t	address, +	vm_map_entry_t	*entry)		/* OUT */ +{ +	struct rbtree_node	*node; +	vm_map_entry_t		hint; + +	/* +	 *	First, make a quick check to see if we are already +	 *	looking at the entry we want (which is often the case). +	 */ + +	simple_lock(&map->hint_lock); +	hint = map->hint; +	simple_unlock(&map->hint_lock); + +	if ((hint != vm_map_to_entry(map)) && (address >= hint->vme_start)) { +		if (address < hint->vme_end) { +			*entry = hint; +			return(TRUE); +		} else { +			vm_map_entry_t next = hint->vme_next; + +			if ((next == vm_map_to_entry(map)) +			    || (address < next->vme_start)) { +				*entry = hint; +				return(FALSE); +			} +		} +	} + +	/* +	 *	If the hint didn't help, use the red-black tree. +	 */ + +	node = rbtree_lookup_nearest(&map->hdr.tree, address, +				     vm_map_entry_cmp_lookup, RBTREE_LEFT); + +	if (node == NULL) { +		*entry = vm_map_to_entry(map); +		SAVE_HINT(map, *entry); +		return(FALSE); +	} else { +		*entry = rbtree_entry(node, struct vm_map_entry, tree_node); +		SAVE_HINT(map, *entry); +		return((address < (*entry)->vme_end) ? TRUE : FALSE); +	} +} + +/* + * Find a range of available space from the specified map. + * + * If successful, this function returns the map entry immediately preceding + * the range, and writes the range address in startp. If the map contains + * no entry, the entry returned points to the map header. + * Otherwise, NULL is returned. + * + * If map_locked is true, this function will not wait for more space in case + * of failure. Otherwise, the map is locked. + */ +static struct vm_map_entry * +vm_map_find_entry_anywhere(struct vm_map *map, +			   vm_size_t size, +			   vm_offset_t mask, +			   boolean_t map_locked, +			   vm_offset_t *startp) +{ +	struct vm_map_entry *entry; +	struct rbtree_node *node; +	vm_size_t max_size; +	vm_offset_t start, end; +	vm_offset_t max; + +	assert(size != 0); + +	max = map->max_offset; +	if (((mask + 1) & mask) != 0) { +		/* We have high bits in addition to the low bits */ + +		int first0 = __builtin_ffs(~mask);		/* First zero after low bits */ +		vm_offset_t lowmask = (1UL << (first0-1)) - 1;		/* low bits */ +		vm_offset_t himask = mask - lowmask;			/* high bits */ +		int second1 = __builtin_ffs(himask);		/* First one after low bits */ + +		max = 1UL << (second1-1); + +		if (himask + max != 0) { +			/* high bits do not continue up to the end */ +			printf("invalid mask %zx\n", mask); +			return NULL; +		} + +		mask = lowmask; +	} + +	if (!map_locked) { +		vm_map_lock(map); +	} + +restart: +	if (map->hdr.nentries == 0) { +		entry = vm_map_to_entry(map); +		start = (map->min_offset + mask) & ~mask; +		end = start + size; + +		if ((start < map->min_offset) || (end <= start) || (end > max)) { +			goto error; +		} + +		*startp = start; +		return entry; +	} + +	entry = map->first_free; + +	if (entry != vm_map_to_entry(map)) { +		start = (entry->vme_end + mask) & ~mask; +		end = start + size; + +		if ((start >= entry->vme_end) +		    && (end > start) +		    && (end <= max) +		    && (end <= (entry->vme_end + entry->gap_size))) { +			*startp = start; +			return entry; +		} +	} + +	max_size = size + mask; + +	if (max_size < size) { +		printf("max_size %zd got smaller than size %zd with mask %zd\n", +		       max_size, size, mask); +		goto error; +	} + +	node = rbtree_lookup_nearest(&map->hdr.gap_tree, max_size, +				     vm_map_entry_gap_cmp_lookup, 
RBTREE_RIGHT); + +	if (node == NULL) { +		if (map_locked || !map->wait_for_space) { +			goto error; +		} + +		assert_wait((event_t)map, TRUE); +		vm_map_unlock(map); +		thread_block(NULL); +		vm_map_lock(map); +		goto restart; +	} + +	entry = rbtree_entry(node, struct vm_map_entry, gap_node); +	assert(entry->in_gap_tree); + +	if (!list_empty(&entry->gap_list)) { +		entry = list_last_entry(&entry->gap_list, +					struct vm_map_entry, gap_list); +	} + +	assert(entry->gap_size >= max_size); +	start = (entry->vme_end + mask) & ~mask; +	assert(start >= entry->vme_end); +	end = start + size; +	assert(end > start); +	assert(end <= (entry->vme_end + entry->gap_size)); +	if (end > max) { +		/* Does not respect the allowed maximum */ +		printf("%zx does not respect %zx\n", end, max); +		return NULL; +	} +	*startp = start; +	return entry; + +error: +	printf("no more room in %p (%s)\n", map, map->name); +	return NULL; +} + +/* + *	Routine:	vm_map_find_entry + *	Purpose: + *		Allocate a range in the specified virtual address map, + *		returning the entry allocated for that range. + *		Used by kmem_alloc, etc.  Returns wired entries. + * + *		The map must be locked. + * + *		If an entry is allocated, the object/offset fields + *		are initialized to zero.  If an object is supplied, + *		then an existing entry may be extended. + */ +kern_return_t vm_map_find_entry( +	vm_map_t		map, +	vm_offset_t		*address,	/* OUT */ +	vm_size_t		size, +	vm_offset_t		mask, +	vm_object_t		object, +	vm_map_entry_t		*o_entry)	/* OUT */ +{ +	vm_map_entry_t	entry, new_entry; +	vm_offset_t	start; +	vm_offset_t	end; + +	entry = vm_map_find_entry_anywhere(map, size, mask, TRUE, &start); + +	if (entry == NULL) { +		return KERN_NO_SPACE; +	} + +	end = start + size; + +	/* +	 *	At this point, +	 *		"start" and "end" should define the endpoints of the +	 *			available new range, and +	 *		"entry" should refer to the region before the new +	 *			range, and +	 * +	 *		the map should be locked. +	 */ + +	*address = start; + +	/* +	 *	See whether we can avoid creating a new entry by +	 *	extending one of our neighbors.  [So far, we only attempt to +	 *	extend from below.] +	 */ + +	if ((object != VM_OBJECT_NULL) && +	    (entry != vm_map_to_entry(map)) && +	    (entry->vme_end == start) && +	    (!entry->is_shared) && +	    (!entry->is_sub_map) && +	    (entry->object.vm_object == object) && +	    (entry->needs_copy == FALSE) && +	    (entry->inheritance == VM_INHERIT_DEFAULT) && +	    (entry->protection == VM_PROT_DEFAULT) && +	    (entry->max_protection == VM_PROT_ALL) && +	    (entry->wired_count != 0) && +	    (entry->projected_on == 0)) { +		/* +		 *	Because this is a special case, +		 *	we don't need to use vm_object_coalesce. 
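+		 *
+		 *	For example (illustrative addresses only), a wired
+		 *	kernel entry covering [0x1000, 0x3000) and backed by
+		 *	the same object simply becomes [0x1000, 0x3000 + size):
+		 *	only vme_end and the gap tree need updating, as done
+		 *	just below.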
+		 */ + +		entry->vme_end = end; +		vm_map_gap_update(&map->hdr, entry); +		new_entry = entry; +	} else { +		new_entry = vm_map_entry_create(map); + +		new_entry->vme_start = start; +		new_entry->vme_end = end; + +		new_entry->is_shared = FALSE; +		new_entry->is_sub_map = FALSE; +		new_entry->object.vm_object = VM_OBJECT_NULL; +		new_entry->offset = (vm_offset_t) 0; + +		new_entry->needs_copy = FALSE; + +		new_entry->inheritance = VM_INHERIT_DEFAULT; +		new_entry->protection = VM_PROT_DEFAULT; +		new_entry->max_protection = VM_PROT_ALL; +		new_entry->wired_count = 1; +		new_entry->wired_access = VM_PROT_DEFAULT; + +		new_entry->in_transition = FALSE; +		new_entry->needs_wakeup = FALSE; +		new_entry->projected_on = 0; + +		/* +		 *	Insert the new entry into the list +		 */ + +		vm_map_entry_link(map, entry, new_entry); +    	} + +	map->size += size; + +	/* +	 *	Update the free space hint and the lookup hint +	 */ + +	map->first_free = new_entry; +	SAVE_HINT(map, new_entry); + +	*o_entry = new_entry; +	return(KERN_SUCCESS); +} + +boolean_t vm_map_pmap_enter_print = FALSE; +boolean_t vm_map_pmap_enter_enable = FALSE; + +/* + *	Routine:	vm_map_pmap_enter + * + *	Description: + *		Force pages from the specified object to be entered into + *		the pmap at the specified address if they are present. + *		As soon as a page not found in the object the scan ends. + * + *	Returns: + *		Nothing. + * + *	In/out conditions: + *		The source map should not be locked on entry. + */ +static void +vm_map_pmap_enter( +	vm_map_t	map, +	vm_offset_t 	addr, +	vm_offset_t	end_addr, +	vm_object_t 	object, +	vm_offset_t	offset, +	vm_prot_t	protection) +{ +	while (addr < end_addr) { +		vm_page_t	m; + +		vm_object_lock(object); +		vm_object_paging_begin(object); + +		m = vm_page_lookup(object, offset); +		if (m == VM_PAGE_NULL || m->absent) { +			vm_object_paging_end(object); +			vm_object_unlock(object); +			return; +		} + +		if (vm_map_pmap_enter_print) { +			printf("vm_map_pmap_enter:"); +			printf("map: %p, addr: %zx, object: %p, offset: %zx\n", +				map, addr, object, offset); +		} + +		m->busy = TRUE; +		vm_object_unlock(object); + +		PMAP_ENTER(map->pmap, addr, m, +			   protection, FALSE); + +		vm_object_lock(object); +		PAGE_WAKEUP_DONE(m); +		vm_page_lock_queues(); +		if (!m->active && !m->inactive) +		    vm_page_activate(m); +		vm_page_unlock_queues(); +		vm_object_paging_end(object); +		vm_object_unlock(object); + +		offset += PAGE_SIZE; +		addr += PAGE_SIZE; +	} +} + +/* + *	Routine:	vm_map_enter + * + *	Description: + *		Allocate a range in the specified virtual address map. + *		The resulting range will refer to memory defined by + *		the given memory object and offset into that object. + * + *		Arguments are as defined in the vm_map call. 
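+ *
+ *		Illustrative sketch only (names are hypothetical); an
+ *		"anywhere" request for anonymous memory looks roughly
+ *		like what vm_allocate() submits:
+ *
+ *			vm_offset_t	addr = 0;
+ *			kern_return_t	kr;
+ *
+ *			kr = vm_map_enter(map, &addr, size, 0, TRUE,
+ *					  VM_OBJECT_NULL, 0, FALSE,
+ *					  VM_PROT_DEFAULT, VM_PROT_ALL,
+ *					  VM_INHERIT_DEFAULT);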
+ */ +kern_return_t vm_map_enter( +	vm_map_t	map, +	vm_offset_t	*address,	/* IN/OUT */ +	vm_size_t	size, +	vm_offset_t	mask, +	boolean_t	anywhere, +	vm_object_t	object, +	vm_offset_t	offset, +	boolean_t	needs_copy, +	vm_prot_t	cur_protection, +	vm_prot_t	max_protection, +	vm_inherit_t	inheritance) +{ +	vm_map_entry_t	entry; +	vm_map_entry_t	next_entry; +	vm_offset_t	start; +	vm_offset_t	end; +	kern_return_t	result = KERN_SUCCESS; + +#define	RETURN(value)	{ result = value; goto BailOut; } + +	if (size == 0) +		return KERN_INVALID_ARGUMENT; + +	start = *address; + +	if (anywhere) { +		entry = vm_map_find_entry_anywhere(map, size, mask, FALSE, &start); + +		if (entry == NULL) { +			RETURN(KERN_NO_SPACE); +		} + +		end = start + size; +		*address = start; +		next_entry = entry->vme_next; +	} else { +		vm_map_entry_t		temp_entry; + +		/* +		 *	Verify that: +		 *		the address doesn't itself violate +		 *		the mask requirement. +		 */ + +		if ((start & mask) != 0) +			return(KERN_NO_SPACE); + +		vm_map_lock(map); + +		/* +		 *	...	the address is within bounds +		 */ + +		end = start + size; + +		if ((start < map->min_offset) || +		    (end > map->max_offset) || +		    (start >= end)) { +			RETURN(KERN_INVALID_ADDRESS); +		} + +		/* +		 *	...	the starting address isn't allocated +		 */ + +		if (vm_map_lookup_entry(map, start, &temp_entry)) +			RETURN(KERN_NO_SPACE); + +		entry = temp_entry; +		next_entry = entry->vme_next; + +		/* +		 *	...	the next region doesn't overlap the +		 *		end point. +		 */ + +		if ((next_entry != vm_map_to_entry(map)) && +		    (next_entry->vme_start < end)) +			RETURN(KERN_NO_SPACE); +	} + +	/* +	 *	At this point, +	 *		"start" and "end" should define the endpoints of the +	 *			available new range, and +	 *		"entry" should refer to the region before the new +	 *			range, and +	 * +	 *		the map should be locked. +	 */ + +	/* +	 *	See whether we can avoid creating a new entry (and object) by +	 *	extending one of our neighbors. +	 */ + +	if ((entry != vm_map_to_entry(map)) && +	    (entry->vme_end == start) && +	    (!entry->is_shared) && +	    (!entry->is_sub_map) && +	    (entry->inheritance == inheritance) && +	    (entry->protection == cur_protection) && +	    (entry->max_protection == max_protection) && +	    (entry->wired_count == 0) && +	    (entry->projected_on == 0)) { +		if (vm_object_coalesce(entry->object.vm_object, +				object, +				entry->offset, +				offset, +				(vm_size_t)(entry->vme_end - entry->vme_start), +				size, +				&entry->object.vm_object, +				&entry->offset)) { + +			/* +			 *	Coalesced the two objects - can extend +			 *	the previous map entry to include the +			 *	new range. +			 */ +			map->size += size; +			entry->vme_end = end; +			vm_map_gap_update(&map->hdr, entry); +			/* +			 *	Now that we did, perhaps we could simplify +			 *	things even further by coalescing the next +			 *	entry into the one we just extended. 
+			 */ +			vm_map_coalesce_entry(map, next_entry); +			RETURN(KERN_SUCCESS); +		} +	} +	if ((next_entry != vm_map_to_entry(map)) && +	    (next_entry->vme_start == end) && +	    (!next_entry->is_shared) && +	    (!next_entry->is_sub_map) && +	    (next_entry->inheritance == inheritance) && +	    (next_entry->protection == cur_protection) && +	    (next_entry->max_protection == max_protection) && +	    (next_entry->wired_count == 0) && +	    (next_entry->projected_on == 0)) { +		if (vm_object_coalesce(object, +			next_entry->object.vm_object, +			offset, +			next_entry->offset, +			size, +			(vm_size_t)(next_entry->vme_end - next_entry->vme_start), +			&next_entry->object.vm_object, +			&next_entry->offset)) { + +			/* +			 *	Coalesced the two objects - can extend +			 *	the next map entry to include the +			 *	new range. +			 */ +			map->size += size; +			next_entry->vme_start = start; +			vm_map_gap_update(&map->hdr, entry); +			/* +			 *	Now that we did, perhaps we could simplify +			 *	things even further by coalescing the +			 *	entry into the previous one. +			 */ +			vm_map_coalesce_entry(map, next_entry); +			RETURN(KERN_SUCCESS); +		} +	} + +	/* +	 *	Create a new entry +	 */ + +	/**/ { +	vm_map_entry_t	new_entry; + +	new_entry = vm_map_entry_create(map); + +	new_entry->vme_start = start; +	new_entry->vme_end = end; + +	new_entry->is_shared = FALSE; +	new_entry->is_sub_map = FALSE; +	new_entry->object.vm_object = object; +	new_entry->offset = offset; + +	new_entry->needs_copy = needs_copy; + +	new_entry->inheritance = inheritance; +	new_entry->protection = cur_protection; +	new_entry->max_protection = max_protection; +	new_entry->wired_count = 0; +	new_entry->wired_access = VM_PROT_NONE; + +	new_entry->in_transition = FALSE; +	new_entry->needs_wakeup = FALSE; +	new_entry->projected_on = 0; + +	/* +	 *	Insert the new entry into the list +	 */ + +	vm_map_entry_link(map, entry, new_entry); +	map->size += size; + +	/* +	 *	Update the free space hint and the lookup hint +	 */ + +	if ((map->first_free == entry) && +	    ((entry == vm_map_to_entry(map) ? map->min_offset : entry->vme_end) +	     >= new_entry->vme_start)) +		map->first_free = new_entry; + +	SAVE_HINT(map, new_entry); + +	if (map->wiring_required) { +		/* Returns with the map read-locked if successful */ +		result = vm_map_pageable(map, start, end, cur_protection, FALSE, FALSE); + +		if (result != KERN_SUCCESS) { +			RETURN(KERN_SUCCESS); +		} +	} + +	vm_map_unlock(map); + +	if ((object != VM_OBJECT_NULL) && +	    (vm_map_pmap_enter_enable) && +	    (!anywhere)	 && +	    (!needs_copy) && +	    (size < (128*1024))) { +		vm_map_pmap_enter(map, start, end, +				  object, offset, cur_protection); +	} + +	return(result); +	/**/ } + + BailOut: ; + +	vm_map_unlock(map); +	return(result); + +#undef	RETURN +} + +/* + *	vm_map_clip_start:	[ internal use only ] + * + *	Asserts that the given entry begins at or after + *	the specified address; if necessary, + *	it splits the entry into two. + */ +#define vm_map_clip_start(map, entry, startaddr) \ +	MACRO_BEGIN \ +	if ((startaddr) > (entry)->vme_start) \ +		_vm_map_clip_start(&(map)->hdr,(entry),(startaddr),1); \ +	MACRO_END + +#define vm_map_copy_clip_start(copy, entry, startaddr) \ +	MACRO_BEGIN \ +	if ((startaddr) > (entry)->vme_start) \ +		_vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr),0); \ +	MACRO_END + +/* + *	This routine is called only when it is known that + *	the entry must be split. 
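+ *
+ *	For illustration (hypothetical addresses): clipping the entry
+ *	[0x1000, 0x5000) at start address 0x3000 inserts a new entry
+ *	[0x1000, 0x3000) before it, while the original entry becomes
+ *	[0x3000, 0x5000) with its object offset advanced by 0x2000.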
+ */ +void _vm_map_clip_start( +	struct vm_map_header 	*map_header, +	vm_map_entry_t		entry, +	vm_offset_t		start, +	boolean_t		link_gap) +{ +	vm_map_entry_t	new_entry; + +	/* +	 *	Split off the front portion -- +	 *	note that we must insert the new +	 *	entry BEFORE this one, so that +	 *	this entry has the specified starting +	 *	address. +	 */ + +	new_entry = _vm_map_entry_create(map_header); +	vm_map_entry_copy_full(new_entry, entry); + +	new_entry->vme_end = start; +	entry->offset += (start - entry->vme_start); +	entry->vme_start = start; + +	_vm_map_entry_link(map_header, entry->vme_prev, new_entry, link_gap); + +	if (entry->is_sub_map) +	 	vm_map_reference(new_entry->object.sub_map); +	else +		vm_object_reference(new_entry->object.vm_object); +} + +/* + *	vm_map_clip_end:	[ internal use only ] + * + *	Asserts that the given entry ends at or before + *	the specified address; if necessary, + *	it splits the entry into two. + */ +#define vm_map_clip_end(map, entry, endaddr) \ +	MACRO_BEGIN \ +	if ((endaddr) < (entry)->vme_end) \ +		_vm_map_clip_end(&(map)->hdr,(entry),(endaddr),1); \ +	MACRO_END + +#define vm_map_copy_clip_end(copy, entry, endaddr) \ +	MACRO_BEGIN \ +	if ((endaddr) < (entry)->vme_end) \ +		_vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr),0); \ +	MACRO_END + +/* + *	This routine is called only when it is known that + *	the entry must be split. + */ +void _vm_map_clip_end( +	struct vm_map_header 	*map_header, +	vm_map_entry_t		entry, +	vm_offset_t		end, +	boolean_t		link_gap) +{ +	vm_map_entry_t	new_entry; + +	/* +	 *	Create a new entry and insert it +	 *	AFTER the specified entry +	 */ + +	new_entry = _vm_map_entry_create(map_header); +	vm_map_entry_copy_full(new_entry, entry); + +	new_entry->vme_start = entry->vme_end = end; +	new_entry->offset += (end - entry->vme_start); + +	_vm_map_entry_link(map_header, entry, new_entry, link_gap); + +	if (entry->is_sub_map) +	 	vm_map_reference(new_entry->object.sub_map); +	else +		vm_object_reference(new_entry->object.vm_object); +} + +/* + *	VM_MAP_RANGE_CHECK:	[ internal use only ] + * + *	Asserts that the starting and ending region + *	addresses fall within the valid range of the map. + */ +#define	VM_MAP_RANGE_CHECK(map, start, end)		\ +		{					\ +		if (start < vm_map_min(map))		\ +			start = vm_map_min(map);	\ +		if (end > vm_map_max(map))		\ +			end = vm_map_max(map);		\ +		if (start > end)			\ +			start = end;			\ +		} + +/* + *	vm_map_submap:		[ kernel use only ] + * + *	Mark the given range as handled by a subordinate map. + * + *	This range must have been created with vm_map_find using + *	the vm_submap_object, and no other operations may have been + *	performed on this range prior to calling vm_map_submap. + * + *	Only a limited number of operations can be performed + *	within this rage after calling vm_map_submap: + *		vm_fault + *	[Don't try vm_map_copyin!] + * + *	To remove a submapping, one must first remove the + *	range from the superior map, and then destroy the + *	submap (if desired).  [Better yet, don't try it.] 
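+ *
+ *	Illustrative ordering only (identifiers hypothetical); the
+ *	submap creation code in vm_kern.c is expected to follow
+ *	roughly this pattern:
+ *
+ *		1. reserve [start, end) in "map", backed by vm_submap_object;
+ *		2. submap = vm_map_create(vm_map_pmap(map), start, end);
+ *		3. kr = vm_map_submap(map, start, end, submap);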
+ */ +kern_return_t vm_map_submap( +	vm_map_t	map, +	vm_offset_t	start, +	vm_offset_t	end, +	vm_map_t	submap) +{ +	vm_map_entry_t		entry; +	kern_return_t		result = KERN_INVALID_ARGUMENT; +	vm_object_t		object; + +	vm_map_lock(map); + +	VM_MAP_RANGE_CHECK(map, start, end); + +	if (vm_map_lookup_entry(map, start, &entry)) { +		vm_map_clip_start(map, entry, start); +	} +	 else +		entry = entry->vme_next; + +	vm_map_clip_end(map, entry, end); + +	if ((entry->vme_start == start) && (entry->vme_end == end) && +	    (!entry->is_sub_map) && +	    ((object = entry->object.vm_object) == vm_submap_object) && +	    (object->resident_page_count == 0) && +	    (object->copy == VM_OBJECT_NULL) && +	    (object->shadow == VM_OBJECT_NULL) && +	    (!object->pager_created)) { +		entry->object.vm_object = VM_OBJECT_NULL; +		vm_object_deallocate(object); +		entry->is_sub_map = TRUE; +		vm_map_reference(entry->object.sub_map = submap); +		result = KERN_SUCCESS; +	} +	vm_map_unlock(map); + +	return(result); +} + +static void +vm_map_entry_inc_wired(vm_map_t map, vm_map_entry_t entry) +{ +	/* +	 * This member is a counter to indicate whether an entry +	 * should be faulted in (first time it is wired, wired_count +	 * goes from 0 to 1) or not (other times, wired_count goes +	 * from 1 to 2 or remains 2). +	 */ +	if (entry->wired_count > 1) { +		return; +	} + +	if (entry->wired_count == 0) { +		map->size_wired += entry->vme_end - entry->vme_start; +	} + +	entry->wired_count++; +} + +static void +vm_map_entry_reset_wired(vm_map_t map, vm_map_entry_t entry) +{ +	if (entry->wired_count != 0) { +		map->size_wired -= entry->vme_end - entry->vme_start; +		entry->wired_count = 0; +	} +} + +/* + *	vm_map_pageable_scan: scan entries and update wiring as appropriate + * + *	This function is used by the VM system after either the wiring + *	access or protection of a mapping changes. It scans part or + *	all the entries of a map, and either wires, unwires, or skips + *	entries depending on their state. + * + *	The map must be locked. If wiring faults are performed, the lock + *	is downgraded to a read lock. The caller should always consider + *	the map read locked on return. + */ +static void +vm_map_pageable_scan(struct vm_map *map, +		     struct vm_map_entry *start, +		     struct vm_map_entry *end) +{ +	struct vm_map_entry *entry; +	boolean_t do_wire_faults; + +	/* +	 * Pass 1. Update counters and prepare wiring faults. +	 */ + +	do_wire_faults = FALSE; + +	for (entry = start; entry != end; entry = entry->vme_next) { + +		/* +		 * Unwiring. +		 * +		 * Note that unwiring faults can be performed while +		 * holding a write lock on the map. A wiring fault +		 * can only be done with a read lock. +		 */ + +		if (entry->wired_access == VM_PROT_NONE) { +			if (entry->wired_count != 0) { +				vm_map_entry_reset_wired(map, entry); +				vm_fault_unwire(map, entry); +			} + +			continue; +		} + +		/* +		 * Wiring. +		 */ + +		if (entry->protection == VM_PROT_NONE) { + +			/* +			 * Make sure entries that cannot be accessed +			 * because of their protection aren't wired. +			 */ + +			if (entry->wired_count == 0) { +				continue; +			} + +			/* +			 * This normally occurs after changing the protection of +			 * a wired region to VM_PROT_NONE. +			 */ +			vm_map_entry_reset_wired(map, entry); +			vm_fault_unwire(map, entry); +			continue; +		} + +		/* +		 *	We must do this in two passes: +		 * +		 *	1.  Holding the write lock, we create any shadow +		 *	    or zero-fill objects that need to be created. 
+		 *	    Then we increment the wiring count. +		 * +		 *	2.  We downgrade to a read lock, and call +		 *	    vm_fault_wire to fault in the pages for any +		 *	    newly wired area (wired_count is 1). +		 * +		 *	Downgrading to a read lock for vm_fault_wire avoids +		 *	a possible deadlock with another thread that may have +		 *	faulted on one of the pages to be wired (it would mark +		 *	the page busy, blocking us, then in turn block on the +		 *	map lock that we hold).  Because of problems in the +		 *	recursive lock package, we cannot upgrade to a write +		 *	lock in vm_map_lookup.  Thus, any actions that require +		 *	the write lock must be done beforehand.  Because we +		 *	keep the read lock on the map, the copy-on-write +		 *	status of the entries we modify here cannot change. +		 */ + +		if (entry->wired_count == 0) { +			/* +			 *	Perform actions of vm_map_lookup that need +			 *	the write lock on the map: create a shadow +			 *	object for a copy-on-write region, or an +			 *	object for a zero-fill region. +			 */ +			if (entry->needs_copy && +			    ((entry->protection & VM_PROT_WRITE) != 0)) { +				vm_object_shadow(&entry->object.vm_object, +						 &entry->offset, +						 (vm_size_t)(entry->vme_end +							     - entry->vme_start)); +				entry->needs_copy = FALSE; +			} + +			if (entry->object.vm_object == VM_OBJECT_NULL) { +				entry->object.vm_object = +					vm_object_allocate( +						(vm_size_t)(entry->vme_end +							    - entry->vme_start)); +				entry->offset = (vm_offset_t)0; +			} +		} + +		vm_map_entry_inc_wired(map, entry); + +		if (entry->wired_count == 1) { +			do_wire_faults = TRUE; +		} +	} + +	/* +	 * Pass 2. Trigger wiring faults. +	 */ + +	if (!do_wire_faults) { +		return; +	} + +	/* +	 * HACK HACK HACK HACK +	 * +	 * If we are wiring in the kernel map or a submap of it, +	 * unlock the map to avoid deadlocks.  We trust that the +	 * kernel threads are well-behaved, and therefore will +	 * not do anything destructive to this region of the map +	 * while we have it unlocked.  We cannot trust user threads +	 * to do the same. +	 * +	 * HACK HACK HACK HACK +	 */ +	if (vm_map_pmap(map) == kernel_pmap) { +		vm_map_unlock(map); /* trust me ... */ +	} else { +		vm_map_lock_set_recursive(map); +		vm_map_lock_write_to_read(map); +	} + +	for (entry = start; entry != end; entry = entry->vme_next) { +		/* +		 * The wiring count can only be 1 if it was +		 * incremented by this function right before +		 * downgrading the lock. +		 */ +		if (entry->wired_count == 1) { +			/* +			 * XXX This assumes that the faults always succeed. +			 */ +			vm_fault_wire(map, entry); +		} +	} + +	if (vm_map_pmap(map) == kernel_pmap) { +		vm_map_lock(map); +	} else { +		vm_map_lock_clear_recursive(map); +	} +} + +/* + *	vm_map_protect: + * + *	Sets the protection of the specified address + *	region in the target map.  If "set_max" is + *	specified, the maximum protection is to be set; + *	otherwise, only the current protection is affected. + */ +kern_return_t vm_map_protect( +	vm_map_t	map, +	vm_offset_t	start, +	vm_offset_t	end, +	vm_prot_t	new_prot, +	boolean_t	set_max) +{ +	vm_map_entry_t		current; +	vm_map_entry_t		entry; +	vm_map_entry_t		next; + +	vm_map_lock(map); + +	VM_MAP_RANGE_CHECK(map, start, end); + +	if (vm_map_lookup_entry(map, start, &entry)) { +		vm_map_clip_start(map, entry, start); +	} +	 else +		entry = entry->vme_next; + +	/* +	 *	Make a first pass to check for protection +	 *	violations. 
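+	 *	(The check below tolerates the VM_PROT_NOTIFY bit even when
+	 *	it is not part of max_protection; every other bit in
+	 *	new_prot must already be permitted by max_protection.)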
+	 */ + +	current = entry; +	while ((current != vm_map_to_entry(map)) && +	       (current->vme_start < end)) { + +		if (current->is_sub_map) { +			vm_map_unlock(map); +			return(KERN_INVALID_ARGUMENT); +		} +		if ((new_prot & (VM_PROT_NOTIFY | current->max_protection)) +		    != new_prot) { +		       vm_map_unlock(map); +		       return(KERN_PROTECTION_FAILURE); +		} + +		current = current->vme_next; +	} + +	/* +	 *	Go back and fix up protections. +	 *	[Note that clipping is not necessary the second time.] +	 */ + +	current = entry; + +	while ((current != vm_map_to_entry(map)) && +	       (current->vme_start < end)) { + +		vm_prot_t	old_prot; + +		vm_map_clip_end(map, current, end); + +		old_prot = current->protection; +		if (set_max) +			current->protection = +				(current->max_protection = new_prot) & +					old_prot; +		else +			current->protection = new_prot; + +		/* +		 *	Make sure the new protection doesn't conflict +		 *	with the desired wired access if any. +		 */ + +		if ((current->protection != VM_PROT_NONE) && +		    (current->wired_access != VM_PROT_NONE || +		     map->wiring_required)) { +			current->wired_access = current->protection; +		} + +		/* +		 *	Update physical map if necessary. +		 */ + +		if (current->protection != old_prot) { +			pmap_protect(map->pmap, current->vme_start, +					current->vme_end, +					current->protection); +		} + +		next = current->vme_next; +		vm_map_coalesce_entry(map, current); +		current = next; +	} + +	next = current->vme_next; +	if (vm_map_coalesce_entry(map, current)) +		current = next; + +	/* Returns with the map read-locked if successful */ +	vm_map_pageable_scan(map, entry, current); + +	vm_map_unlock(map); +	return(KERN_SUCCESS); +} + +/* + *	vm_map_inherit: + * + *	Sets the inheritance of the specified address + *	range in the target map.  Inheritance + *	affects how the map will be shared with + *	child maps at the time of vm_map_fork. + */ +kern_return_t vm_map_inherit( +	vm_map_t	map, +	vm_offset_t	start, +	vm_offset_t	end, +	vm_inherit_t	new_inheritance) +{ +	vm_map_entry_t	entry; +	vm_map_entry_t	temp_entry; +	vm_map_entry_t	next; + +	vm_map_lock(map); + +	VM_MAP_RANGE_CHECK(map, start, end); + +	if (vm_map_lookup_entry(map, start, &temp_entry)) { +		entry = temp_entry; +		vm_map_clip_start(map, entry, start); +	} +	else +		entry = temp_entry->vme_next; + +	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) { +		vm_map_clip_end(map, entry, end); + +		entry->inheritance = new_inheritance; + +		next = entry->vme_next; +		vm_map_coalesce_entry(map, entry); +		entry = next; +	} + +	vm_map_coalesce_entry(map, entry); + +	vm_map_unlock(map); +	return(KERN_SUCCESS); +} + +/* + *	vm_map_pageable: + * + *	Sets the pageability of the specified address + *	range in the target map.  Regions specified + *	as not pageable require locked-down physical + *	memory and physical page maps.  access_type indicates + *	types of accesses that must not generate page faults. + *	This is checked against protection of memory being locked-down. + *	access_type of VM_PROT_NONE makes memory pageable. + * + *	If lock_map is TRUE, the map is locked and unlocked + *	by this function. Otherwise, it is assumed the caller + *	already holds the lock, in which case the function + *	returns with the lock downgraded to a read lock if successful. + * + *	If check_range is TRUE, this function fails if it finds + *	holes or protection mismatches in the specified range. + * + *	A reference must remain to the map throughout the call. 
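+ *
+ *	Illustrative calls only (hypothetical range):
+ *
+ *		wire [start, end) for read/write access, failing on holes
+ *		or protection mismatches:
+ *
+ *			kr = vm_map_pageable(map, start, end,
+ *					     VM_PROT_READ | VM_PROT_WRITE,
+ *					     TRUE, TRUE);
+ *
+ *		make the same range pageable again:
+ *
+ *			kr = vm_map_pageable(map, start, end, VM_PROT_NONE,
+ *					     TRUE, FALSE);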
+ */ + +kern_return_t vm_map_pageable( +	vm_map_t	map, +	vm_offset_t	start, +	vm_offset_t	end, +	vm_prot_t	access_type, +	boolean_t	lock_map, +	boolean_t	check_range) +{ +	vm_map_entry_t		entry; +	vm_map_entry_t		start_entry; +	vm_map_entry_t		end_entry; + +	if (lock_map) { +		vm_map_lock(map); +	} + +	VM_MAP_RANGE_CHECK(map, start, end); + +	if (!vm_map_lookup_entry(map, start, &start_entry)) { +		/* +		 *	Start address is not in map; this is fatal. +		 */ +		if (lock_map) { +			vm_map_unlock(map); +		} + +		return KERN_NO_SPACE; +	} + +	/* +	 * Pass 1. Clip entries, check for holes and protection mismatches +	 * if requested. +	 */ + +	vm_map_clip_start(map, start_entry, start); + +	for (entry = start_entry; +	     (entry != vm_map_to_entry(map)) && +	     (entry->vme_start < end); +	     entry = entry->vme_next) { +		vm_map_clip_end(map, entry, end); + +		if (check_range && +		    (((entry->vme_end < end) && +		      ((entry->vme_next == vm_map_to_entry(map)) || +		       (entry->vme_next->vme_start > entry->vme_end))) || +		     ((entry->protection & access_type) != access_type))) { +			if (lock_map) { +				vm_map_unlock(map); +			} + +			return KERN_NO_SPACE; +		} +	} + +	end_entry = entry; + +	/* +	 * Pass 2. Set the desired wired access. +	 */ + +	for (entry = start_entry; entry != end_entry; entry = entry->vme_next) { +		entry->wired_access = access_type; +	} + +	/* Returns with the map read-locked */ +	vm_map_pageable_scan(map, start_entry, end_entry); + +	if (lock_map) { +		vm_map_unlock(map); +	} + +	return(KERN_SUCCESS); +} + +/* Update pageability of all the memory currently in the map. + * The map must be locked, and protection mismatch will not be checked, see + * vm_map_pageable(). + */ +static kern_return_t +vm_map_pageable_current(vm_map_t map, vm_prot_t access_type) +{ +	struct rbtree_node *node; +	vm_offset_t min_address, max_address; + +	node = rbtree_first(&map->hdr.tree); +	min_address = rbtree_entry(node, struct vm_map_entry, +				   tree_node)->vme_start; + +	node = rbtree_last(&map->hdr.tree); +	max_address = rbtree_entry(node, struct vm_map_entry, +				   tree_node)->vme_end; + +	/* Returns with the map read-locked if successful */ +	return vm_map_pageable(map, min_address, max_address,access_type, +			       FALSE, FALSE); +} + + +/* + *	vm_map_pageable_all: + * + *	Sets the pageability of an entire map. If the VM_WIRE_CURRENT + *	flag is set, then all current mappings are locked down. If the + *	VM_WIRE_FUTURE flag is set, then all mappings created after the + *	call returns are locked down. If no flags are passed + *	(i.e. VM_WIRE_NONE), all mappings become pageable again, and + *	future mappings aren't automatically locked down any more. + * + *	The access type of the mappings match their current protection. + *	Null mappings (with protection PROT_NONE) are updated to track + *	that they should be wired in case they become accessible. 
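+ *
+ *	Illustrative flag combinations (sketch only):
+ *
+ *		vm_map_pageable_all(map, VM_WIRE_CURRENT);  wire what is mapped now
+ *		vm_map_pageable_all(map, VM_WIRE_FUTURE);   wire future mappings only
+ *		vm_map_pageable_all(map, VM_WIRE_ALL);      both of the above
+ *		vm_map_pageable_all(map, VM_WIRE_NONE);     make everything pageable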
+ */ +kern_return_t +vm_map_pageable_all(struct vm_map *map, vm_wire_t flags) +{ +	boolean_t wiring_required; +	kern_return_t kr; + +	if ((flags & ~VM_WIRE_ALL) != 0) { +		return KERN_INVALID_ARGUMENT; +	} + +	vm_map_lock(map); + +	if (flags == VM_WIRE_NONE) { +		map->wiring_required = FALSE; + +		/* Returns with the map read-locked if successful */ +		kr = vm_map_pageable_current(map, VM_PROT_NONE); +		vm_map_unlock(map); +		return kr; +	} + +	wiring_required = map->wiring_required; + +	if (flags & VM_WIRE_FUTURE) { +		map->wiring_required = TRUE; +	} + +	if (flags & VM_WIRE_CURRENT) { +		/* Returns with the map read-locked if successful */ +		kr = vm_map_pageable_current(map, VM_PROT_READ | VM_PROT_WRITE); + +		if (kr != KERN_SUCCESS) { +			if (flags & VM_WIRE_FUTURE) { +				map->wiring_required = wiring_required; +			} + +			vm_map_unlock(map); +			return kr; +		} +	} + +	vm_map_unlock(map); + +	return KERN_SUCCESS; +} + +/* + *	vm_map_entry_delete:	[ internal use only ] + * + *	Deallocate the given entry from the target map. + */ +void vm_map_entry_delete( +	vm_map_t	map, +	vm_map_entry_t	entry) +{ +	vm_offset_t		s, e; +	vm_size_t		size; +	vm_object_t		object; +	extern vm_object_t	kernel_object; + +	s = entry->vme_start; +	e = entry->vme_end; +	size = e - s; + +	/*Check if projected buffer*/ +	if (map != kernel_map && entry->projected_on != 0) { +	  /*Check if projected kernel entry is persistent; +	    may only manipulate directly if it is*/ +	  if (entry->projected_on->projected_on == 0) +	    entry->wired_count = 0;    /*Avoid unwire fault*/ +	  else +	    return; +	} + +	/* +	 *	Get the object.    Null objects cannot have pmap entries. +	 */ + +	if ((object = entry->object.vm_object) != VM_OBJECT_NULL) { + +	    /* +	     *	Unwire before removing addresses from the pmap; +	     *	otherwise, unwiring will put the entries back in +	     *	the pmap. +	     */ + +	    if (entry->wired_count != 0) { +		vm_map_entry_reset_wired(map, entry); +		vm_fault_unwire(map, entry); +	    } + +	    /* +	     *	If the object is shared, we must remove +	     *	*all* references to this data, since we can't +	     *	find all of the physical maps which are sharing +	     *	it. +	     */ + +	    if (object == kernel_object) { +		vm_object_lock(object); +		vm_object_page_remove(object, entry->offset, +				entry->offset + size); +		vm_object_unlock(object); +	    } else if (entry->is_shared) { +		vm_object_pmap_remove(object, +				 entry->offset, +				 entry->offset + size); +	    } else { +		pmap_remove(map->pmap, s, e); +		/* +		 *	If this object has no pager and our +		 *	reference to the object is the only +		 *	one, we can release the deleted pages +		 *	now. +		 */ +		vm_object_lock(object); +		if ((!object->pager_created) && +		    (object->ref_count == 1) && +		    (object->paging_in_progress == 0)) { +			vm_object_page_remove(object, +				entry->offset, +				entry->offset + size); +		} +		vm_object_unlock(object); +	    } +        } + +	/* +	 *	Deallocate the object only after removing all +	 *	pmap entries pointing to its pages. +	 */ + +	if (entry->is_sub_map) +		vm_map_deallocate(entry->object.sub_map); +	else +	 	vm_object_deallocate(entry->object.vm_object); + +	vm_map_entry_unlink(map, entry); +	map->size -= size; + +	vm_map_entry_dispose(map, entry); +} + +/* + *	vm_map_delete:	[ internal use only ] + * + *	Deallocates the given address range from the target + *	map. 
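+ *
+ *	Normally called with the map locked; vm_map_remove() below is
+ *	the exported wrapper that takes and releases the lock around
+ *	this call.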
+ */ + +kern_return_t vm_map_delete( +	vm_map_t		map, +	vm_offset_t		start, +	vm_offset_t		end) +{ +	vm_map_entry_t		entry; +	vm_map_entry_t		first_entry; + +	if (map->pmap == kernel_pmap && (start < kernel_virtual_start || end > kernel_virtual_end)) +		panic("vm_map_delete(%lx-%lx) falls in physical memory area!\n", (unsigned long) start, (unsigned long) end); + +	/* +	 *	Find the start of the region, and clip it +	 */ + +	if (!vm_map_lookup_entry(map, start, &first_entry)) +		entry = first_entry->vme_next; +	else { +		entry = first_entry; +		vm_map_clip_start(map, entry, start); + +		/* +		 *	Fix the lookup hint now, rather than each +		 *	time though the loop. +		 */ + +		SAVE_HINT(map, entry->vme_prev); +	} + +	/* +	 *	Save the free space hint +	 */ + +	if (map->first_free->vme_start >= start) +		map->first_free = entry->vme_prev; + +	/* +	 *	Step through all entries in this region +	 */ + +	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) { +		vm_map_entry_t		next; + +		vm_map_clip_end(map, entry, end); + +		/* +		 *	If the entry is in transition, we must wait +		 *	for it to exit that state.  It could be clipped +		 *	while we leave the map unlocked. +		 */ +                if(entry->in_transition) { +                        /* +                         * Say that we are waiting, and wait for entry. +                         */ +                        entry->needs_wakeup = TRUE; +                        vm_map_entry_wait(map, FALSE); +                        vm_map_lock(map); + +                        /* +                         * The entry could have been clipped or it +                         * may not exist anymore.  look it up again. +                         */ +                        if(!vm_map_lookup_entry(map, start, &entry)) { +				entry = entry->vme_next; +			} +			continue; +		} + +		next = entry->vme_next; + +		vm_map_entry_delete(map, entry); +		entry = next; +	} + +	if (map->wait_for_space) +		thread_wakeup((event_t) map); + +	return(KERN_SUCCESS); +} + +/* + *	vm_map_remove: + * + *	Remove the given address range from the target map. + *	This is the exported form of vm_map_delete. + */ +kern_return_t vm_map_remove( +	vm_map_t	map, +	vm_offset_t	start, +	vm_offset_t	end) +{ +	kern_return_t	result; + +	vm_map_lock(map); +	VM_MAP_RANGE_CHECK(map, start, end); +	result = vm_map_delete(map, start, end); +	vm_map_unlock(map); + +	return(result); +} + + +/* + *	vm_map_copy_steal_pages: + * + *	Steal all the pages from a vm_map_copy page_list by copying ones + *	that have not already been stolen. + */ +static void +vm_map_copy_steal_pages(vm_map_copy_t copy) +{ +	vm_page_t	m, new_m; +	int		i; +	vm_object_t	object; + +	for (i = 0; i < copy->cpy_npages; i++) { + +		/* +		 *	If the page is not tabled, then it's already stolen. +		 */ +		m = copy->cpy_page_list[i]; +		if (!m->tabled) +			continue; + +		/* +		 *	Page was not stolen,  get a new +		 *	one and do the copy now. +		 */ +		while ((new_m = vm_page_grab(VM_PAGE_HIGHMEM)) == VM_PAGE_NULL) { +			VM_PAGE_WAIT((void(*)()) 0); +		} + +		vm_page_copy(m, new_m); + +		object = m->object; +		vm_object_lock(object); +		vm_page_lock_queues(); +		if (!m->active && !m->inactive) +			vm_page_activate(m); +		vm_page_unlock_queues(); +		PAGE_WAKEUP_DONE(m); +		vm_object_paging_end(object); +		vm_object_unlock(object); + +		copy->cpy_page_list[i] = new_m; +	} +} + +/* + *	vm_map_copy_page_discard: + * + *	Get rid of the pages in a page_list copy.  If the pages are + *	stolen, they are freed.  
If the pages are not stolen, they + *	are unbusied, and associated state is cleaned up. + */ +void vm_map_copy_page_discard(vm_map_copy_t copy) +{ +	while (copy->cpy_npages > 0) { +		vm_page_t	m; + +		if((m = copy->cpy_page_list[--(copy->cpy_npages)]) != +		    VM_PAGE_NULL) { + +			/* +			 *	If it's not in the table, then it's +			 *	a stolen page that goes back +			 *	to the free list.  Else it belongs +			 *	to some object, and we hold a +			 *	paging reference on that object. +			 */ +			if (!m->tabled) { +				VM_PAGE_FREE(m); +			} +			else { +				vm_object_t	object; + +				object = m->object; + +				vm_object_lock(object); +				vm_page_lock_queues(); +				if (!m->active && !m->inactive) +					vm_page_activate(m); +				vm_page_unlock_queues(); + +				PAGE_WAKEUP_DONE(m); +				vm_object_paging_end(object); +				vm_object_unlock(object); +			} +		} +	} +} + +/* + *	Routine:	vm_map_copy_discard + * + *	Description: + *		Dispose of a map copy object (returned by + *		vm_map_copyin). + */ +void +vm_map_copy_discard(vm_map_copy_t copy) +{ +free_next_copy: +	if (copy == VM_MAP_COPY_NULL) +		return; + +	switch (copy->type) { +	case VM_MAP_COPY_ENTRY_LIST: +		while (vm_map_copy_first_entry(copy) != +					vm_map_copy_to_entry(copy)) { +			vm_map_entry_t	entry = vm_map_copy_first_entry(copy); + +			vm_map_copy_entry_unlink(copy, entry); +			vm_object_deallocate(entry->object.vm_object); +			vm_map_copy_entry_dispose(copy, entry); +		} +		break; +        case VM_MAP_COPY_OBJECT: +		vm_object_deallocate(copy->cpy_object); +		break; +	case VM_MAP_COPY_PAGE_LIST: + +		/* +		 *	To clean this up, we have to unbusy all the pages +		 *	and release the paging references in their objects. +		 */ +		if (copy->cpy_npages > 0) +			vm_map_copy_page_discard(copy); + +		/* +		 *	If there's a continuation, abort it.  The +		 *	abort routine releases any storage. +		 */ +		if (vm_map_copy_has_cont(copy)) { + +			/* +			 *	Special case: recognize +			 *	vm_map_copy_discard_cont and optimize +			 *	here to avoid tail recursion. +			 */ +			if (copy->cpy_cont == vm_map_copy_discard_cont) { +				vm_map_copy_t	new_copy; + +				new_copy = (vm_map_copy_t) copy->cpy_cont_args; +				kmem_cache_free(&vm_map_copy_cache, (vm_offset_t) copy); +				copy = new_copy; +				goto free_next_copy; +			} +			else { +				vm_map_copy_abort_cont(copy); +			} +		} + +		break; +	} +	kmem_cache_free(&vm_map_copy_cache, (vm_offset_t) copy); +} + +/* + *	Routine:	vm_map_copy_copy + * + *	Description: + *			Move the information in a map copy object to + *			a new map copy object, leaving the old one + *			empty. + * + *			This is used by kernel routines that need + *			to look at out-of-line data (in copyin form) + *			before deciding whether to return SUCCESS. + *			If the routine returns FAILURE, the original + *			copy object will be deallocated; therefore, + *			these routines must make a copy of the copy + *			object and leave the original empty so that + *			deallocation will not fail. + */ +vm_map_copy_t +vm_map_copy_copy(vm_map_copy_t copy) +{ +	vm_map_copy_t	new_copy; + +	if (copy == VM_MAP_COPY_NULL) +		return VM_MAP_COPY_NULL; + +	/* +	 * Allocate a new copy object, and copy the information +	 * from the old one into it. +	 */ + +	new_copy = (vm_map_copy_t) kmem_cache_alloc(&vm_map_copy_cache); +	*new_copy = *copy; + +	if (copy->type == VM_MAP_COPY_ENTRY_LIST) { +		/* +		 * The links in the entry chain must be +		 * changed to point to the new copy object. 
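+		 * (Only the head and tail links are updated here; the
+		 * interior entries already point at one another and
+		 * need no adjustment.)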
+		 */ +		vm_map_copy_first_entry(copy)->vme_prev +			= vm_map_copy_to_entry(new_copy); +		vm_map_copy_last_entry(copy)->vme_next +			= vm_map_copy_to_entry(new_copy); +	} + +	/* +	 * Change the old copy object into one that contains +	 * nothing to be deallocated. +	 */ +	copy->type = VM_MAP_COPY_OBJECT; +	copy->cpy_object = VM_OBJECT_NULL; + +	/* +	 * Return the new object. +	 */ +	return new_copy; +} + +/* + *	Routine:	vm_map_copy_discard_cont + * + *	Description: + *		A version of vm_map_copy_discard that can be called + *		as a continuation from a vm_map_copy page list. + */ +kern_return_t	vm_map_copy_discard_cont( +vm_map_copyin_args_t	cont_args, +vm_map_copy_t		*copy_result)	/* OUT */ +{ +	vm_map_copy_discard((vm_map_copy_t) cont_args); +	if (copy_result != (vm_map_copy_t *)0) +		*copy_result = VM_MAP_COPY_NULL; +	return(KERN_SUCCESS); +} + +/* + *	Routine:	vm_map_copy_overwrite + * + *	Description: + *		Copy the memory described by the map copy + *		object (copy; returned by vm_map_copyin) onto + *		the specified destination region (dst_map, dst_addr). + *		The destination must be writeable. + * + *		Unlike vm_map_copyout, this routine actually + *		writes over previously-mapped memory.  If the + *		previous mapping was to a permanent (user-supplied) + *		memory object, it is preserved. + * + *		The attributes (protection and inheritance) of the + *		destination region are preserved. + * + *		If successful, consumes the copy object. + *		Otherwise, the caller is responsible for it. + * + *	Implementation notes: + *		To overwrite temporary virtual memory, it is + *		sufficient to remove the previous mapping and insert + *		the new copy.  This replacement is done either on + *		the whole region (if no permanent virtual memory + *		objects are embedded in the destination region) or + *		in individual map entries. + * + *		To overwrite permanent virtual memory, it is + *		necessary to copy each page, as the external + *		memory management interface currently does not + *		provide any optimizations. + * + *		Once a page of permanent memory has been overwritten, + *		it is impossible to interrupt this function; otherwise, + *		the call would be neither atomic nor location-independent. + *		The kernel-state portion of a user thread must be + *		interruptible. + * + *		It may be expensive to forward all requests that might + *		overwrite permanent memory (vm_write, vm_copy) to + *		uninterruptible kernel threads.  This routine may be + *		called by interruptible threads; however, success is + *		not guaranteed -- if the request cannot be performed + *		atomically and interruptibly, an error indication is + *		returned. + */ +kern_return_t vm_map_copy_overwrite( +	vm_map_t	dst_map, +	vm_offset_t	dst_addr, +	vm_map_copy_t	copy, +	boolean_t	interruptible) +{ +	vm_size_t	size; +	vm_offset_t	start; +	vm_map_entry_t	tmp_entry; +	vm_map_entry_t	entry; + +	boolean_t	contains_permanent_objects = FALSE; + +	interruptible = FALSE;	/* XXX */ + +	/* +	 *	Check for null copy object. +	 */ + +	if (copy == VM_MAP_COPY_NULL) +		return(KERN_SUCCESS); + +	/* +	 *	Only works for entry lists at the moment.  Will +	 *      support page lists LATER. +	 */ + +	assert(copy->type == VM_MAP_COPY_ENTRY_LIST); + +	/* +	 *	Currently this routine only handles page-aligned +	 *	regions.  Eventually, it should handle misalignments +	 *	by actually copying pages. 
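+	 *
+	 *	Concretely, the check just below rejects the request with
+	 *	KERN_INVALID_ARGUMENT unless copy->offset, copy->size and
+	 *	dst_addr are all page aligned.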
+	 */ + +	if (!page_aligned(copy->offset) || +	    !page_aligned(copy->size) || +	    !page_aligned(dst_addr)) +		return(KERN_INVALID_ARGUMENT); + +	size = copy->size; + +	if (size == 0) { +		vm_map_copy_discard(copy); +		return(KERN_SUCCESS); +	} + +	/* +	 *	Verify that the destination is all writeable +	 *	initially. +	 */ +start_pass_1: +	vm_map_lock(dst_map); +	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) { +		vm_map_unlock(dst_map); +		return(KERN_INVALID_ADDRESS); +	} +	vm_map_clip_start(dst_map, tmp_entry, dst_addr); +	for (entry = tmp_entry;;) { +		vm_size_t	sub_size = (entry->vme_end - entry->vme_start); +		vm_map_entry_t	next = entry->vme_next; + +		if ( ! (entry->protection & VM_PROT_WRITE)) { +			vm_map_unlock(dst_map); +			return(KERN_PROTECTION_FAILURE); +		} + +		/* +		 *	If the entry is in transition, we must wait +		 *	for it to exit that state.  Anything could happen +		 *	when we unlock the map, so start over. +		 */ +                if (entry->in_transition) { + +                        /* +                         * Say that we are waiting, and wait for entry. +                         */ +                        entry->needs_wakeup = TRUE; +                        vm_map_entry_wait(dst_map, FALSE); + +			goto start_pass_1; +		} + +		if (size <= sub_size) +			break; + +		if ((next == vm_map_to_entry(dst_map)) || +		    (next->vme_start != entry->vme_end)) { +			vm_map_unlock(dst_map); +			return(KERN_INVALID_ADDRESS); +		} + + +		/* +		 *	Check for permanent objects in the destination. +		 */ + +		if ((entry->object.vm_object != VM_OBJECT_NULL) && +			   !entry->object.vm_object->temporary) +			contains_permanent_objects = TRUE; + +		size -= sub_size; +		entry = next; +	} + +	/* +	 *	If there are permanent objects in the destination, then +	 *	the copy cannot be interrupted. +	 */ + +	if (interruptible && contains_permanent_objects) { +		vm_map_unlock(dst_map); +		return(KERN_FAILURE);	/* XXX */ +	} + +	/* +	 * XXXO	If there are no permanent objects in the destination, +	 * XXXO and the destination map entry is not shared, +	 * XXXO	then the map entries can be deleted and replaced +	 * XXXO	with those from the copy.  The following code is the +	 * XXXO	basic idea of what to do, but there are lots of annoying +	 * XXXO	little details about getting protection and inheritance +	 * XXXO	right.  Should add protection, inheritance, and sharing checks +	 * XXXO	to the above pass and make sure that no wiring is involved. +	 */ +/* + *	if (!contains_permanent_objects) { + * + *		 * + *		 *	Run over copy and adjust entries.  Steal code + *		 *	from vm_map_copyout() to do this. + *		 * + * + *		tmp_entry = tmp_entry->vme_prev; + *		vm_map_delete(dst_map, dst_addr, dst_addr + copy->size); + *		vm_map_copy_insert(dst_map, tmp_entry, copy); + * + *		vm_map_unlock(dst_map); + *		vm_map_copy_discard(copy); + *	} + */ +	/* +	 * +	 *	Make a second pass, overwriting the data +	 *	At the beginning of each loop iteration, +	 *	the next entry to be overwritten is "tmp_entry" +	 *	(initially, the value returned from the lookup above), +	 *	and the starting address expected in that entry +	 *	is "start". 
+	 */ + +	start = dst_addr; + +	while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) { +		vm_map_entry_t	copy_entry = vm_map_copy_first_entry(copy); +		vm_size_t	copy_size = (copy_entry->vme_end - copy_entry->vme_start); +		vm_object_t	object; + +		entry = tmp_entry; +		size = (entry->vme_end - entry->vme_start); +		/* +		 *	Make sure that no holes popped up in the +		 *	address map, and that the protection is +		 *	still valid, in case the map was unlocked +		 *	earlier. +		 */ + +		if (entry->vme_start != start) { +			vm_map_unlock(dst_map); +			return(KERN_INVALID_ADDRESS); +		} +		assert(entry != vm_map_to_entry(dst_map)); + +		/* +		 *	Check protection again +		 */ + +		if ( ! (entry->protection & VM_PROT_WRITE)) { +			vm_map_unlock(dst_map); +			return(KERN_PROTECTION_FAILURE); +		} + +		/* +		 *	Adjust to source size first +		 */ + +		if (copy_size < size) { +			vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size); +			size = copy_size; +		} + +		/* +		 *	Adjust to destination size +		 */ + +		if (size < copy_size) { +			vm_map_copy_clip_end(copy, copy_entry, +				copy_entry->vme_start + size); +			copy_size = size; +		} + +		assert((entry->vme_end - entry->vme_start) == size); +		assert((tmp_entry->vme_end - tmp_entry->vme_start) == size); +		assert((copy_entry->vme_end - copy_entry->vme_start) == size); + +		/* +		 *	If the destination contains temporary unshared memory, +		 *	we can perform the copy by throwing it away and +		 *	installing the source data. +		 */ + +		object = entry->object.vm_object; +		if (!entry->is_shared && +		    ((object == VM_OBJECT_NULL) || object->temporary)) { +			vm_object_t	old_object = entry->object.vm_object; +			vm_offset_t	old_offset = entry->offset; + +			entry->object = copy_entry->object; +			entry->offset = copy_entry->offset; +			entry->needs_copy = copy_entry->needs_copy; +			vm_map_entry_reset_wired(dst_map, entry); + +			vm_map_copy_entry_unlink(copy, copy_entry); +			vm_map_copy_entry_dispose(copy, copy_entry); + +			vm_object_pmap_protect( +				old_object, +				old_offset, +				size, +				dst_map->pmap, +				tmp_entry->vme_start, +				VM_PROT_NONE); + +			vm_object_deallocate(old_object); + +			/* +			 *	Set up for the next iteration.  The map +			 *	has not been unlocked, so the next +			 *	address should be at the end of this +			 *	entry, and the next map entry should be +			 *	the one following it. +			 */ + +			start = tmp_entry->vme_end; +			tmp_entry = tmp_entry->vme_next; +		} else { +			vm_map_version_t	version; +			vm_object_t		dst_object = entry->object.vm_object; +			vm_offset_t		dst_offset = entry->offset; +			kern_return_t		r; + +			/* +			 *	Take an object reference, and record +			 *	the map version information so that the +			 *	map can be safely unlocked. 
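+			 *
+			 *	(vm_map_lock() increments the map timestamp,
+			 *	so if exactly one lock acquisition -- the one
+			 *	we perform below -- has happened by the time
+			 *	we relock, the saved tmp_entry is still valid;
+			 *	otherwise a fresh lookup is needed.)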
+			 */
+
+			vm_object_reference(dst_object);
+
+			version.main_timestamp = dst_map->timestamp;
+
+			vm_map_unlock(dst_map);
+
+			/*
+			 *	Copy as much as possible in one pass
+			 */
+
+			copy_size = size;
+			r = vm_fault_copy(
+					copy_entry->object.vm_object,
+					copy_entry->offset,
+					&copy_size,
+					dst_object,
+					dst_offset,
+					dst_map,
+					&version,
+					FALSE /* XXX interruptible */ );
+
+			/*
+			 *	Release the object reference
+			 */
+
+			vm_object_deallocate(dst_object);
+
+			/*
+			 *	If a hard error occurred, return it now
+			 */
+
+			if (r != KERN_SUCCESS)
+				return(r);
+
+			if (copy_size != 0) {
+				/*
+				 *	Dispose of the copied region
+				 */
+
+				vm_map_copy_clip_end(copy, copy_entry,
+					copy_entry->vme_start + copy_size);
+				vm_map_copy_entry_unlink(copy, copy_entry);
+				vm_object_deallocate(copy_entry->object.vm_object);
+				vm_map_copy_entry_dispose(copy, copy_entry);
+			}
+
+			/*
+			 *	Pick up in the destination map where we left off.
+			 *
+			 *	Use the version information to avoid a lookup
+			 *	in the normal case.
+			 */
+
+			start += copy_size;
+			vm_map_lock(dst_map);
+			if ((version.main_timestamp + 1) == dst_map->timestamp) {
+				/* We can safely use saved tmp_entry value */
+
+				vm_map_clip_end(dst_map, tmp_entry, start);
+				tmp_entry = tmp_entry->vme_next;
+			} else {
+				/* Must do lookup of tmp_entry */
+
+				if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
+					vm_map_unlock(dst_map);
+					return(KERN_INVALID_ADDRESS);
+				}
+				vm_map_clip_start(dst_map, tmp_entry, start);
+			}
+		}
+
+	}
+	vm_map_unlock(dst_map);
+
+	/*
+	 *	Throw away the vm_map_copy object
+	 */
+	vm_map_copy_discard(copy);
+
+	return(KERN_SUCCESS);
+}
+
+/*
+ *	Routine:	vm_map_copy_insert
+ *
+ *	Description:
+ *		Link a copy chain ("copy") into a map at the
+ *		specified location (after "where").
+ *	Side effects:
+ *		The copy chain is destroyed.
+ */
+static void
+vm_map_copy_insert(struct vm_map *map, struct vm_map_entry *where,
+		   struct vm_map_copy *copy)
+{
+	struct vm_map_entry *entry;
+
+	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
+
+	for (;;) {
+		entry = vm_map_copy_first_entry(copy);
+
+		if (entry == vm_map_copy_to_entry(copy)) {
+			break;
+		}
+
+		/*
+		 * TODO Turn copy maps into their own type so they don't
+		 * use any of the tree operations.
+		 */
+		vm_map_copy_entry_unlink(copy, entry);
+		vm_map_entry_link(map, where, entry);
+		where = entry;
+	}
+
+	kmem_cache_free(&vm_map_copy_cache, (vm_offset_t)copy);
+}
+
+/*
+ *	Routine:	vm_map_copyout
+ *
+ *	Description:
+ *		Copy out a copy chain ("copy") into newly-allocated
+ *		space in the destination map.
+ *
+ *		If successful, consumes the copy object.
+ *		Otherwise, the caller is responsible for it.
+ */
+kern_return_t vm_map_copyout(
+	vm_map_t	dst_map,
+	vm_offset_t	*dst_addr,	/* OUT */
+	vm_map_copy_t	copy)
+{
+	vm_size_t	size;
+	vm_size_t	adjustment;
+	vm_offset_t	start;
+	vm_offset_t	vm_copy_start;
+	vm_map_entry_t	last;
+	vm_map_entry_t	entry;
+	kern_return_t	kr;
+
+	/*
+	 *	Check for null copy object.
+	 */
+
+	if (copy == VM_MAP_COPY_NULL) {
+		*dst_addr = 0;
+		return(KERN_SUCCESS);
+	}
+
+	/*
+	 *	Check for special copy object, created
+	 *	by vm_map_copyin_object.
+	 */ + +	if (copy->type == VM_MAP_COPY_OBJECT) { +		vm_object_t object = copy->cpy_object; +		vm_size_t offset = copy->offset; +		vm_size_t tmp_size = copy->size; + +		*dst_addr = 0; +		kr = vm_map_enter(dst_map, dst_addr, tmp_size, +				  (vm_offset_t) 0, TRUE, +				  object, offset, FALSE, +				  VM_PROT_DEFAULT, VM_PROT_ALL, +				  VM_INHERIT_DEFAULT); +		if (kr != KERN_SUCCESS) +			return(kr); +		kmem_cache_free(&vm_map_copy_cache, (vm_offset_t) copy); +		return(KERN_SUCCESS); +	} + +	if (copy->type == VM_MAP_COPY_PAGE_LIST) +		return(vm_map_copyout_page_list(dst_map, dst_addr, copy)); + +	/* +	 *	Find space for the data +	 */ + +	vm_copy_start = trunc_page(copy->offset); +	size =	round_page(copy->offset + copy->size) - vm_copy_start; +	last = vm_map_find_entry_anywhere(dst_map, size, 0, FALSE, &start); + +	if (last == NULL) { +		vm_map_unlock(dst_map); +		return KERN_NO_SPACE; +	} + +	/* +	 *	Adjust the addresses in the copy chain, and +	 *	reset the region attributes. +	 */ + +	adjustment = start - vm_copy_start; +	for (entry = vm_map_copy_first_entry(copy); +	     entry != vm_map_copy_to_entry(copy); +	     entry = entry->vme_next) { +		entry->vme_start += adjustment; +		entry->vme_end += adjustment; + +		/* +		 * XXX There is no need to update the gap tree here. +		 * See vm_map_copy_insert. +		 */ + +		entry->inheritance = VM_INHERIT_DEFAULT; +		entry->protection = VM_PROT_DEFAULT; +		entry->max_protection = VM_PROT_ALL; +		entry->projected_on = 0; + +		/* +		 * If the entry is now wired, +		 * map the pages into the destination map. +		 */ +		if (entry->wired_count != 0) { +		    vm_offset_t 	va; +		    vm_offset_t		offset; +		    vm_object_t 	object; + +		    object = entry->object.vm_object; +		    offset = entry->offset; +		    va = entry->vme_start; + +		    pmap_pageable(dst_map->pmap, +				  entry->vme_start, +				  entry->vme_end, +				  TRUE); + +		    while (va < entry->vme_end) { +			vm_page_t	m; + +			/* +			 * Look up the page in the object. +			 * Assert that the page will be found in the +			 * top object: +			 * either +			 *	the object was newly created by +			 *	vm_object_copy_slowly, and has +			 *	copies of all of the pages from +			 *	the source object +			 * or +			 *	the object was moved from the old +			 *	map entry; because the old map +			 *	entry was wired, all of the pages +			 *	were in the top-level object. 
+			 *	(XXX not true if we wire pages for
+			 *	 reading)
+			 */
+			vm_object_lock(object);
+			vm_object_paging_begin(object);
+
+			m = vm_page_lookup(object, offset);
+			if (m == VM_PAGE_NULL || m->wire_count == 0 ||
+			    m->absent)
+			    panic("vm_map_copyout: wiring %p", m);
+
+			m->busy = TRUE;
+			vm_object_unlock(object);
+
+			PMAP_ENTER(dst_map->pmap, va, m,
+				   entry->protection, TRUE);
+
+			vm_object_lock(object);
+			PAGE_WAKEUP_DONE(m);
+			/* the page is wired, so we don't have to activate */
+			vm_object_paging_end(object);
+			vm_object_unlock(object);
+
+			offset += PAGE_SIZE;
+			va += PAGE_SIZE;
+		    }
+		}
+
+
+	}
+
+	/*
+	 *	Correct the page alignment for the result
+	 */
+
+	*dst_addr = start + (copy->offset - vm_copy_start);
+
+	/*
+	 *	Update the hints and the map size
+	 */
+
+	if (dst_map->first_free == last)
+		dst_map->first_free = vm_map_copy_last_entry(copy);
+	SAVE_HINT(dst_map, vm_map_copy_last_entry(copy));
+
+	dst_map->size += size;
+
+	/*
+	 *	Link in the copy
+	 */
+
+	vm_map_copy_insert(dst_map, last, copy);
+
+	if (dst_map->wiring_required) {
+		/* Returns with the map read-locked if successful */
+		kr = vm_map_pageable(dst_map, start, start + size,
+				     VM_PROT_READ | VM_PROT_WRITE,
+				     FALSE, FALSE);
+
+		if (kr != KERN_SUCCESS) {
+			vm_map_unlock(dst_map);
+			return kr;
+		}
+	}
+
+	vm_map_unlock(dst_map);
+
+	return(KERN_SUCCESS);
+}
+
+/*
+ *
+ *	vm_map_copyout_page_list:
+ *
+ *	Version of vm_map_copyout() for page list vm map copies.
+ *
+ */
+kern_return_t vm_map_copyout_page_list(
+	vm_map_t	dst_map,
+	vm_offset_t	*dst_addr,	/* OUT */
+	vm_map_copy_t	copy)
+{
+	vm_size_t	size;
+	vm_offset_t	start;
+	vm_offset_t	end;
+	vm_offset_t	offset;
+	vm_map_entry_t	last;
+	vm_object_t	object;
+	vm_page_t	*page_list, m;
+	vm_map_entry_t	entry;
+	vm_offset_t	old_last_offset;
+	boolean_t	cont_invoked, needs_wakeup = FALSE;
+	kern_return_t	result = KERN_SUCCESS;
+	vm_map_copy_t	orig_copy;
+	vm_offset_t	dst_offset;
+	boolean_t	must_wire;
+
+	/*
+	 *	Make sure the pages are stolen, because we are
+	 *	going to put them in a new object.  Assume that
+	 *	all pages are identical to first in this regard.
+	 */
+
+	page_list = &copy->cpy_page_list[0];
+	if ((*page_list)->tabled)
+		vm_map_copy_steal_pages(copy);
+
+	/*
+	 *	Find space for the data
+	 */
+
+	size =	round_page(copy->offset + copy->size) -
+		trunc_page(copy->offset);
+
+	vm_map_lock(dst_map);
+
+	last = vm_map_find_entry_anywhere(dst_map, size, 0, TRUE, &start);
+
+	if (last == NULL) {
+		vm_map_unlock(dst_map);
+		return KERN_NO_SPACE;
+	}
+
+	end = start + size;
+
+	must_wire = dst_map->wiring_required;
+
+	/*
+	 *	See whether we can avoid creating a new entry (and object) by
+	 *	extending one of our neighbors.  [So far, we only attempt to
+	 *	extend from below.]
+	 *
+	 *	The code path below here is a bit twisted.  If any of the
+	 *	extension checks fails, we branch to create_object.  If
+	 *	it all works, we fall out the bottom and goto insert_pages.
+	 */
+	if (last == vm_map_to_entry(dst_map) ||
+	    last->vme_end != start ||
+	    last->is_shared != FALSE ||
+	    last->is_sub_map != FALSE ||
+	    last->inheritance != VM_INHERIT_DEFAULT ||
+	    last->protection != VM_PROT_DEFAULT ||
+	    last->max_protection != VM_PROT_ALL ||
+	    (must_wire ? (last->wired_count == 0)
+		       : (last->wired_count != 0))) {
+		    goto create_object;
+	}
+
+	/*
+	 * If this entry needs an object, make one.
+	 */ +	if (last->object.vm_object == VM_OBJECT_NULL) { +		object = vm_object_allocate( +			(vm_size_t)(last->vme_end - last->vme_start + size)); +		last->object.vm_object = object; +		last->offset = 0; +		vm_object_lock(object); +	} +	else { +	    vm_offset_t	prev_offset = last->offset; +	    vm_size_t	prev_size = start - last->vme_start; +	    vm_size_t	new_size; + +	    /* +	     *	This is basically vm_object_coalesce. +	     */ + +	    object = last->object.vm_object; +	    vm_object_lock(object); + +	    /* +	     *	Try to collapse the object first +	     */ +	    vm_object_collapse(object); + +	    /* +	     *	Can't coalesce if pages not mapped to +	     *	last may be in use anyway: +	     *	. more than one reference +	     *	. paged out +	     *	. shadows another object +	     *	. has a copy elsewhere +	     *	. paging references (pages might be in page-list) +	     */ + +	    if ((object->ref_count > 1) || +		object->pager_created || +		(object->shadow != VM_OBJECT_NULL) || +		(object->copy != VM_OBJECT_NULL) || +		(object->paging_in_progress != 0)) { +		    vm_object_unlock(object); +		    goto create_object; +	    } + +	    /* +	     *	Extend the object if necessary.  Don't have to call +	     *  vm_object_page_remove because the pages aren't mapped, +	     *	and vm_page_replace will free up any old ones it encounters. +	     */ +	    new_size = prev_offset + prev_size + size; +	    if (new_size > object->size) +		object->size = new_size; +        } + +	/* +	 *	Coalesced the two objects - can extend +	 *	the previous map entry to include the +	 *	new range. +	 */ +	dst_map->size += size; +	last->vme_end = end; +	vm_map_gap_update(&dst_map->hdr, last); + +	SAVE_HINT(dst_map, last); + +	goto insert_pages; + +create_object: + +	/* +	 *	Create object +	 */ +	object = vm_object_allocate(size); + +	/* +	 *	Create entry +	 */ + +	entry = vm_map_entry_create(dst_map); + +	entry->object.vm_object = object; +	entry->offset = 0; + +	entry->is_shared = FALSE; +	entry->is_sub_map = FALSE; +	entry->needs_copy = FALSE; +	entry->wired_count = 0; + +	if (must_wire) { +		vm_map_entry_inc_wired(dst_map, entry); +		entry->wired_access = VM_PROT_DEFAULT; +	} else { +		entry->wired_access = VM_PROT_NONE; +	} + +	entry->in_transition = TRUE; +	entry->needs_wakeup = FALSE; + +	entry->vme_start = start; +	entry->vme_end = start + size; + +	entry->inheritance = VM_INHERIT_DEFAULT; +	entry->protection = VM_PROT_DEFAULT; +	entry->max_protection = VM_PROT_ALL; +	entry->projected_on = 0; + +	vm_object_lock(object); + +	/* +	 *	Update the hints and the map size +	 */ +	if (dst_map->first_free == last) { +		dst_map->first_free = entry; +	} +	SAVE_HINT(dst_map, entry); +	dst_map->size += size; + +	/* +	 *	Link in the entry +	 */ +	vm_map_entry_link(dst_map, last, entry); +	last = entry; + +	/* +	 *	Transfer pages into new object. +	 *	Scan page list in vm_map_copy. +	 */ +insert_pages: +	dst_offset = copy->offset & PAGE_MASK; +	cont_invoked = FALSE; +	orig_copy = copy; +	last->in_transition = TRUE; +	old_last_offset = last->offset +	    + (start - last->vme_start); + +	vm_page_lock_queues(); + +	for (offset = 0; offset < size; offset += PAGE_SIZE) { +		m = *page_list; +		assert(m && !m->tabled); + +		/* +		 *	Must clear busy bit in page before inserting it. +		 *	Ok to skip wakeup logic because nobody else +		 *	can possibly know about this page. +		 *	The page is dirty in its new object. 
+		 */
+
+		assert(!m->wanted);
+
+		m->busy = FALSE;
+		m->dirty = TRUE;
+		vm_page_replace(m, object, old_last_offset + offset);
+		if (must_wire) {
+			vm_page_wire(m);
+			PMAP_ENTER(dst_map->pmap,
+				   last->vme_start + m->offset - last->offset,
+				   m, last->protection, TRUE);
+		} else {
+			vm_page_activate(m);
+		}
+
+		*page_list++ = VM_PAGE_NULL;
+		if (--(copy->cpy_npages) == 0 &&
+		    vm_map_copy_has_cont(copy)) {
+			vm_map_copy_t	new_copy;
+
+			/*
+			 *	Ok to unlock map because entry is
+			 *	marked in_transition.
+			 */
+			cont_invoked = TRUE;
+			vm_page_unlock_queues();
+			vm_object_unlock(object);
+			vm_map_unlock(dst_map);
+			vm_map_copy_invoke_cont(copy, &new_copy, &result);
+
+			if (result == KERN_SUCCESS) {
+
+				/*
+				 *	If we got back a copy with real pages,
+				 *	steal them now.  Either all of the
+				 *	pages in the list are tabled or none
+				 *	of them are; mixtures are not possible.
+				 *
+				 *	Save original copy for consume on
+				 *	success logic at end of routine.
+				 */
+				if (copy != orig_copy)
+					vm_map_copy_discard(copy);
+
+				if ((copy = new_copy) != VM_MAP_COPY_NULL) {
+					page_list = &copy->cpy_page_list[0];
+					if ((*page_list)->tabled)
+				    		vm_map_copy_steal_pages(copy);
+				}
+			}
+			else {
+				/*
+				 *	Continuation failed.
+				 */
+				vm_map_lock(dst_map);
+				goto error;
+			}
+
+			vm_map_lock(dst_map);
+			vm_object_lock(object);
+			vm_page_lock_queues();
+		}
+	}
+
+	vm_page_unlock_queues();
+	vm_object_unlock(object);
+
+	*dst_addr = start + dst_offset;
+
+	/*
+	 *	Clear the in transition bits.  This is easy if we
+	 *	didn't have a continuation.
+	 */
+error:
+	if (!cont_invoked) {
+		/*
+		 *	We didn't unlock the map, so nobody could
+		 *	be waiting.
+		 */
+		last->in_transition = FALSE;
+		assert(!last->needs_wakeup);
+		needs_wakeup = FALSE;
+	}
+	else {
+		if (!vm_map_lookup_entry(dst_map, start, &entry))
+			panic("vm_map_copyout_page_list: missing entry");
+
+                /*
+                 * Clear transition bit for all constituent entries that
+                 * were in the original entry.  Also check for waiters.
+                 */
+                while((entry != vm_map_to_entry(dst_map)) &&
+                      (entry->vme_start < end)) {
+                        assert(entry->in_transition);
+                        entry->in_transition = FALSE;
+                        if(entry->needs_wakeup) {
+                                entry->needs_wakeup = FALSE;
+                                needs_wakeup = TRUE;
+                        }
+                        entry = entry->vme_next;
+                }
+	}
+
+	if (result != KERN_SUCCESS)
+		vm_map_delete(dst_map, start, end);
+
+	vm_map_unlock(dst_map);
+
+	if (needs_wakeup)
+		vm_map_entry_wakeup(dst_map);
+
+	/*
+	 *	Consume on success logic.
+	 */
+	if (copy != orig_copy) {
+		kmem_cache_free(&vm_map_copy_cache, (vm_offset_t) copy);
+	}
+	if (result == KERN_SUCCESS) {
+		kmem_cache_free(&vm_map_copy_cache, (vm_offset_t) orig_copy);
+	}
+
+	return(result);
+}
+
+/*
+ *	Routine:	vm_map_copyin
+ *
+ *	Description:
+ *		Copy the specified region (src_addr, len) from the
+ *		source address space (src_map), possibly removing
+ *		the region from the source address space (src_destroy).
+ *
+ *	Returns:
+ *		A vm_map_copy_t object (copy_result), suitable for
+ *		insertion into another address space (using vm_map_copyout),
+ *		copying over another address space region (using
+ *		vm_map_copy_overwrite().  If the copy is unused, it
+ *		should be destroyed (using vm_map_copy_discard).
+ *
+ *	In/out conditions:
+ *		The source map should not be locked on entry.
+ */
+kern_return_t vm_map_copyin(
+	vm_map_t	src_map,
+	vm_offset_t	src_addr,
+	vm_size_t	len,
+	boolean_t	src_destroy,
+	vm_map_copy_t	*copy_result)	/* OUT */
+{
+	vm_map_entry_t	tmp_entry;	/* Result of last map lookup --
+					 * in multi-level lookup, this
+					 * entry contains the actual
+					 * vm_object/offset.
+					 */
+
+	vm_offset_t	src_start;	/* Start of current entry --
+					 * where copy is taking place now
+					 */
+	vm_offset_t	src_end;	/* End of entire region to be
+					 * copied */
+
+	vm_map_copy_t	copy;		/* Resulting copy */
+
+	/*
+	 *	Check for copies of zero bytes.
+	 */
+
+	if (len == 0) {
+		*copy_result = VM_MAP_COPY_NULL;
+		return(KERN_SUCCESS);
+	}
+
+	/*
+	 *	Check that the end address doesn't overflow
+	 */
+
+	if ((src_addr + len) <= src_addr) {
+		return KERN_INVALID_ADDRESS;
+	}
+
+	/*
+	 *	Compute start and end of region
+	 */
+
+	src_start = trunc_page(src_addr);
+	src_end = round_page(src_addr + len);
+
+	/*
+	 *	XXX VM maps shouldn't end at maximum address
+	 */
+
+	if (src_end == 0) {
+		return KERN_INVALID_ADDRESS;
+	}
+
+	/*
+	 *	Allocate a header element for the list.
+	 *
+	 *	Use the start and end in the header to
+	 *	remember the endpoints prior to rounding.
+	 */
+
+	copy = (vm_map_copy_t) kmem_cache_alloc(&vm_map_copy_cache);
+	vm_map_copy_first_entry(copy) =
+	 vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy);
+	copy->type = VM_MAP_COPY_ENTRY_LIST;
+	copy->cpy_hdr.nentries = 0;
+	rbtree_init(&copy->cpy_hdr.tree);
+	rbtree_init(&copy->cpy_hdr.gap_tree);
+
+	copy->offset = src_addr;
+	copy->size = len;
+
+#define	RETURN(x)						\
+	MACRO_BEGIN						\
+	vm_map_unlock(src_map);					\
+	vm_map_copy_discard(copy);				\
+	MACRO_RETURN(x);					\
+	MACRO_END
+
+	/*
+	 *	Find the beginning of the region.
+	 */
+
+ 	vm_map_lock(src_map);
+
+	if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry))
+		RETURN(KERN_INVALID_ADDRESS);
+	vm_map_clip_start(src_map, tmp_entry, src_start);
+
+	/*
+	 *	Go through entries until we get to the end.
+	 */
+
+	while (TRUE) {
+		vm_map_entry_t	src_entry = tmp_entry;	/* Top-level entry */
+		vm_size_t	src_size;		/* Size of source
+							 * map entry (in both
+							 * maps)
+							 */
+
+		vm_object_t	src_object;		/* Object to copy */
+		vm_offset_t	src_offset;
+
+		boolean_t	src_needs_copy;		/* Should source map
+							 * be made read-only
+							 * for copy-on-write?
+							 */
+
+		vm_map_entry_t	new_entry;		/* Map entry for copy */
+		boolean_t	new_entry_needs_copy;	/* Will new entry be COW? */
+
+		boolean_t	was_wired;		/* Was source wired? */
+		vm_map_version_t version;		/* Version before locks
+							 * dropped to make copy
+							 */
+
+		/*
+		 *	Verify that the region can be read.
+		 */
+
+		if (! (src_entry->protection & VM_PROT_READ))
+			RETURN(KERN_PROTECTION_FAILURE);
+
+		/*
+		 *	Clip against the endpoints of the entire region.
+		 */
+
+		vm_map_clip_end(src_map, src_entry, src_end);
+
+		src_size = src_entry->vme_end - src_start;
+		src_object = src_entry->object.vm_object;
+		src_offset = src_entry->offset;
+		was_wired = (src_entry->wired_count != 0);
+
+		/*
+		 *	Create a new address map entry to
+		 *	hold the result.  Fill in the fields from
+		 *	the appropriate source entries.
+		 */ + +		new_entry = vm_map_copy_entry_create(copy); +		vm_map_entry_copy(new_entry, src_entry); + +		/* +		 *	Attempt non-blocking copy-on-write optimizations. +		 */ + +		if (src_destroy && +		    (src_object == VM_OBJECT_NULL || +		     (src_object->temporary && !src_object->use_shared_copy))) +		{ +		    /* +		     * If we are destroying the source, and the object +		     * is temporary, and not shared writable, +		     * we can move the object reference +		     * from the source to the copy.  The copy is +		     * copy-on-write only if the source is. +		     * We make another reference to the object, because +		     * destroying the source entry will deallocate it. +		     */ +		    vm_object_reference(src_object); + +		    /* +		     * Copy is always unwired.  vm_map_copy_entry +		     * set its wired count to zero. +		     */ + +		    goto CopySuccessful; +		} + +		if (!was_wired && +		    vm_object_copy_temporary( +				&new_entry->object.vm_object, +				&new_entry->offset, +				&src_needs_copy, +				&new_entry_needs_copy)) { + +			new_entry->needs_copy = new_entry_needs_copy; + +			/* +			 *	Handle copy-on-write obligations +			 */ + +			if (src_needs_copy && !tmp_entry->needs_copy) { +				vm_object_pmap_protect( +					src_object, +					src_offset, +					src_size, +			      		(src_entry->is_shared ? PMAP_NULL +						: src_map->pmap), +					src_entry->vme_start, +					src_entry->protection & +						~VM_PROT_WRITE); + +				tmp_entry->needs_copy = TRUE; +			} + +			/* +			 *	The map has never been unlocked, so it's safe to +			 *	move to the next entry rather than doing another +			 *	lookup. +			 */ + +			goto CopySuccessful; +		} + +		new_entry->needs_copy = FALSE; + +		/* +		 *	Take an object reference, so that we may +		 *	release the map lock(s). +		 */ + +		assert(src_object != VM_OBJECT_NULL); +		vm_object_reference(src_object); + +		/* +		 *	Record the timestamp for later verification. +		 *	Unlock the map. +		 */ + +		version.main_timestamp = src_map->timestamp; +		vm_map_unlock(src_map); + +		/* +		 *	Perform the copy +		 */ + +		if (was_wired) { +			vm_object_lock(src_object); +			(void) vm_object_copy_slowly( +					src_object, +					src_offset, +					src_size, +					FALSE, +					&new_entry->object.vm_object); +			new_entry->offset = 0; +			new_entry->needs_copy = FALSE; +		} else { +			kern_return_t	result; + +			result = vm_object_copy_strategically(src_object, +				src_offset, +				src_size, +				&new_entry->object.vm_object, +				&new_entry->offset, +				&new_entry_needs_copy); + +			new_entry->needs_copy = new_entry_needs_copy; + + +			if (result != KERN_SUCCESS) { +				vm_map_copy_entry_dispose(copy, new_entry); + +				vm_map_lock(src_map); +				RETURN(result); +			} + +		} + +		/* +		 *	Throw away the extra reference +		 */ + +		vm_object_deallocate(src_object); + +		/* +		 *	Verify that the map has not substantially +		 *	changed while the copy was being made. +		 */ + +		vm_map_lock(src_map);	/* Increments timestamp once! */ + +		if ((version.main_timestamp + 1) == src_map->timestamp) +			goto CopySuccessful; + +		/* +		 *	Simple version comparison failed. +		 * +		 *	Retry the lookup and verify that the +		 *	same object/offset are still present. +		 * +		 *	[Note: a memory manager that colludes with +		 *	the calling task can detect that we have +		 *	cheated.  While the map was unlocked, the +		 *	mapping could have been changed and restored.] 
+		 */ + +		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) { +			vm_map_copy_entry_dispose(copy, new_entry); +			RETURN(KERN_INVALID_ADDRESS); +		} + +		src_entry = tmp_entry; +		vm_map_clip_start(src_map, src_entry, src_start); + +		if ((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) +			goto VerificationFailed; + +		if (src_entry->vme_end < new_entry->vme_end) +			src_size = (new_entry->vme_end = src_entry->vme_end) - src_start; + +		if ((src_entry->object.vm_object != src_object) || +		    (src_entry->offset != src_offset) ) { + +			/* +			 *	Verification failed. +			 * +			 *	Start over with this top-level entry. +			 */ + +		 VerificationFailed: ; + +			vm_object_deallocate(new_entry->object.vm_object); +			vm_map_copy_entry_dispose(copy, new_entry); +			tmp_entry = src_entry; +			continue; +		} + +		/* +		 *	Verification succeeded. +		 */ + +	 CopySuccessful: ; + +		/* +		 *	Link in the new copy entry. +		 */ + +		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), +				       new_entry); + +		/* +		 *	Determine whether the entire region +		 *	has been copied. +		 */ +		src_start = new_entry->vme_end; +		if ((src_start >= src_end) && (src_end != 0)) +			break; + +		/* +		 *	Verify that there are no gaps in the region +		 */ + +		tmp_entry = src_entry->vme_next; +		if (tmp_entry->vme_start != src_start) +			RETURN(KERN_INVALID_ADDRESS); +	} + +	/* +	 * If the source should be destroyed, do it now, since the +	 * copy was successful. +	 */ +	if (src_destroy) +	    (void) vm_map_delete(src_map, trunc_page(src_addr), src_end); + +	vm_map_unlock(src_map); + +	*copy_result = copy; +	return(KERN_SUCCESS); + +#undef	RETURN +} + +/* + *	vm_map_copyin_object: + * + *	Create a copy object from an object. + *	Our caller donates an object reference. + */ + +kern_return_t vm_map_copyin_object( +	vm_object_t	object, +	vm_offset_t	offset,		/* offset of region in object */ +	vm_size_t	size,		/* size of region in object */ +	vm_map_copy_t	*copy_result)	/* OUT */ +{ +	vm_map_copy_t	copy;		/* Resulting copy */ + +	/* +	 *	We drop the object into a special copy object +	 *	that contains the object directly.  These copy objects +	 *	are distinguished by links. +	 */ + +	copy = (vm_map_copy_t) kmem_cache_alloc(&vm_map_copy_cache); +	vm_map_copy_first_entry(copy) = +	 vm_map_copy_last_entry(copy) = VM_MAP_ENTRY_NULL; +	copy->type = VM_MAP_COPY_OBJECT; +	copy->cpy_object = object; +	copy->offset = offset; +	copy->size = size; + +	*copy_result = copy; +	return(KERN_SUCCESS); +} + +/* + *	vm_map_copyin_page_list_cont: + * + *	Continuation routine for vm_map_copyin_page_list. + * + *	If vm_map_copyin_page_list can't fit the entire vm range + *	into a single page list object, it creates a continuation. + *	When the target of the operation has used the pages in the + *	initial page list, it invokes the continuation, which calls + *	this routine.  If an error happens, the continuation is aborted + *	(abort arg to this routine is TRUE).  To avoid deadlocks, the + *	pages are discarded from the initial page list before invoking + *	the continuation. + * + *	NOTE: This is not the same sort of continuation used by + *	the scheduler. + */ + +static kern_return_t	vm_map_copyin_page_list_cont( +	vm_map_copyin_args_t	cont_args, +	vm_map_copy_t		*copy_result)	/* OUT */ +{ +	kern_return_t	result = 0; /* '=0' to quiet gcc warnings */ +	boolean_t	do_abort, src_destroy, src_destroy_only; + +	/* +	 *	Check for cases that only require memory destruction. 
+	 */ +	do_abort = (copy_result == (vm_map_copy_t *) 0); +	src_destroy = (cont_args->destroy_len != (vm_size_t) 0); +	src_destroy_only = (cont_args->src_len == (vm_size_t) 0); + +	if (do_abort || src_destroy_only) { +		if (src_destroy) +			result = vm_map_remove(cont_args->map, +			    cont_args->destroy_addr, +			    cont_args->destroy_addr + cont_args->destroy_len); +		if (!do_abort) +			*copy_result = VM_MAP_COPY_NULL; +	} +	else { +		result = vm_map_copyin_page_list(cont_args->map, +			cont_args->src_addr, cont_args->src_len, src_destroy, +			cont_args->steal_pages, copy_result, TRUE); + +		if (src_destroy && !cont_args->steal_pages && +			vm_map_copy_has_cont(*copy_result)) { +			    vm_map_copyin_args_t	new_args; +		    	    /* +			     *	Transfer old destroy info. +			     */ +			    new_args = (vm_map_copyin_args_t) +			    		(*copy_result)->cpy_cont_args; +		            new_args->destroy_addr = cont_args->destroy_addr; +		            new_args->destroy_len = cont_args->destroy_len; +		} +	} + +	vm_map_deallocate(cont_args->map); +	kfree((vm_offset_t)cont_args, sizeof(vm_map_copyin_args_data_t)); + +	return(result); +} + +/* + *	vm_map_copyin_page_list: + * + *	This is a variant of vm_map_copyin that copies in a list of pages. + *	If steal_pages is TRUE, the pages are only in the returned list. + *	If steal_pages is FALSE, the pages are busy and still in their + *	objects.  A continuation may be returned if not all the pages fit: + *	the recipient of this copy_result must be prepared to deal with it. + */ + +kern_return_t vm_map_copyin_page_list( +	vm_map_t	src_map, +	vm_offset_t	src_addr, +	vm_size_t	len, +	boolean_t	src_destroy, +	boolean_t	steal_pages, +	vm_map_copy_t	*copy_result,	/* OUT */ +	boolean_t	is_cont) +{ +	vm_map_entry_t	src_entry; +	vm_page_t 	m; +	vm_offset_t	src_start; +	vm_offset_t	src_end; +	vm_size_t	src_size; +	vm_object_t	src_object; +	vm_offset_t	src_offset; +	vm_offset_t	src_last_offset; +	vm_map_copy_t	copy;		/* Resulting copy */ +	kern_return_t	result = KERN_SUCCESS; +	boolean_t	need_map_lookup; +        vm_map_copyin_args_t	cont_args; + +	/* +	 * 	If steal_pages is FALSE, this leaves busy pages in +	 *	the object.  A continuation must be used if src_destroy +	 *	is true in this case (!steal_pages && src_destroy). +	 * +	 * XXX	Still have a more general problem of what happens +	 * XXX	if the same page occurs twice in a list.  Deadlock +	 * XXX	can happen if vm_fault_page was called.  A +	 * XXX	possible solution is to use a continuation if vm_fault_page +	 * XXX	is called and we cross a map entry boundary. +	 */ + +	/* +	 *	Check for copies of zero bytes. +	 */ + +	if (len == 0) { +		*copy_result = VM_MAP_COPY_NULL; +		return(KERN_SUCCESS); +	} + +	/* +	 *	Check that the end address doesn't overflow +	 */ + +	if ((src_addr + len) <= src_addr) { +		return KERN_INVALID_ADDRESS; +	} + +	/* +	 *	Compute start and end of region +	 */ + +	src_start = trunc_page(src_addr); +	src_end = round_page(src_addr + len); + +	/* +	 *	XXX VM maps shouldn't end at maximum address +	 */ + +	if (src_end == 0) { +		return KERN_INVALID_ADDRESS; +	} + +	/* +	 *	Allocate a header element for the page list. +	 * +	 *	Record original offset and size, as caller may not +	 *      be page-aligned. 
+	 */ + +	copy = (vm_map_copy_t) kmem_cache_alloc(&vm_map_copy_cache); +	copy->type = VM_MAP_COPY_PAGE_LIST; +	copy->cpy_npages = 0; +	copy->offset = src_addr; +	copy->size = len; +	copy->cpy_cont = ((kern_return_t (*)()) 0); +	copy->cpy_cont_args = VM_MAP_COPYIN_ARGS_NULL; + +	/* +	 *	Find the beginning of the region. +	 */ + +do_map_lookup: + + 	vm_map_lock(src_map); + +	if (!vm_map_lookup_entry(src_map, src_start, &src_entry)) { +		result = KERN_INVALID_ADDRESS; +		goto error; +	} +	need_map_lookup = FALSE; + +	/* +	 *	Go through entries until we get to the end. +	 */ + +	while (TRUE) { + +		if (! (src_entry->protection & VM_PROT_READ)) { +			result = KERN_PROTECTION_FAILURE; +			goto error; +		} + +		if (src_end > src_entry->vme_end) +			src_size = src_entry->vme_end - src_start; +		else +			src_size = src_end - src_start; + +		src_object = src_entry->object.vm_object; +		src_offset = src_entry->offset + +				(src_start - src_entry->vme_start); + +		/* +		 *	If src_object is NULL, allocate it now; +		 *	we're going to fault on it shortly. +		 */ +		if (src_object == VM_OBJECT_NULL) { +			src_object = vm_object_allocate((vm_size_t) +				src_entry->vme_end - +				src_entry->vme_start); +			src_entry->object.vm_object = src_object; +		} + +		/* +		 * Iterate over pages.  Fault in ones that aren't present. +		 */ +		src_last_offset = src_offset + src_size; +		for (; (src_offset < src_last_offset && !need_map_lookup); +		       src_offset += PAGE_SIZE, src_start += PAGE_SIZE) { + +			if (copy->cpy_npages == VM_MAP_COPY_PAGE_LIST_MAX) { +make_continuation: +			    /* +			     *	At this point we have the max number of +			     *  pages busy for this thread that we're +			     *  willing to allow.  Stop here and record +			     *  arguments for the remainder.  Note: +			     *  this means that this routine isn't atomic, +			     *  but that's the breaks.  Note that only +			     *  the first vm_map_copy_t that comes back +			     *  from this routine has the right offset +			     *  and size; those from continuations are +			     *  page rounded, and short by the amount +			     *	already done. +			     * +			     *	Reset src_end so the src_destroy +			     *	code at the bottom doesn't do +			     *	something stupid. +			     */ + +			    cont_args = (vm_map_copyin_args_t) +			    	    kalloc(sizeof(vm_map_copyin_args_data_t)); +			    cont_args->map = src_map; +			    vm_map_reference(src_map); +			    cont_args->src_addr = src_start; +			    cont_args->src_len = len - (src_start - src_addr); +			    if (src_destroy) { +			    	cont_args->destroy_addr = cont_args->src_addr; +				cont_args->destroy_len = cont_args->src_len; +			    } +			    else { +			    	cont_args->destroy_addr = (vm_offset_t) 0; +				cont_args->destroy_len = (vm_offset_t) 0; +			    } +			    cont_args->steal_pages = steal_pages; + +			    copy->cpy_cont_args = cont_args; +			    copy->cpy_cont = vm_map_copyin_page_list_cont; + +			    src_end = src_start; +			    vm_map_clip_end(src_map, src_entry, src_end); +			    break; +			} + +			/* +			 *	Try to find the page of data. +			 */ +			vm_object_lock(src_object); +			vm_object_paging_begin(src_object); +			if (((m = vm_page_lookup(src_object, src_offset)) != +			    VM_PAGE_NULL) && !m->busy && !m->fictitious && +			    !m->absent && !m->error) { + +				/* +				 *	This is the page.  Mark it busy +				 *	and keep the paging reference on +				 *	the object whilst we do our thing. 
+				 */ +				m->busy = TRUE; + +				/* +				 *	Also write-protect the page, so +				 *	that the map`s owner cannot change +				 *	the data.  The busy bit will prevent +				 *	faults on the page from succeeding +				 *	until the copy is released; after +				 *	that, the page can be re-entered +				 *	as writable, since we didn`t alter +				 *	the map entry.  This scheme is a +				 *	cheap copy-on-write. +				 * +				 *	Don`t forget the protection and +				 *	the page_lock value! +				 * +				 *	If the source is being destroyed +				 *	AND not shared writable, we don`t +				 *	have to protect the page, since +				 *	we will destroy the (only) +				 *	writable mapping later. +				 */ +				if (!src_destroy || +				    src_object->use_shared_copy) +				{ +				    pmap_page_protect(m->phys_addr, +						  src_entry->protection +						& ~m->page_lock +						& ~VM_PROT_WRITE); +				} + +			} +			else { +				vm_prot_t result_prot; +				vm_page_t top_page; +				kern_return_t kr; + +				/* +				 *	Have to fault the page in; must +				 *	unlock the map to do so.  While +				 *	the map is unlocked, anything +				 *	can happen, we must lookup the +				 *	map entry before continuing. +				 */ +				vm_map_unlock(src_map); +				need_map_lookup = TRUE; +retry: +				result_prot = VM_PROT_READ; + +				kr = vm_fault_page(src_object, src_offset, +						   VM_PROT_READ, FALSE, FALSE, +						   &result_prot, &m, &top_page, +						   FALSE, (void (*)()) 0); +				/* +				 *	Cope with what happened. +				 */ +				switch (kr) { +				case VM_FAULT_SUCCESS: +					break; +				case VM_FAULT_INTERRUPTED: /* ??? */ +			        case VM_FAULT_RETRY: +					vm_object_lock(src_object); +					vm_object_paging_begin(src_object); +					goto retry; +				case VM_FAULT_MEMORY_SHORTAGE: +					VM_PAGE_WAIT((void (*)()) 0); +					vm_object_lock(src_object); +					vm_object_paging_begin(src_object); +					goto retry; +				case VM_FAULT_FICTITIOUS_SHORTAGE: +					vm_page_more_fictitious(); +					vm_object_lock(src_object); +					vm_object_paging_begin(src_object); +					goto retry; +				case VM_FAULT_MEMORY_ERROR: +					/* +					 *	Something broke.  If this +					 *	is a continuation, return +					 *	a partial result if possible, +					 *	else fail the whole thing. +					 *	In the continuation case, the +					 *	next continuation call will +					 *	get this error if it persists. +					 */ +					vm_map_lock(src_map); +					if (is_cont && +					    copy->cpy_npages != 0) +						goto make_continuation; + +					result = KERN_MEMORY_ERROR; +					goto error; +				} + +				if (top_page != VM_PAGE_NULL) { +					vm_object_lock(src_object); +					VM_PAGE_FREE(top_page); +					vm_object_paging_end(src_object); +					vm_object_unlock(src_object); +				 } + +				 /* +				  *	We do not need to write-protect +				  *	the page, since it cannot have +				  *	been in the pmap (and we did not +				  *	enter it above).  The busy bit +				  *	will protect the page from being +				  *	entered as writable until it is +				  *	unlocked. +				  */ + +			} + +			/* +			 *	The page is busy, its object is locked, and +			 *	we have a paging reference on it.  Either +			 *	the map is locked, or need_map_lookup is +			 *	TRUE. +			 * +			 *	Put the page in the page list. +			 */ +			copy->cpy_page_list[copy->cpy_npages++] = m; +			vm_object_unlock(m->object); +		} + +		/* +		 *	DETERMINE whether the entire region +		 *	has been copied. 
+		 */ +		if (src_start >= src_end && src_end != 0) { +			if (need_map_lookup) +				vm_map_lock(src_map); +			break; +		} + +		/* +		 *	If need_map_lookup is TRUE, have to start over with +		 *	another map lookup.  Note that we dropped the map +		 *	lock (to call vm_fault_page) above only in this case. +		 */ +		if (need_map_lookup) +			goto do_map_lookup; + +		/* +		 *	Verify that there are no gaps in the region +		 */ + +		src_start = src_entry->vme_end; +		src_entry = src_entry->vme_next; +		if (src_entry->vme_start != src_start) { +			result = KERN_INVALID_ADDRESS; +			goto error; +		} +	} + +	/* +	 *	If steal_pages is true, make sure all +	 *	pages in the copy are not in any object +	 *	We try to remove them from the original +	 *	object, but we may have to copy them. +	 * +	 *	At this point every page in the list is busy +	 *	and holds a paging reference to its object. +	 *	When we're done stealing, every page is busy, +	 *	and in no object (m->tabled == FALSE). +	 */ +	src_start = trunc_page(src_addr); +	if (steal_pages) { +		int 		i; +		vm_offset_t	unwire_end; + +		unwire_end = src_start; +		for (i = 0; i < copy->cpy_npages; i++) { + +			/* +			 *	Remove the page from its object if it +			 *	can be stolen.  It can be stolen if: + 			 * +			 *	(1) The source is being destroyed, +			 *	      the object is temporary, and +			 *	      not shared. +			 *	(2) The page is not precious. +			 * +			 *	The not shared check consists of two +			 *	parts:  (a) there are no objects that +			 *	shadow this object.  (b) it is not the +			 *	object in any shared map entries (i.e., +			 *	use_shared_copy is not set). +			 * +			 *	The first check (a) means that we can't +			 *	steal pages from objects that are not +			 *	at the top of their shadow chains.  This +			 *	should not be a frequent occurrence. +			 * +			 *	Stealing wired pages requires telling the +			 *	pmap module to let go of them. +			 * +			 *	NOTE: stealing clean pages from objects +			 *  	whose mappings survive requires a call to +			 *	the pmap module.  Maybe later. + 			 */ +			m = copy->cpy_page_list[i]; +			src_object = m->object; +			vm_object_lock(src_object); + +			if (src_destroy && +			    src_object->temporary && +			    (!src_object->shadowed) && +			    (!src_object->use_shared_copy) && +			    !m->precious) { +				vm_offset_t	page_vaddr; + +				page_vaddr = src_start + (i * PAGE_SIZE); +				if (m->wire_count > 0) { + +				    assert(m->wire_count == 1); +				    /* +				     *	In order to steal a wired +				     *	page, we have to unwire it +				     *	first.  We do this inline +				     *	here because we have the page. +				     * +				     *	Step 1: Unwire the map entry. +				     *		Also tell the pmap module +				     *		that this piece of the +				     *		pmap is pageable. +				     */ +				    vm_object_unlock(src_object); +				    if (page_vaddr >= unwire_end) { +				        if (!vm_map_lookup_entry(src_map, +				            page_vaddr, &src_entry)) +		    panic("vm_map_copyin_page_list: missing wired map entry"); + +				        vm_map_clip_start(src_map, src_entry, +						page_vaddr); +				    	vm_map_clip_end(src_map, src_entry, +						src_start + src_size); + +					assert(src_entry->wired_count > 0); +					vm_map_entry_reset_wired(src_map, src_entry); +					unwire_end = src_entry->vme_end; +				        pmap_pageable(vm_map_pmap(src_map), +					    page_vaddr, unwire_end, TRUE); +				    } + +				    /* +				     *	Step 2: Unwire the page. +				     *	pmap_remove handles this for us. 
+				     */ +				    vm_object_lock(src_object); +				} + +				/* +				 *	Don't need to remove the mapping; +				 *	vm_map_delete will handle it. +				 * +				 *	Steal the page.  Setting the wire count +				 *	to zero is vm_page_unwire without +				 *	activating the page. +  				 */ +				vm_page_lock_queues(); +	 			vm_page_remove(m); +				if (m->wire_count > 0) { +				    m->wire_count = 0; +				    vm_page_wire_count--; +				} else { +				    VM_PAGE_QUEUES_REMOVE(m); +				} +				vm_page_unlock_queues(); +			} +			else { +			        /* +				 *	Have to copy this page.  Have to +				 *	unlock the map while copying, +				 *	hence no further page stealing. +				 *	Hence just copy all the pages. +				 *	Unlock the map while copying; +				 *	This means no further page stealing. +				 */ +				vm_object_unlock(src_object); +				vm_map_unlock(src_map); + +				vm_map_copy_steal_pages(copy); + +				vm_map_lock(src_map); +				break; +		        } + +			vm_object_paging_end(src_object); +			vm_object_unlock(src_object); +	        } + +		/* +		 * If the source should be destroyed, do it now, since the +		 * copy was successful. +		 */ + +		if (src_destroy) { +		    (void) vm_map_delete(src_map, src_start, src_end); +		} +	} +	else { +		/* +		 *	!steal_pages leaves busy pages in the map. +		 *	This will cause src_destroy to hang.  Use +		 *	a continuation to prevent this. +		 */ +	        if (src_destroy && !vm_map_copy_has_cont(copy)) { +			cont_args = (vm_map_copyin_args_t) +				kalloc(sizeof(vm_map_copyin_args_data_t)); +			vm_map_reference(src_map); +			cont_args->map = src_map; +			cont_args->src_addr = (vm_offset_t) 0; +			cont_args->src_len = (vm_size_t) 0; +			cont_args->destroy_addr = src_start; +			cont_args->destroy_len = src_end - src_start; +			cont_args->steal_pages = FALSE; + +			copy->cpy_cont_args = cont_args; +			copy->cpy_cont = vm_map_copyin_page_list_cont; +		} + +	} + +	vm_map_unlock(src_map); + +	*copy_result = copy; +	return(result); + +error: +	vm_map_unlock(src_map); +	vm_map_copy_discard(copy); +	return(result); +} + +/* + *	vm_map_fork: + * + *	Create and return a new map based on the old + *	map, according to the inheritance values on the + *	regions in that map. + * + *	The source map must not be locked. + */ +vm_map_t vm_map_fork(vm_map_t old_map) +{ +	vm_map_t	new_map; +	vm_map_entry_t	old_entry; +	vm_map_entry_t	new_entry; +	pmap_t		new_pmap = pmap_create((vm_size_t) 0); +	vm_size_t	new_size = 0; +	vm_size_t	entry_size; +	vm_object_t	object; + +	if (new_pmap == PMAP_NULL) +		return VM_MAP_NULL; + +	vm_map_lock(old_map); + +	new_map = vm_map_create(new_pmap, +			old_map->min_offset, +			old_map->max_offset); +	if (new_map == VM_MAP_NULL) { +		pmap_destroy(new_pmap); +		return VM_MAP_NULL; +	} + +	for ( +	    old_entry = vm_map_first_entry(old_map); +	    old_entry != vm_map_to_entry(old_map); +	    ) { +		if (old_entry->is_sub_map) +			panic("vm_map_fork: encountered a submap"); + +		entry_size = (old_entry->vme_end - old_entry->vme_start); + +		switch (old_entry->inheritance) { +		case VM_INHERIT_NONE: +			break; + +		case VM_INHERIT_SHARE: +		        /* +			 *	New sharing code.  New map entry +			 *	references original object.  Temporary +			 *	objects use asynchronous copy algorithm for +			 *	future copies.  First make sure we have +			 *	the right object.  If we need a shadow, +			 *	or someone else already has one, then +			 *	make a new shadow and share it. 
+			 */ + +			object = old_entry->object.vm_object; +			if (object == VM_OBJECT_NULL) { +				object = vm_object_allocate( +					    (vm_size_t)(old_entry->vme_end - +							old_entry->vme_start)); +				old_entry->offset = 0; +				old_entry->object.vm_object = object; +				assert(!old_entry->needs_copy); +			} +			else if (old_entry->needs_copy || object->shadowed || +			    (object->temporary && !old_entry->is_shared && +			     object->size > (vm_size_t)(old_entry->vme_end - +						old_entry->vme_start))) { + +			    assert(object->temporary); +			    assert(!(object->shadowed && old_entry->is_shared)); +			    vm_object_shadow( +			        &old_entry->object.vm_object, +			        &old_entry->offset, +			        (vm_size_t) (old_entry->vme_end - +					     old_entry->vme_start)); + +			    /* +			     *	If we're making a shadow for other than +			     *	copy on write reasons, then we have +			     *	to remove write permission. +			     */ + +			    if (!old_entry->needs_copy && +				(old_entry->protection & VM_PROT_WRITE)) { +			    	pmap_protect(vm_map_pmap(old_map), +					     old_entry->vme_start, +					     old_entry->vme_end, +					     old_entry->protection & +					     	~VM_PROT_WRITE); +			    } +			    old_entry->needs_copy = FALSE; +			    object = old_entry->object.vm_object; +			} + +			/* +			 *	Set use_shared_copy to indicate that +			 *	object must use shared (delayed) copy-on +			 *	write.  This is ignored for permanent objects. +			 *	Bump the reference count for the new entry +			 */ + +			vm_object_lock(object); +			object->use_shared_copy = TRUE; +			object->ref_count++; +			vm_object_unlock(object); + +			new_entry = vm_map_entry_create(new_map); + +			if (old_entry->projected_on != 0) { +			  /* +			   *   If entry is projected buffer, clone the +                           *   entry exactly. +                           */ + +			  vm_map_entry_copy_full(new_entry, old_entry); + +			} else { +			  /* +			   *	Clone the entry, using object ref from above. +			   *	Mark both entries as shared. +			   */ + +			  vm_map_entry_copy(new_entry, old_entry); +			  old_entry->is_shared = TRUE; +			  new_entry->is_shared = TRUE; +			} + +			/* +			 *	Insert the entry into the new map -- we +			 *	know we're inserting at the end of the new +			 *	map. +			 */ + +			vm_map_entry_link( +				new_map, +				vm_map_last_entry(new_map), +				new_entry); + +			/* +			 *	Update the physical map +			 */ + +			pmap_copy(new_map->pmap, old_map->pmap, +				new_entry->vme_start, +				entry_size, +				old_entry->vme_start); + +			new_size += entry_size; +			break; + +		case VM_INHERIT_COPY: +			if (old_entry->wired_count == 0) { +				boolean_t	src_needs_copy; +				boolean_t	new_entry_needs_copy; + +				new_entry = vm_map_entry_create(new_map); +				vm_map_entry_copy(new_entry, old_entry); + +				if (vm_object_copy_temporary( +					&new_entry->object.vm_object, +					&new_entry->offset, +					&src_needs_copy, +					&new_entry_needs_copy)) { + +					/* +					 *	Handle copy-on-write obligations +					 */ + +					if (src_needs_copy && !old_entry->needs_copy) { +						vm_object_pmap_protect( +							old_entry->object.vm_object, +							old_entry->offset, +							entry_size, +							(old_entry->is_shared ? 
+								PMAP_NULL :
+								old_map->pmap),
+							old_entry->vme_start,
+							old_entry->protection &
+							    ~VM_PROT_WRITE);
+
+						old_entry->needs_copy = TRUE;
+					}
+
+					new_entry->needs_copy = new_entry_needs_copy;
+
+					/*
+					 *	Insert the entry at the end
+					 *	of the map.
+					 */
+
+					vm_map_entry_link(new_map,
+						vm_map_last_entry(new_map),
+						new_entry);
+
+
+					new_size += entry_size;
+					break;
+				}
+
+				vm_map_entry_dispose(new_map, new_entry);
+			}
+
+			/* INNER BLOCK (copy cannot be optimized) */ {
+
+			vm_offset_t	start = old_entry->vme_start;
+			vm_map_copy_t	copy;
+			vm_map_entry_t	last = vm_map_last_entry(new_map);
+
+			vm_map_unlock(old_map);
+			if (vm_map_copyin(old_map,
+					start,
+					entry_size,
+					FALSE,
+					&copy)
+			    != KERN_SUCCESS) {
+			    	vm_map_lock(old_map);
+				if (!vm_map_lookup_entry(old_map, start, &last))
+					last = last->vme_next;
+				old_entry = last;
+				/*
+				 *	For some error returns, want to
+				 *	skip to the next element.
+				 */
+
+				continue;
+			}
+
+			/*
+			 *	Insert the copy into the new map
+			 */
+
+			vm_map_copy_insert(new_map, last, copy);
+			new_size += entry_size;
+
+			/*
+			 *	Pick up the traversal at the end of
+			 *	the copied region.
+			 */
+
+			vm_map_lock(old_map);
+			start += entry_size;
+			if (!vm_map_lookup_entry(old_map, start, &last))
+				last = last->vme_next;
+			 else
+				vm_map_clip_start(old_map, last, start);
+			old_entry = last;
+
+			continue;
+			/* INNER BLOCK (copy cannot be optimized) */ }
+		}
+		old_entry = old_entry->vme_next;
+	}
+
+	new_map->size = new_size;
+	vm_map_unlock(old_map);
+
+	return(new_map);
+}
+
+/*
+ *	vm_map_lookup:
+ *
+ *	Finds the VM object, offset, and
+ *	protection for a given virtual address in the
+ *	specified map, assuming a page fault of the
+ *	type specified.
+ *
+ *	Returns the (object, offset, protection) for
+ *	this address, whether it is wired down, and whether
+ *	this map has the only reference to the data in question.
+ *	In order to later verify this lookup, a "version"
+ *	is returned.
+ *
+ *	The map should not be locked; it will not be
+ *	locked on exit.  In order to guarantee the
+ *	existence of the returned object, it is returned
+ *	locked.
+ *
+ *	If a lookup is requested with "write protection"
+ *	specified, the map may be changed to perform virtual
+ *	copying operations, although the data referenced will
+ *	remain the same.
+ */
+kern_return_t vm_map_lookup(
+	vm_map_t		*var_map,	/* IN/OUT */
+	vm_offset_t		vaddr,
+	vm_prot_t		fault_type,
+
+	vm_map_version_t	*out_version,	/* OUT */
+	vm_object_t		*object,	/* OUT */
+	vm_offset_t		*offset,	/* OUT */
+	vm_prot_t		*out_prot,	/* OUT */
+	boolean_t		*wired)		/* OUT */
+{
+	vm_map_entry_t		entry;
+	vm_map_t		map = *var_map;
+	vm_prot_t		prot;
+
+	RetryLookup: ;
+
+	/*
+	 *	Lookup the faulting address.
+	 */
+
+	vm_map_lock_read(map);
+
+#define	RETURN(why) \
+		{ \
+		vm_map_unlock_read(map); \
+		return(why); \
+		}
+
+	/*
+	 *	If the map has an interesting hint, try it before calling
+	 *	full blown lookup routine.
+	 */
+
+	simple_lock(&map->hint_lock);
+	entry = map->hint;
+	simple_unlock(&map->hint_lock);
+
+	if ((entry == vm_map_to_entry(map)) ||
+	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
+		vm_map_entry_t	tmp_entry;
+
+		/*
+		 *	Entry was either not a valid hint, or the vaddr
+		 *	was not contained in the entry, so do a full lookup.
+		 */ +		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) +			RETURN(KERN_INVALID_ADDRESS); + +		entry = tmp_entry; +	} + +	/* +	 *	Handle submaps. +	 */ + +	if (entry->is_sub_map) { +		vm_map_t	old_map = map; + +		*var_map = map = entry->object.sub_map; +		vm_map_unlock_read(old_map); +		goto RetryLookup; +	} + +	/* +	 *	Check whether this task is allowed to have +	 *	this page. +	 */ + +	prot = entry->protection; + +	if ((fault_type & (prot)) != fault_type) { +		if ((prot & VM_PROT_NOTIFY) && (fault_type & VM_PROT_WRITE)) { +			RETURN(KERN_WRITE_PROTECTION_FAILURE); +		} else { +			RETURN(KERN_PROTECTION_FAILURE); +		} +	} + +	/* +	 *	If this page is not pageable, we have to get +	 *	it for all possible accesses. +	 */ + +	if ((*wired = (entry->wired_count != 0))) +		prot = fault_type = entry->protection; + +	/* +	 *	If the entry was copy-on-write, we either ... +	 */ + +	if (entry->needs_copy) { +	    	/* +		 *	If we want to write the page, we may as well +		 *	handle that now since we've got the map locked. +		 * +		 *	If we don't need to write the page, we just +		 *	demote the permissions allowed. +		 */ + +		if (fault_type & VM_PROT_WRITE) { +			/* +			 *	Make a new object, and place it in the +			 *	object chain.  Note that no new references +			 *	have appeared -- one just moved from the +			 *	map to the new object. +			 */ + +			if (vm_map_lock_read_to_write(map)) { +				goto RetryLookup; +			} +			map->timestamp++; + +			vm_object_shadow( +			    &entry->object.vm_object, +			    &entry->offset, +			    (vm_size_t) (entry->vme_end - entry->vme_start)); + +			entry->needs_copy = FALSE; + +			vm_map_lock_write_to_read(map); +		} +		else { +			/* +			 *	We're attempting to read a copy-on-write +			 *	page -- don't allow writes. +			 */ + +			prot &= (~VM_PROT_WRITE); +		} +	} + +	/* +	 *	Create an object if necessary. +	 */ +	if (entry->object.vm_object == VM_OBJECT_NULL) { + +		if (vm_map_lock_read_to_write(map)) { +			goto RetryLookup; +		} + +		entry->object.vm_object = vm_object_allocate( +				(vm_size_t)(entry->vme_end - entry->vme_start)); +		entry->offset = 0; +		vm_map_lock_write_to_read(map); +	} + +	/* +	 *	Return the object/offset from this entry.  If the entry +	 *	was copy-on-write or empty, it has been fixed up.  Also +	 *	return the protection. +	 */ + +        *offset = (vaddr - entry->vme_start) + entry->offset; +        *object = entry->object.vm_object; +	*out_prot = prot; + +	/* +	 *	Lock the object to prevent it from disappearing +	 */ + +	vm_object_lock(*object); + +	/* +	 *	Save the version number and unlock the map. +	 */ + +	out_version->main_timestamp = map->timestamp; + +	RETURN(KERN_SUCCESS); + +#undef	RETURN +} + +/* + *	vm_map_verify: + * + *	Verifies that the map in question has not changed + *	since the given version.  If successful, the map + *	will not change until vm_map_verify_done() is called. + */ +boolean_t	vm_map_verify( +	vm_map_t		map, +	vm_map_version_t 	*version)	/* REF */ +{ +	boolean_t	result; + +	vm_map_lock_read(map); +	result = (map->timestamp == version->main_timestamp); + +	if (!result) +		vm_map_unlock_read(map); + +	return(result); +} + +/* + *	vm_map_verify_done: + * + *	Releases locks acquired by a vm_map_verify. + * + *	This is now a macro in vm/vm_map.h.  It does a + *	vm_map_unlock_read on the map. + */ + +/* + *	vm_region: + * + *	User call to obtain information about a region in + *	a task's address map. 
+ */ + +kern_return_t	vm_region( +	vm_map_t	map, +	vm_offset_t	*address,		/* IN/OUT */ +	vm_size_t	*size,			/* OUT */ +	vm_prot_t	*protection,		/* OUT */ +	vm_prot_t	*max_protection,	/* OUT */ +	vm_inherit_t	*inheritance,		/* OUT */ +	boolean_t	*is_shared,		/* OUT */ +	ipc_port_t	*object_name,		/* OUT */ +	vm_offset_t	*offset_in_object)	/* OUT */ +{ +	vm_map_entry_t	tmp_entry; +	vm_map_entry_t	entry; +	vm_offset_t	tmp_offset; +	vm_offset_t	start; + +	if (map == VM_MAP_NULL) +		return(KERN_INVALID_ARGUMENT); + +	start = *address; + +	vm_map_lock_read(map); +	if (!vm_map_lookup_entry(map, start, &tmp_entry)) { +		if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { +			vm_map_unlock_read(map); +		   	return(KERN_NO_SPACE); +		} +	} else { +		entry = tmp_entry; +	} + +	start = entry->vme_start; +	*protection = entry->protection; +	*max_protection = entry->max_protection; +	*inheritance = entry->inheritance; +	*address = start; +	*size = (entry->vme_end - start); + +	tmp_offset = entry->offset; + + +	if (entry->is_sub_map) { +		*is_shared = FALSE; +		*object_name = IP_NULL; +		*offset_in_object = tmp_offset; +	} else { +		*is_shared = entry->is_shared; +		*object_name = vm_object_name(entry->object.vm_object); +		*offset_in_object = tmp_offset; +	} + +	vm_map_unlock_read(map); + +	return(KERN_SUCCESS); +} + +/* + *	vm_region_create_proxy: + * + *	Gets a proxy to the region that ADDRESS belongs to, starting at the + *	region start, with MAX_PROTECTION and LEN limited by the region ones, + *	and returns it in *PORT. + */ +kern_return_t +vm_region_create_proxy (task_t task, vm_address_t address, +			vm_prot_t max_protection, vm_size_t len, +			ipc_port_t *port) +{ +  kern_return_t ret; +  vm_map_entry_t entry, tmp_entry; +  vm_object_t object; +  rpc_vm_offset_t rpc_offset, rpc_start; +  rpc_vm_size_t rpc_len = (rpc_vm_size_t) len; +  ipc_port_t pager; + +  if (task == TASK_NULL) +    return(KERN_INVALID_ARGUMENT); + +  vm_map_lock_read(task->map); +  if (!vm_map_lookup_entry(task->map, address, &tmp_entry)) { +    if ((entry = tmp_entry->vme_next) == vm_map_to_entry(task->map)) { +      vm_map_unlock_read(task->map); +      return(KERN_NO_SPACE); +    } +  } else { +    entry = tmp_entry; +  } + +  if (entry->is_sub_map) { +    vm_map_unlock_read(task->map); +    return(KERN_INVALID_ARGUMENT); +  } + +  /* Limit the allowed protection and range to the entry ones */ +  if (len > entry->vme_end - entry->vme_start) { +    vm_map_unlock_read(task->map); +    return(KERN_INVALID_ARGUMENT); +  } +  max_protection &= entry->max_protection; + +  object = entry->object.vm_object; +  vm_object_lock(object); +  /* Create a pager in case this is an internal object that does +     not yet have one. */ +  vm_object_pager_create(object); +  pager = ipc_port_copy_send(object->pager); +  vm_object_unlock(object); + +  rpc_start = (address - entry->vme_start) + entry->offset; +  rpc_offset = 0; + +  vm_map_unlock_read(task->map); + +  ret = memory_object_create_proxy(task->itk_space, max_protection, +				    &pager, 1, +				    &rpc_offset, 1, +				    &rpc_start, 1, +				    &rpc_len, 1, port); +  if (ret) +    ipc_port_release_send(pager); + +  return ret; +} + +/* + *	Routine:	vm_map_coalesce_entry + *	Purpose: + *		Try to coalesce an entry with the preceeding entry in the map. + *	Conditions: + *		The map is locked.  If coalesced, the entry is destroyed + *		by the call. + *	Returns: + *		Whether the entry was coalesced. 
+ */ +boolean_t +vm_map_coalesce_entry( +	vm_map_t	map, +	vm_map_entry_t	entry) +{ +	vm_map_entry_t	prev = entry->vme_prev; +	vm_size_t	prev_size; +	vm_size_t	entry_size; + +	/* +	 *	Check the basic conditions for coalescing the two entries. +	 */ +	if ((entry == vm_map_to_entry(map)) || +	    (prev == vm_map_to_entry(map)) || +	    (prev->vme_end != entry->vme_start) || +	    (prev->is_shared || entry->is_shared) || +	    (prev->is_sub_map || entry->is_sub_map) || +	    (prev->inheritance != entry->inheritance) || +	    (prev->protection != entry->protection) || +	    (prev->max_protection != entry->max_protection) || +	    (prev->needs_copy != entry->needs_copy) || +	    (prev->in_transition || entry->in_transition) || +	    (prev->wired_count != entry->wired_count) || +	    (prev->projected_on != 0) || +	    (entry->projected_on != 0)) +		return FALSE; + +	prev_size = prev->vme_end - prev->vme_start; +	entry_size = entry->vme_end - entry->vme_start; +	assert(prev->gap_size == 0); + +	/* +	 *	See if we can coalesce the two objects. +	 */ +	if (!vm_object_coalesce(prev->object.vm_object, +		entry->object.vm_object, +		prev->offset, +		entry->offset, +		prev_size, +		entry_size, +		&prev->object.vm_object, +		&prev->offset)) +		return FALSE; + +	/* +	 *	Update the hints. +	 */ +	if (map->hint == entry) +		SAVE_HINT(map, prev); +	if (map->first_free == entry) +		map->first_free = prev; + +	/* +	 *	Get rid of the entry without changing any wirings or the pmap, +	*	and without altering map->size. +	 */ +	prev->vme_end = entry->vme_end; +	vm_map_entry_unlink(map, entry); +	vm_map_entry_dispose(map, entry); + +	return TRUE; +} + + + +/* + *	Routine:	vm_map_machine_attribute + *	Purpose: + *		Provide machine-specific attributes to mappings, + *		such as cachability etc. for machines that provide + *		them.  NUMA architectures and machines with big/strange + *		caches will use this. + *	Note: + *		Responsibilities for locking and checking are handled here, + *		everything else in the pmap module. If any non-volatile + *		information must be kept, the pmap module should handle + *		it itself. [This assumes that attributes do not + *		need to be inherited, which seems ok to me] + */ +kern_return_t vm_map_machine_attribute( +	vm_map_t	map, +	vm_offset_t	address, +	vm_size_t	size, +	vm_machine_attribute_t	attribute, +	vm_machine_attribute_val_t* value)		/* IN/OUT */ +{ +	kern_return_t	ret; + +	if (address < vm_map_min(map) || +	    (address + size) > vm_map_max(map)) +		return KERN_INVALID_ARGUMENT; + +	vm_map_lock(map); + +	ret = pmap_attribute(map->pmap, address, size, attribute, value); + +	vm_map_unlock(map); + +	return ret; +} + +/* + *	Routine:	vm_map_msync + *	Purpose: + *		Synchronize out pages of the given map out to their memory + *		manager, if any. 
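vm_map_coalesce_entry above merges an entry into its predecessor only when the two are byte-adjacent and agree on every attribute an entry carries, and only if the underlying objects can also be coalesced. The stand-alone sketch below models just the adjacency-plus-attributes test; the struct is a hypothetical reduction of vm_map_entry with no object layer.

#include <stdbool.h>
#include <stdio.h>

struct range_entry {
	unsigned long start, end;	/* [start, end) */
	int prot, max_prot, inherit;
	int wired_count;
	bool shared, needs_copy;
};

static bool range_coalesce(struct range_entry *prev, const struct range_entry *e)
{
	if (prev->end != e->start ||			/* must be adjacent   */
	    prev->shared || e->shared ||		/* never merge shared */
	    prev->prot != e->prot ||
	    prev->max_prot != e->max_prot ||
	    prev->inherit != e->inherit ||
	    prev->needs_copy != e->needs_copy ||
	    prev->wired_count != e->wired_count)
		return false;

	prev->end = e->end;				/* absorb the second entry */
	return true;
}

int main(void)
{
	struct range_entry a = { 0x1000, 0x3000, 3, 7, 0, 0, false, false };
	struct range_entry b = { 0x3000, 0x5000, 3, 7, 0, 0, false, false };

	printf("coalesced: %s, end=0x%lx\n",
	       range_coalesce(&a, &b) ? "yes" : "no", a.end);
	return 0;
}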
+ */ +kern_return_t vm_map_msync( +	vm_map_t	map, +	vm_offset_t	address, +	vm_size_t	size, +	vm_sync_t	sync_flags) +{ +	if (map == VM_MAP_NULL) +		return KERN_INVALID_ARGUMENT; + +	if ((sync_flags & (VM_SYNC_ASYNCHRONOUS | VM_SYNC_SYNCHRONOUS)) == +			 (VM_SYNC_ASYNCHRONOUS | VM_SYNC_SYNCHRONOUS)) +		return KERN_INVALID_ARGUMENT; + +	size =	round_page(address + size) - trunc_page(address); +	address = trunc_page(address); + +	if (size == 0) +		return KERN_SUCCESS; + +	/* TODO */ + +	return KERN_INVALID_ARGUMENT; +} + + + +#if	MACH_KDB + +#define	printf	kdbprintf + +/* + *	vm_map_print:	[ debug ] + */ +void vm_map_print(db_expr_t addr, boolean_t have_addr, db_expr_t count, const char *modif) +{ +	vm_map_t	map; +	vm_map_entry_t	entry; + +	if (!have_addr) +		map = current_thread()->task->map; +	else +		map = (vm_map_t)addr; + +	iprintf("Map 0x%X: name=\"%s\", pmap=0x%X,", +		(vm_offset_t) map, map->name, (vm_offset_t) (map->pmap)); +	 printf("ref=%d,nentries=%d\n", map->ref_count, map->hdr.nentries); +	 printf("size=%lu,resident:%lu,wired=%lu\n", map->size, +	        pmap_resident_count(map->pmap) * PAGE_SIZE, map->size_wired); +	 printf("version=%d\n",	map->timestamp); +	indent += 1; +	for (entry = vm_map_first_entry(map); +	     entry != vm_map_to_entry(map); +	     entry = entry->vme_next) { +		static char *inheritance_name[3] = { "share", "copy", "none"}; + +		iprintf("map entry 0x%X: ", (vm_offset_t) entry); +		 printf("start=0x%X, end=0x%X\n", +			(vm_offset_t) entry->vme_start, (vm_offset_t) entry->vme_end); +		iprintf("prot=%X/%X/%s, ", +			entry->protection, +			entry->max_protection, +			inheritance_name[entry->inheritance]); +		if (entry->wired_count != 0) { +			printf("wired, "); +		} +		if (entry->in_transition) { +			printf("in transition"); +			if (entry->needs_wakeup) +				printf("(wake request)"); +			printf(", "); +		} +		if (entry->is_sub_map) { +		 	printf("submap=0x%X, offset=0x%X\n", +				(vm_offset_t) entry->object.sub_map, +				(vm_offset_t) entry->offset); +		} else { +			printf("object=0x%X, offset=0x%X", +				(vm_offset_t) entry->object.vm_object, +				(vm_offset_t) entry->offset); +			if (entry->is_shared) +				printf(", shared"); +			if (entry->needs_copy) +				printf(", copy needed"); +			printf("\n"); + +			if ((entry->vme_prev == vm_map_to_entry(map)) || +			    (entry->vme_prev->object.vm_object != entry->object.vm_object)) { +				indent += 1; +				vm_object_print(entry->object.vm_object); +				indent -= 1; +			} +		} +	} +	indent -= 1; +} + +/* + *	Routine:	vm_map_copy_print + *	Purpose: + *		Pretty-print a copy object for ddb. 
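The two assignments that normalize address and size in vm_map_msync above are the usual page-rounding idiom: truncate the start down to a page boundary and round the end up, so the byte range is covered by whole pages. A tiny sketch, assuming a 4 KiB page purely for illustration (the kernel uses its own PAGE_SIZE and trunc_page/round_page):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define trunc_page(x)	((x) & ~(PAGE_SIZE - 1))
#define round_page(x)	trunc_page((x) + PAGE_SIZE - 1)

int main(void)
{
	unsigned long address = 0x12345, size = 0x10;

	/* same order as the kernel code: compute the rounded size from
	 * the original address before truncating the address itself */
	size    = round_page(address + size) - trunc_page(address);
	address = trunc_page(address);

	printf("address=0x%lx size=0x%lx\n", address, size);	/* 0x12000, 0x1000 */
	return 0;
}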
+ */ + +void vm_map_copy_print(const vm_map_copy_t copy) +{ +	int i, npages; + +	printf("copy object 0x%x\n", copy); + +	indent += 1; + +	iprintf("type=%d", copy->type); +	switch (copy->type) { +		case VM_MAP_COPY_ENTRY_LIST: +		printf("[entry_list]"); +		break; + +		case VM_MAP_COPY_OBJECT: +		printf("[object]"); +		break; + +		case VM_MAP_COPY_PAGE_LIST: +		printf("[page_list]"); +		break; + +		default: +		printf("[bad type]"); +		break; +	} +	printf(", offset=0x%x", copy->offset); +	printf(", size=0x%x\n", copy->size); + +	switch (copy->type) { +		case VM_MAP_COPY_ENTRY_LIST: +		/* XXX add stuff here */ +		break; + +		case VM_MAP_COPY_OBJECT: +		iprintf("object=0x%x\n", copy->cpy_object); +		break; + +		case VM_MAP_COPY_PAGE_LIST: +		iprintf("npages=%d", copy->cpy_npages); +		printf(", cont=%x", copy->cpy_cont); +		printf(", cont_args=%x\n", copy->cpy_cont_args); +		if (copy->cpy_npages < 0) { +			npages = 0; +		} else if (copy->cpy_npages > VM_MAP_COPY_PAGE_LIST_MAX) { +			npages = VM_MAP_COPY_PAGE_LIST_MAX; +		} else { +			npages = copy->cpy_npages; +		} +		iprintf("copy->cpy_page_list[0..%d] = {", npages); +		for (i = 0; i < npages - 1; i++) { +			printf("0x%x, ", copy->cpy_page_list[i]); +		} +		if (npages > 0) { +			printf("0x%x", copy->cpy_page_list[npages - 1]); +		} +		printf("}\n"); +		break; +	} + +	indent -= 1; +} +#endif	/* MACH_KDB */ diff --git a/vm/vm_map.h b/vm/vm_map.h new file mode 100644 index 0000000..a4949e4 --- /dev/null +++ b/vm/vm_map.h @@ -0,0 +1,585 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	File:	vm/vm_map.h + *	Author:	Avadis Tevanian, Jr., Michael Wayne Young + *	Date:	1985 + * + *	Virtual memory map module definitions. + * + * Contributors: + *	avie, dlb, mwyoung + */ + +#ifndef	_VM_VM_MAP_H_ +#define _VM_VM_MAP_H_ + +#include <mach/kern_return.h> +#include <mach/boolean.h> +#include <mach/machine/vm_types.h> +#include <mach/vm_attributes.h> +#include <mach/vm_prot.h> +#include <mach/vm_inherit.h> +#include <mach/vm_wire.h> +#include <mach/vm_sync.h> +#include <vm/pmap.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_types.h> +#include <kern/list.h> +#include <kern/lock.h> +#include <kern/rbtree.h> +#include <kern/macros.h> + +/* TODO: make it dynamic */ +#define KENTRY_DATA_SIZE (256*PAGE_SIZE) + +/* + *	Types defined: + * + *	vm_map_entry_t		an entry in an address map. 
+ *	vm_map_version_t	a timestamp of a map, for use with vm_map_lookup + *	vm_map_copy_t		represents memory copied from an address map, + *				 used for inter-map copy operations + */ + +/* + *	Type:		vm_map_object_t [internal use only] + * + *	Description: + *		The target of an address mapping, either a virtual + *		memory object or a sub map (of the kernel map). + */ +typedef union vm_map_object { +	struct vm_object	*vm_object;	/* object object */ +	struct vm_map		*sub_map;	/* belongs to another map */ +} vm_map_object_t; + +/* + *	Type:		vm_map_entry_t [internal use only] + * + *	Description: + *		A single mapping within an address map. + * + *	Implementation: + *		Address map entries consist of start and end addresses, + *		a VM object (or sub map) and offset into that object, + *		and user-exported inheritance and protection information. + *		Control information for virtual copy operations is also + *		stored in the address map entry. + */ +struct vm_map_links { +	struct vm_map_entry	*prev;		/* previous entry */ +	struct vm_map_entry	*next;		/* next entry */ +	vm_offset_t		start;		/* start address */ +	vm_offset_t		end;		/* end address */ +}; + +struct vm_map_entry { +	struct vm_map_links	links;		/* links to other entries */ +#define vme_prev		links.prev +#define vme_next		links.next +#define vme_start		links.start +#define vme_end			links.end +	struct rbtree_node	tree_node;	/* links to other entries in tree */ +	struct rbtree_node	gap_node;	/* links to other entries in gap tree */ +	struct list		gap_list;	/* links to other entries with +						   the same gap size */ +	vm_size_t		gap_size;	/* size of available memory +						   following this entry */ +	union vm_map_object	object;		/* object I point to */ +	vm_offset_t		offset;		/* offset into object */ +	unsigned int +	/* boolean_t */		in_gap_tree:1,	/* entry is in the gap tree if true, +						   or linked to other entries with +						   the same gap size if false */ +	/* boolean_t */		is_shared:1,	/* region is shared */ +	/* boolean_t */		is_sub_map:1,	/* Is "object" a submap? */ +	/* boolean_t */		in_transition:1, /* Entry being changed */ +	/* boolean_t */		needs_wakeup:1,  /* Waiters on in_transition */ +		/* Only used when object is a vm_object: */ +	/* boolean_t */		needs_copy:1;    /* does object need to be copied */ + +		/* Only in task maps: */ +	vm_prot_t		protection;	/* protection code */ +	vm_prot_t		max_protection;	/* maximum protection */ +	vm_inherit_t		inheritance;	/* inheritance */ +	unsigned short		wired_count;	/* can be paged if = 0 */ +	vm_prot_t		wired_access;	/* wiring access types, as accepted +						   by vm_map_pageable; used on wiring +						   scans when protection != VM_PROT_NONE */ +	struct vm_map_entry     *projected_on;  /* 0 for normal map entry +           or persistent kernel map projected buffer entry; +           -1 for non-persistent kernel map projected buffer entry; +           pointer to corresponding kernel map entry for user map +           projected buffer entry */ +}; + +typedef struct vm_map_entry	*vm_map_entry_t; + +#define VM_MAP_ENTRY_NULL	((vm_map_entry_t) 0) + +/* + *	Type:		struct vm_map_header + * + *	Description: + *		Header for a vm_map and a vm_map_copy. 
+ */ +struct vm_map_header { +	struct vm_map_links	links;		/* first, last, min, max */ +	struct rbtree		tree;		/* Sorted tree of entries */ +	struct rbtree		gap_tree;	/* Sorted tree of gap lists +						   for allocations */ +	int			nentries;	/* Number of entries */ +}; + +/* + *	Type:		vm_map_t [exported; contents invisible] + * + *	Description: + *		An address map -- a directory relating valid + *		regions of a task's address space to the corresponding + *		virtual memory objects. + * + *	Implementation: + *		Maps are doubly-linked lists of map entries, sorted + *		by address.  They're also contained in a red-black tree. + *		One hint is used to start searches again at the last + *		successful search, insertion, or removal.  If the hint + *		lookup failed (i.e. the hint didn't refer to the requested + *		entry), a BST lookup is performed.  Another hint is used to + *		quickly find free space. + */ +struct vm_map { +	lock_data_t		lock;		/* Lock for map data */ +	struct vm_map_header	hdr;		/* Map entry header */ +#define min_offset		hdr.links.start	/* start of range */ +#define max_offset		hdr.links.end	/* end of range */ +	pmap_t			pmap;		/* Physical map */ +	vm_size_t		size;		/* virtual size */ +	vm_size_t		size_wired;	/* wired size */ +	int			ref_count;	/* Reference count */ +	decl_simple_lock_data(,	ref_lock)	/* Lock for ref_count field */ +	vm_map_entry_t		hint;		/* hint for quick lookups */ +	decl_simple_lock_data(,	hint_lock)	/* lock for hint storage */ +	vm_map_entry_t		first_free;	/* First free space hint */ + +	/* Flags */ +	unsigned int	wait_for_space:1,	/* Should callers wait +						   for space? */ +	/* boolean_t */ wiring_required:1;	/* New mappings are wired? */ + +	unsigned int		timestamp;	/* Version number */ + +	const char		*name;		/* Associated name */ +}; + +#define vm_map_to_entry(map)	((struct vm_map_entry *) &(map)->hdr.links) +#define vm_map_first_entry(map)	((map)->hdr.links.next) +#define vm_map_last_entry(map)	((map)->hdr.links.prev) + +/* + *	Type:		vm_map_version_t [exported; contents invisible] + * + *	Description: + *		Map versions may be used to quickly validate a previous + *		lookup operation. + * + *	Usage note: + *		Because they are bulky objects, map versions are usually + *		passed by reference. + * + *	Implementation: + *		Just a timestamp for the main map. + */ +typedef struct vm_map_version { +	unsigned int	main_timestamp; +} vm_map_version_t; + +/* + *	Type:		vm_map_copy_t [exported; contents invisible] + * + *	Description: + *		A map copy object represents a region of virtual memory + *		that has been copied from an address map but is still + *		in transit. + * + *		A map copy object may only be used by a single thread + *		at a time. + * + *	Implementation: + * 		There are three formats for map copy objects. + *		The first is very similar to the main + *		address map in structure, and as a result, some + *		of the internal maintenance functions/macros can + *		be used with either address maps or map copy objects. + * + *		The map copy object contains a header links + *		entry onto which the other entries that represent + *		the region are chained. + * + *		The second format is a single vm object.  This is used + *		primarily in the pageout path.  The third format is a + *		list of vm pages.  An optional continuation provides + *		a hook to be called to obtain more of the memory, + *		or perform other operations.  
The continuation takes 3 + *		arguments, a saved arg buffer, a pointer to a new vm_map_copy + *		(returned) and an abort flag (abort if TRUE). + */ + +#define VM_MAP_COPY_PAGE_LIST_MAX	64 + +struct vm_map_copy; +struct vm_map_copyin_args_data; +typedef kern_return_t (*vm_map_copy_cont_fn)(struct vm_map_copyin_args_data*, struct vm_map_copy**); + +typedef struct vm_map_copy { +	int			type; +#define VM_MAP_COPY_ENTRY_LIST	1 +#define VM_MAP_COPY_OBJECT	2 +#define VM_MAP_COPY_PAGE_LIST	3 +	vm_offset_t		offset; +	vm_size_t		size; +	union { +	    struct vm_map_header	hdr;	/* ENTRY_LIST */ +	    struct {				/* OBJECT */ +	    	vm_object_t		object; +	    } c_o; +	    struct {				/* PAGE_LIST */ +		vm_page_t		page_list[VM_MAP_COPY_PAGE_LIST_MAX]; +		int			npages; +		vm_map_copy_cont_fn cont; +		struct vm_map_copyin_args_data* cont_args; +	    } c_p; +	} c_u; +} *vm_map_copy_t; + +#define cpy_hdr			c_u.hdr + +#define cpy_object		c_u.c_o.object + +#define cpy_page_list		c_u.c_p.page_list +#define cpy_npages		c_u.c_p.npages +#define cpy_cont		c_u.c_p.cont +#define cpy_cont_args		c_u.c_p.cont_args + +#define	VM_MAP_COPY_NULL	((vm_map_copy_t) 0) + +/* + *	Useful macros for entry list copy objects + */ + +#define vm_map_copy_to_entry(copy)		\ +		((struct vm_map_entry *) &(copy)->cpy_hdr.links) +#define vm_map_copy_first_entry(copy)		\ +		((copy)->cpy_hdr.links.next) +#define vm_map_copy_last_entry(copy)		\ +		((copy)->cpy_hdr.links.prev) + +/* + *	Continuation macros for page list copy objects + */ + +#define	vm_map_copy_invoke_cont(old_copy, new_copy, result)		\ +MACRO_BEGIN								\ +	vm_map_copy_page_discard(old_copy);				\ +	*result = (*((old_copy)->cpy_cont))((old_copy)->cpy_cont_args,	\ +					    new_copy);			\ +	(old_copy)->cpy_cont = (kern_return_t (*)()) 0;			\ +MACRO_END + +#define	vm_map_copy_invoke_extend_cont(old_copy, new_copy, result)	\ +MACRO_BEGIN								\ +	*result = (*((old_copy)->cpy_cont))((old_copy)->cpy_cont_args,	\ +					    new_copy);			\ +	(old_copy)->cpy_cont = (kern_return_t (*)()) 0;			\ +MACRO_END + +#define vm_map_copy_abort_cont(old_copy)				\ +MACRO_BEGIN								\ +	vm_map_copy_page_discard(old_copy);				\ +	(*((old_copy)->cpy_cont))((old_copy)->cpy_cont_args,		\ +				  (vm_map_copy_t *) 0);			\ +	(old_copy)->cpy_cont = (kern_return_t (*)()) 0;			\ +	(old_copy)->cpy_cont_args = VM_MAP_COPYIN_ARGS_NULL;		\ +MACRO_END + +#define vm_map_copy_has_cont(copy)					\ +    (((copy)->cpy_cont) != (kern_return_t (*)()) 0) + +/* + *	Continuation structures for vm_map_copyin_page_list. + */ + +typedef	struct vm_map_copyin_args_data { +	vm_map_t	map; +	vm_offset_t	src_addr; +	vm_size_t	src_len; +	vm_offset_t	destroy_addr; +	vm_size_t	destroy_len; +	boolean_t	steal_pages; +} vm_map_copyin_args_data_t, *vm_map_copyin_args_t; + +#define	VM_MAP_COPYIN_ARGS_NULL	((vm_map_copyin_args_t) 0) + +/* + *	Macros:		vm_map_lock, etc. [internal use only] + *	Description: + *		Perform locking on the data portion of a map. 
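The continuation hook in the page-list format above lets a copy describe more memory than the fixed 64-entry page array can hold: a consumer drains cpy_page_list, then calls the stored continuation to obtain the next vm_map_copy (or calls it with a null result pointer to abort), as the vm_map_copy_invoke_cont and vm_map_copy_abort_cont macros do. The reduced model below shows that producer/consumer contract with plain integers instead of pages; all names are hypothetical.

#include <stdio.h>
#include <stdlib.h>

#define CHUNK_MAX 4

struct chunk;
typedef int (*chunk_cont_fn)(void *args, struct chunk **next);

struct chunk {
	int		pages[CHUNK_MAX];
	int		npages;
	chunk_cont_fn	cont;		/* NULL when this is the last chunk */
	void		*cont_args;
};

struct src_state { int next_page, last_page; };

static int more_chunks(void *args, struct chunk **out);

static struct chunk *make_chunk(struct src_state *s)
{
	struct chunk *c = malloc(sizeof *c);
	c->npages = 0;
	while (c->npages < CHUNK_MAX && s->next_page <= s->last_page)
		c->pages[c->npages++] = s->next_page++;
	c->cont      = (s->next_page <= s->last_page) ? more_chunks : NULL;
	c->cont_args = s;
	return c;
}

static int more_chunks(void *args, struct chunk **out)
{
	*out = make_chunk(args);
	return 0;
}

int main(void)
{
	struct src_state s = { 0, 9 };		/* ten "pages" in total */
	struct chunk *c = make_chunk(&s);

	while (c != NULL) {
		for (int i = 0; i < c->npages; i++)
			printf("%d ", c->pages[i]);
		chunk_cont_fn cont = c->cont;
		void *cargs = c->cont_args;
		free(c);
		c = NULL;
		if (cont != NULL)
			cont(cargs, &c);	/* cf. vm_map_copy_invoke_cont */
	}
	printf("\n");
	return 0;
}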
+ */ + +#define vm_map_lock_init(map)			\ +MACRO_BEGIN					\ +	lock_init(&(map)->lock, TRUE);		\ +	(map)->timestamp = 0;			\ +MACRO_END + +void vm_map_lock(struct vm_map *map); +void vm_map_unlock(struct vm_map *map); + +#define vm_map_lock_read(map)	lock_read(&(map)->lock) +#define vm_map_unlock_read(map)	lock_read_done(&(map)->lock) +#define vm_map_lock_write_to_read(map) \ +		lock_write_to_read(&(map)->lock) +#define vm_map_lock_read_to_write(map) \ +		(lock_read_to_write(&(map)->lock) || (((map)->timestamp++), 0)) +#define vm_map_lock_set_recursive(map) \ +		lock_set_recursive(&(map)->lock) +#define vm_map_lock_clear_recursive(map) \ +		lock_clear_recursive(&(map)->lock) + +/* + *	Exported procedures that operate on vm_map_t. + */ + +/* Initialize the module */ +extern void		vm_map_init(void); + +/* Initialize an empty map */ +extern void		vm_map_setup(vm_map_t, pmap_t, vm_offset_t, vm_offset_t); +/* Create an empty map */ +extern vm_map_t		vm_map_create(pmap_t, vm_offset_t, vm_offset_t); +/* Create a map in the image of an existing map */ +extern vm_map_t		vm_map_fork(vm_map_t); + +/* Gain a reference to an existing map */ +extern void		vm_map_reference(vm_map_t); +/* Lose a reference */ +extern void		vm_map_deallocate(vm_map_t); + +/* Enter a mapping */ +extern kern_return_t	vm_map_enter(vm_map_t, vm_offset_t *, vm_size_t, +				     vm_offset_t, boolean_t, vm_object_t, +				     vm_offset_t, boolean_t, vm_prot_t, +				     vm_prot_t, vm_inherit_t); +/* Enter a mapping primitive */ +extern kern_return_t	vm_map_find_entry(vm_map_t, vm_offset_t *, vm_size_t, +					  vm_offset_t, vm_object_t, +					  vm_map_entry_t *); +/* Deallocate a region */ +extern kern_return_t	vm_map_remove(vm_map_t, vm_offset_t, vm_offset_t); +/* Change protection */ +extern kern_return_t	vm_map_protect(vm_map_t, vm_offset_t, vm_offset_t, +				       vm_prot_t, boolean_t); +/* Change inheritance */ +extern kern_return_t	vm_map_inherit(vm_map_t, vm_offset_t, vm_offset_t, +				       vm_inherit_t); + +/* Look up an address */ +extern kern_return_t	vm_map_lookup(vm_map_t *, vm_offset_t, vm_prot_t, +				      vm_map_version_t *, vm_object_t *, +				      vm_offset_t *, vm_prot_t *, boolean_t *); +/* Find a map entry */ +extern boolean_t	vm_map_lookup_entry(vm_map_t, vm_offset_t, +					    vm_map_entry_t *); +/* Verify that a previous lookup is still valid */ +extern boolean_t	vm_map_verify(vm_map_t, vm_map_version_t *); +/* vm_map_verify_done is now a macro -- see below */ +/* Make a copy of a region */ +extern kern_return_t	vm_map_copyin(vm_map_t, vm_offset_t, vm_size_t, +				      boolean_t, vm_map_copy_t *); +/* Make a copy of a region using a page list copy */ +extern kern_return_t	vm_map_copyin_page_list(vm_map_t, vm_offset_t, +						vm_size_t, boolean_t, +						boolean_t, vm_map_copy_t *, +						boolean_t); +/* Place a copy into a map */ +extern kern_return_t	vm_map_copyout(vm_map_t, vm_offset_t *, vm_map_copy_t); +/* Overwrite existing memory with a copy */ +extern kern_return_t	vm_map_copy_overwrite(vm_map_t, vm_offset_t, +					      vm_map_copy_t, boolean_t); +/* Discard a copy without using it */ +extern void		vm_map_copy_discard(vm_map_copy_t); +extern void		vm_map_copy_page_discard(vm_map_copy_t); +extern vm_map_copy_t	vm_map_copy_copy(vm_map_copy_t); +/* Page list continuation version of previous */ +extern kern_return_t	vm_map_copy_discard_cont(vm_map_copyin_args_t, +						 vm_map_copy_t *); + +extern boolean_t	vm_map_coalesce_entry(vm_map_t, vm_map_entry_t); + +/* Add or remove machine- dependent 
attributes from map regions */ +extern kern_return_t	vm_map_machine_attribute(vm_map_t, vm_offset_t, +						 vm_size_t, +						 vm_machine_attribute_t, +						 vm_machine_attribute_val_t *); + +extern kern_return_t	vm_map_msync(vm_map_t, +				     vm_offset_t, vm_size_t, vm_sync_t); + +/* Delete entry from map */ +extern void		vm_map_entry_delete(vm_map_t, vm_map_entry_t); + +kern_return_t vm_map_delete( +    vm_map_t   	map, +    vm_offset_t    	start, +    vm_offset_t    	end); + +kern_return_t vm_map_copyout_page_list( +    vm_map_t    	dst_map, +    vm_offset_t 	*dst_addr,  /* OUT */ +    vm_map_copy_t   	copy); + +void vm_map_copy_page_discard (vm_map_copy_t copy); + +boolean_t vm_map_lookup_entry( +	vm_map_t	map, +	vm_offset_t	address, +	vm_map_entry_t	*entry); /* OUT */ + +static inline void vm_map_set_name(vm_map_t map, const char *name) +{ +	map->name = name; +} + + +/* + *	Functions implemented as macros + */ +#define		vm_map_min(map)		((map)->min_offset) +						/* Lowest valid address in +						 * a map */ + +#define		vm_map_max(map)		((map)->max_offset) +						/* Highest valid address */ + +#define		vm_map_pmap(map)	((map)->pmap) +						/* Physical map associated +						 * with this address map */ + +#define		vm_map_verify_done(map, version)    (vm_map_unlock_read(map)) +						/* Operation that required +						 * a verified lookup is +						 * now complete */ +/* + *	Pageability functions. + */ +extern kern_return_t	vm_map_pageable(vm_map_t, vm_offset_t, vm_offset_t, +					vm_prot_t, boolean_t, boolean_t); + +extern kern_return_t	vm_map_pageable_all(vm_map_t, vm_wire_t); + +/* + *	Submap object.  Must be used to create memory to be put + *	in a submap by vm_map_submap. + */ +extern vm_object_t	vm_submap_object; + +/* + *  vm_map_copyin_object: + * + *  Create a copy object from an object. + *  Our caller donates an object reference. + */ +extern kern_return_t vm_map_copyin_object( +    vm_object_t object, +    vm_offset_t offset,     /* offset of region in object */ +    vm_size_t   size,       /* size of region in object */ +    vm_map_copy_t   *copy_result);   /* OUT */ + +/* + *  vm_map_submap:      [ kernel use only ] + * + *  Mark the given range as handled by a subordinate map. + * + *  This range must have been created with vm_map_find using + *  the vm_submap_object, and no other operations may have been + *  performed on this range prior to calling vm_map_submap. + * + *  Only a limited number of operations can be performed + *  within this rage after calling vm_map_submap: + *      vm_fault + *  [Don't try vm_map_copyin!] + * + *  To remove a submapping, one must first remove the + *  range from the superior map, and then destroy the + *  submap (if desired).  [Better yet, don't try it.] + */ +extern kern_return_t vm_map_submap( +    vm_map_t   map, +    vm_offset_t    start, +    vm_offset_t    end, +    vm_map_t        submap); + +/* + *	Wait and wakeup macros for in_transition map entries. + */ +#define vm_map_entry_wait(map, interruptible)    	\ +        MACRO_BEGIN                                     \ +        assert_wait((event_t)&(map)->hdr, interruptible);	\ +        vm_map_unlock(map);                             \ +	thread_block((void (*)()) 0);			\ +        MACRO_END + +#define vm_map_entry_wakeup(map)        thread_wakeup((event_t)&(map)->hdr) + +/* + *      This routine is called only when it is known that + *      the entry must be split. 
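_vm_map_clip_start and _vm_map_clip_end split one [start, end) entry at a given address so that a subsequent operation applies to exactly part of a mapping. The reduced sketch below shows the arithmetic for the start-clip case; the names are hypothetical, and the real routines additionally fix up the entry links, gap bookkeeping and wiring.

#include <assert.h>
#include <stdio.h>

struct srange { unsigned long start, end, offset; };

/* After the call, *e covers [where, end) and *left covers [start, where). */
static void clip_start(struct srange *e, unsigned long where, struct srange *left)
{
	assert(e->start < where && where < e->end);

	left->start  = e->start;
	left->end    = where;
	left->offset = e->offset;

	e->offset += where - e->start;	/* keep the object offset in sync */
	e->start   = where;
}

int main(void)
{
	struct srange e = { 0x1000, 0x5000, 0 }, left;

	clip_start(&e, 0x3000, &left);
	printf("left  [0x%lx,0x%lx) off=0x%lx\n", left.start, left.end, left.offset);
	printf("right [0x%lx,0x%lx) off=0x%lx\n", e.start, e.end, e.offset);
	return 0;
}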
+ */ +extern void _vm_map_clip_start( +        struct vm_map_header *map_header, +        vm_map_entry_t entry, +        vm_offset_t	start, +        boolean_t	link_gap); + +/* + *      vm_map_clip_end:        [ internal use only ] + * + *      Asserts that the given entry ends at or before + *      the specified address; if necessary, + *      it splits the entry into two. + */ +void _vm_map_clip_end( +	struct vm_map_header 	*map_header, +	vm_map_entry_t		entry, +	vm_offset_t		end, +	boolean_t		link_gap); + +#endif	/* _VM_VM_MAP_H_ */ diff --git a/vm/vm_object.c b/vm/vm_object.c new file mode 100644 index 0000000..c238cce --- /dev/null +++ b/vm/vm_object.c @@ -0,0 +1,2994 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	File:	vm/vm_object.c + *	Author:	Avadis Tevanian, Jr., Michael Wayne Young + * + *	Virtual memory object module. + */ + +#include <kern/printf.h> +#include <string.h> + +#include <mach/memory_object.h> +#include <vm/memory_object_default.user.h> +#include <vm/memory_object_user.user.h> +#include <machine/vm_param.h> +#include <ipc/ipc_port.h> +#include <ipc/ipc_space.h> +#include <kern/assert.h> +#include <kern/debug.h> +#include <kern/mach.server.h> +#include <kern/lock.h> +#include <kern/queue.h> +#include <kern/xpr.h> +#include <kern/slab.h> +#include <vm/memory_object.h> +#include <vm/vm_fault.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> + +#if	MACH_KDB +#include <ddb/db_output.h> +#endif	/* MACH_KDB */ + +void memory_object_release( +	ipc_port_t	pager, +	pager_request_t	pager_request, +	ipc_port_t	pager_name); /* forward */ + +/* + *	Virtual memory objects maintain the actual data + *	associated with allocated virtual memory.  A given + *	page of memory exists within exactly one object. + * + *	An object is only deallocated when all "references" + *	are given up.  Only one "reference" to a given + *	region of an object should be writeable. + * + *	Associated with each object is a list of all resident + *	memory pages belonging to that object; this list is + *	maintained by the "vm_page" module, but locked by the object's + *	lock. 
+ * + *	Each object also records the memory object port + *	that is used by the kernel to request and write + *	back data (the memory object port, field "pager"), + *	and the ports provided to the memory manager, the server that + *	manages that data, to return data and control its + *	use (the memory object control port, field "pager_request") + *	and for naming (the memory object name port, field "pager_name"). + * + *	Virtual memory objects are allocated to provide + *	zero-filled memory (vm_allocate) or map a user-defined + *	memory object into a virtual address space (vm_map). + * + *	Virtual memory objects that refer to a user-defined + *	memory object are called "permanent", because all changes + *	made in virtual memory are reflected back to the + *	memory manager, which may then store it permanently. + *	Other virtual memory objects are called "temporary", + *	meaning that changes need be written back only when + *	necessary to reclaim pages, and that storage associated + *	with the object can be discarded once it is no longer + *	mapped. + * + *	A permanent memory object may be mapped into more + *	than one virtual address space.  Moreover, two threads + *	may attempt to make the first mapping of a memory + *	object concurrently.  Only one thread is allowed to + *	complete this mapping; all others wait for the + *	"pager_initialized" field is asserted, indicating + *	that the first thread has initialized all of the + *	necessary fields in the virtual memory object structure. + * + *	The kernel relies on a *default memory manager* to + *	provide backing storage for the zero-filled virtual + *	memory objects.  The memory object ports associated + *	with these temporary virtual memory objects are only + *	generated and passed to the default memory manager + *	when it becomes necessary.  Virtual memory objects + *	that depend on the default memory manager are called + *	"internal".  The "pager_created" field is provided to + *	indicate whether these ports have ever been allocated. + *	 + *	The kernel may also create virtual memory objects to + *	hold changed pages after a copy-on-write operation. + *	In this case, the virtual memory object (and its + *	backing storage -- its memory object) only contain + *	those pages that have been changed.  The "shadow" + *	field refers to the virtual memory object that contains + *	the remainder of the contents.  The "shadow_offset" + *	field indicates where in the "shadow" these contents begin. + *	The "copy" field refers to a virtual memory object + *	to which changed pages must be copied before changing + *	this object, in order to implement another form + *	of copy-on-write optimization. + * + *	The virtual memory object structure also records + *	the attributes associated with its memory object. + *	The "pager_ready", "can_persist" and "copy_strategy" + *	fields represent those attributes.  The "cached_list" + *	field is used in the implementation of the persistence + *	attribute. + * + * ZZZ Continue this comment. + */ + +struct kmem_cache	vm_object_cache; /* vm backing store cache */ + +/* + *	All wired-down kernel memory belongs to a single virtual + *	memory object (kernel_object) to avoid wasting data structures. + */ +static struct vm_object	kernel_object_store; +vm_object_t		kernel_object = &kernel_object_store; + +/* + *	Virtual memory objects that are not referenced by + *	any address maps, but that are allowed to persist + *	(an attribute specified by the associated memory manager), + *	are kept in a queue (vm_object_cached_list). 
+ * + *	When an object from this queue is referenced again, + *	for example to make another address space mapping, + *	it must be removed from the queue.  That is, the + *	queue contains *only* objects with zero references. + * + *	The kernel may choose to terminate objects from this + *	queue in order to reclaim storage.  The current policy + *	is to let memory pressure dynamically adjust the number + *	of unreferenced objects. The pageout daemon attempts to + *	collect objects after removing pages from them. + * + *	A simple lock (accessed by routines + *	vm_object_cache_{lock,lock_try,unlock}) governs the + *	object cache.  It must be held when objects are + *	added to or removed from the cache (in vm_object_terminate). + *	The routines that acquire a reference to a virtual + *	memory object based on one of the memory object ports + *	must also lock the cache. + * + *	Ideally, the object cache should be more isolated + *	from the reference mechanism, so that the lock need + *	not be held to make simple references. + */ +queue_head_t	vm_object_cached_list; + +def_simple_lock_data(static,vm_object_cached_lock_data) + +#define vm_object_cache_lock()		\ +		simple_lock(&vm_object_cached_lock_data) +#define vm_object_cache_lock_try()	\ +		simple_lock_try(&vm_object_cached_lock_data) +#define vm_object_cache_unlock()	\ +		simple_unlock(&vm_object_cached_lock_data) + +/* + *	Number of physical pages referenced by cached objects. + *	This counter is protected by its own lock to work around + *	lock ordering issues. + */ +int		vm_object_cached_pages; + +def_simple_lock_data(static,vm_object_cached_pages_lock_data) + +/* + *	Virtual memory objects are initialized from + *	a template (see vm_object_allocate). + * + *	When adding a new field to the virtual memory + *	object structure, be sure to add initialization + *	(see vm_object_init). + */ +struct vm_object	vm_object_template; + +/* + *	vm_object_allocate: + * + *	Returns a new object with the given size. + */ + +static void _vm_object_setup( +	vm_object_t	object, +	vm_size_t	size) +{ +	*object = vm_object_template; +	queue_init(&object->memq); +	vm_object_lock_init(object); +	object->size = size; +} + +static vm_object_t _vm_object_allocate( +	vm_size_t		size) +{ +	vm_object_t object; + +	object = (vm_object_t) kmem_cache_alloc(&vm_object_cache); +	if (!object) +		return 0; + +	_vm_object_setup(object, size); + +	return object; +} + +vm_object_t vm_object_allocate( +	vm_size_t	size) +{ +	vm_object_t object; +	ipc_port_t port; + +	object = _vm_object_allocate(size); +	if (object == 0) +		panic("vm_object_allocate"); +	port = ipc_port_alloc_kernel(); +	if (port == IP_NULL) +		panic("vm_object_allocate"); +	object->pager_name = port; +	ipc_kobject_set(port, (ipc_kobject_t) object, IKOT_PAGING_NAME); + +	return object; +} + +/* + *	vm_object_bootstrap: + * + *	Initialize the VM objects module. 
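_vm_object_setup above initializes every new object by structure assignment from vm_object_template and then fills in only the per-object fields, so vm_object_bootstrap remains the single place where default values are chosen. The same template-initialization pattern in miniature, with a hypothetical struct:

#include <stdio.h>
#include <stdlib.h>

struct widget {
	int ref_count;
	int ready;
	unsigned long size;
};

/* Defaults live in exactly one place, as with vm_object_template. */
static const struct widget widget_template = { .ref_count = 1, .ready = 0 };

static struct widget *widget_allocate(unsigned long size)
{
	struct widget *w = malloc(sizeof *w);
	if (w == NULL)
		return NULL;
	*w = widget_template;	/* bulk default initialization */
	w->size = size;		/* per-instance fields only */
	return w;
}

int main(void)
{
	struct widget *w = widget_allocate(4096);
	printf("ref=%d ready=%d size=%lu\n", w->ref_count, w->ready, w->size);
	free(w);
	return 0;
}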
+ */ +void vm_object_bootstrap(void) +{ +	kmem_cache_init(&vm_object_cache, "vm_object", +			sizeof(struct vm_object), 0, NULL, 0); + +	queue_init(&vm_object_cached_list); +	simple_lock_init(&vm_object_cached_lock_data); + +	/* +	 *	Fill in a template object, for quick initialization +	 */ + +	vm_object_template.ref_count = 1; +	vm_object_template.size = 0; +	vm_object_template.resident_page_count = 0; +	vm_object_template.copy = VM_OBJECT_NULL; +	vm_object_template.shadow = VM_OBJECT_NULL; +	vm_object_template.shadow_offset = (vm_offset_t) 0; + +	vm_object_template.pager = IP_NULL; +	vm_object_template.paging_offset = 0; +	vm_object_template.pager_request = PAGER_REQUEST_NULL; +	vm_object_template.pager_name = IP_NULL; + +	vm_object_template.pager_created = FALSE; +	vm_object_template.pager_initialized = FALSE; +	vm_object_template.pager_ready = FALSE; + +	vm_object_template.copy_strategy = MEMORY_OBJECT_COPY_NONE; +		/* ignored if temporary, will be reset before +		 * permanent object becomes ready */ +	vm_object_template.use_shared_copy = FALSE; +	vm_object_template.shadowed = FALSE; + +	vm_object_template.absent_count = 0; +	vm_object_template.all_wanted = 0; /* all bits FALSE */ + +	vm_object_template.paging_in_progress = 0; +	vm_object_template.used_for_pageout = FALSE; +	vm_object_template.can_persist = FALSE; +	vm_object_template.cached = FALSE; +	vm_object_template.internal = TRUE; +	vm_object_template.temporary = TRUE; +	vm_object_template.alive = TRUE; +	vm_object_template.lock_in_progress = FALSE; +	vm_object_template.lock_restart = FALSE; +	vm_object_template.last_alloc = (vm_offset_t) 0; + +#if	MACH_PAGEMAP +	vm_object_template.existence_info = VM_EXTERNAL_NULL; +#endif	/* MACH_PAGEMAP */ + +		/* +	 *	Initialize the "kernel object" +	 */ + +	_vm_object_setup(kernel_object, +		VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS); + +	/* +	 *	Initialize the "submap object".  Make it as large as the +	 *	kernel object so that no limit is imposed on submap sizes. +	 */ + +	_vm_object_setup(vm_submap_object, +		VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS); + +#if	MACH_PAGEMAP +	vm_external_module_initialize(); +#endif	/* MACH_PAGEMAP */ +} + +void vm_object_init(void) +{ +	/* +	 *	Finish initializing the kernel object. +	 *	The submap object doesn't need a name port. +	 */ + +	kernel_object->pager_name = ipc_port_alloc_kernel(); +	ipc_kobject_set(kernel_object->pager_name, +			(ipc_kobject_t) kernel_object, +			IKOT_PAGING_NAME); +} + +/* + *	Object cache management functions. + * + *	Both the cache and the object must be locked + *	before calling these functions. + */ + +static void vm_object_cache_add( +	vm_object_t	object) +{ +	assert(!object->cached); +	queue_enter(&vm_object_cached_list, object, vm_object_t, cached_list); +	object->cached = TRUE; +} + +static void vm_object_cache_remove( +	vm_object_t	object) +{ +	assert(object->cached); +	queue_remove(&vm_object_cached_list, object, vm_object_t, cached_list); +	object->cached = FALSE; +} + +void vm_object_collect( +	vm_object_t	object) +{ +	vm_object_unlock(object); + +	/* +	 *	The cache lock must be acquired in the proper order. +	 */ + +	vm_object_cache_lock(); +	vm_object_lock(object); + +	/* +	 *	If the object was referenced while the lock was +	 *	dropped, cancel the termination. 
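vm_object_collect above must take the cache lock before the object lock, so it drops the object lock, reacquires both in the documented order, and then re-checks vm_object_collectable because a reference may have been gained while the object was unlocked. That drop/reorder/re-validate pattern in a self-contained form, with pthread mutexes standing in for the kernel locks:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cache_lock  = PTHREAD_MUTEX_INITIALIZER;	/* outer */
static pthread_mutex_t object_lock = PTHREAD_MUTEX_INITIALIZER;	/* inner */
static int ref_count = 0;		/* protected by object_lock */

/* Called with object_lock held, like vm_object_collect. */
static void collect(void)
{
	pthread_mutex_unlock(&object_lock);	/* can't take outer while holding inner */

	pthread_mutex_lock(&cache_lock);	/* required order: cache, then object */
	pthread_mutex_lock(&object_lock);

	if (ref_count != 0) {			/* re-validate: state may have changed */
		pthread_mutex_unlock(&object_lock);
		pthread_mutex_unlock(&cache_lock);
		printf("referenced meanwhile, termination cancelled\n");
		return;
	}

	printf("still collectable, terminating\n");
	pthread_mutex_unlock(&object_lock);
	pthread_mutex_unlock(&cache_lock);
}

int main(void)
{
	pthread_mutex_lock(&object_lock);
	collect();
	return 0;
}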
+	 */ + +	if (!vm_object_collectable(object)) { +		vm_object_unlock(object); +		vm_object_cache_unlock(); +		return; +	} + +	vm_object_cache_remove(object); +	vm_object_terminate(object); +} + +/* + *	vm_object_reference: + * + *	Gets another reference to the given object. + */ +void vm_object_reference( +	vm_object_t	object) +{ +	if (object == VM_OBJECT_NULL) +		return; + +	vm_object_lock(object); +	assert(object->ref_count > 0); +	object->ref_count++; +	vm_object_unlock(object); +} + +/* + *	vm_object_deallocate: + * + *	Release a reference to the specified object, + *	gained either through a vm_object_allocate + *	or a vm_object_reference call.  When all references + *	are gone, storage associated with this object + *	may be relinquished. + * + *	No object may be locked. + */ +void vm_object_deallocate( +	vm_object_t	object) +{ +	vm_object_t	temp; + +	while (object != VM_OBJECT_NULL) { + +		/* +		 *	The cache holds a reference (uncounted) to +		 *	the object; we must lock it before removing +		 *	the object. +		 */ + +		vm_object_cache_lock(); + +		/* +		 *	Lose the reference +		 */ +		vm_object_lock(object); +		if (--(object->ref_count) > 0) { + +			/* +			 *	If there are still references, then +			 *	we are done. +			 */ +			vm_object_unlock(object); +			vm_object_cache_unlock(); +			return; +		} + +		/* +		 *	See whether this object can persist.  If so, enter +		 *	it in the cache. +		 */ +		if (object->can_persist && (object->resident_page_count > 0)) { +			vm_object_cache_add(object); +			vm_object_cache_unlock(); +			vm_object_unlock(object); +			return; +		} + +		if (object->pager_created && +		    !object->pager_initialized) { + +			/* +			 *	Have to wait for initialization. +			 *	Put reference back and retry +			 *	when it's initialized. +			 */ + +			object->ref_count++; +			vm_object_assert_wait(object, +				VM_OBJECT_EVENT_INITIALIZED, FALSE); +			vm_object_unlock(object); +			vm_object_cache_unlock(); +			thread_block((void (*)()) 0); +			continue; +		} + +		/* +		 *	Take the reference to the shadow object +		 *	out of the object to be destroyed. +		 */ + +		temp = object->shadow; + +		/* +		 *	Destroy the object; the cache lock will +		 *	be released in the process. +		 */ + +		vm_object_terminate(object); + +		/* +		 *	Deallocate the reference to the shadow +		 *	by continuing the loop with that object +		 *	in place of the original. +		 */ + +		object = temp; +	} +} + +/* + *	Routine:	vm_object_terminate + *	Purpose: + *		Free all resources associated with a vm_object. + *	In/out conditions: + *		Upon entry, the object and the cache must be locked, + *		and the object must have no references. + * + *		The shadow object reference is left alone. + * + *		Upon exit, the cache will be unlocked, and the + *		object will cease to exist. + */ +void vm_object_terminate( +	vm_object_t	object) +{ +	vm_page_t	p; +	vm_object_t	shadow_object; + +	/* +	 *	Make sure the object isn't already being terminated +	 */ + +	assert(object->alive); +	object->alive = FALSE; + +	/* +	 *	Make sure no one can look us up now. +	 */ + +	vm_object_remove(object); +	vm_object_cache_unlock(); + +	/* +	 *	Detach the object from its shadow if we are the shadow's +	 *	copy. 
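vm_object_deallocate above is deliberately a loop rather than a recursive call: when the last reference to an object is dropped, the reference that object held on its shadow is released by continuing the loop with the shadow in its place, so an arbitrarily deep shadow chain unwinds in constant stack space. A minimal model of that iteration (hypothetical node type, no locking):

#include <stdio.h>
#include <stdlib.h>

struct node {
	int		ref_count;
	struct node	*shadow;	/* reference held on the next node */
};

static void node_release(struct node *n)
{
	while (n != NULL) {
		if (--n->ref_count > 0)
			return;			/* still referenced elsewhere: done */

		struct node *next = n->shadow;	/* take over its shadow reference */
		printf("freeing node %p\n", (void *) n);
		free(n);
		n = next;			/* release that reference next */
	}
}

static struct node *node_new(struct node *shadow)
{
	struct node *n = malloc(sizeof *n);
	n->ref_count = 1;
	n->shadow = shadow;
	return n;
}

int main(void)
{
	/* chain of three, each holding the only reference to its shadow */
	struct node *top = node_new(node_new(node_new(NULL)));
	node_release(top);	/* frees all three without recursion */
	return 0;
}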
+	 */ +	if ((shadow_object = object->shadow) != VM_OBJECT_NULL) { +		vm_object_lock(shadow_object); +		assert((shadow_object->copy == object) || +		       (shadow_object->copy == VM_OBJECT_NULL)); +		shadow_object->copy = VM_OBJECT_NULL; +		vm_object_unlock(shadow_object); +	} + +	/* +	 *	The pageout daemon might be playing with our pages. +	 *	Now that the object is dead, it won't touch any more +	 *	pages, but some pages might already be on their way out. +	 *	Hence, we wait until the active paging activities have ceased. +	 */ + +	vm_object_paging_wait(object, FALSE); + +	/* +	 *	Clean or free the pages, as appropriate. +	 *	It is possible for us to find busy/absent pages, +	 *	if some faults on this object were aborted. +	 */ + +	if ((object->temporary) || (object->pager == IP_NULL)) { +		while (!queue_empty(&object->memq)) { +			p = (vm_page_t) queue_first(&object->memq); + +			VM_PAGE_CHECK(p); + +			VM_PAGE_FREE(p); +		} +	} else while (!queue_empty(&object->memq)) { +		p = (vm_page_t) queue_first(&object->memq); + +		VM_PAGE_CHECK(p); + +		vm_page_lock_queues(); +		VM_PAGE_QUEUES_REMOVE(p); +		vm_page_unlock_queues(); + +		if (p->absent || p->private) { + +			/* +			 *	For private pages, VM_PAGE_FREE just +			 *	leaves the page structure around for +			 *	its owner to clean up.  For absent +			 *	pages, the structure is returned to +			 *	the appropriate pool. +			 */ + +			goto free_page; +		} + +		if (!p->dirty) +			p->dirty = pmap_is_modified(p->phys_addr); + +		if (p->dirty || p->precious) { +			p->busy = TRUE; +			vm_pageout_page(p, FALSE, TRUE); /* flush page */ +		} else { +		    free_page: +		    	VM_PAGE_FREE(p); +		} +	} + +	assert(object->ref_count == 0); +	assert(object->paging_in_progress == 0); +	assert(!object->cached); + +	if (!object->internal) { +		assert(object->resident_page_count == 0); + +		vm_page_lock_queues(); +		vm_object_external_count--; +		vm_page_unlock_queues(); +	} + +	/* +	 *	Throw away port rights... note that they may +	 *	already have been thrown away (by vm_object_destroy +	 *	or memory_object_destroy). +	 * +	 *	Instead of destroying the control and name ports, +	 *	we send all rights off to the memory manager instead, +	 *	using memory_object_terminate. +	 */ + +	vm_object_unlock(object); + +	if (object->pager != IP_NULL) { +		/* consumes our rights for pager, pager_request, pager_name */ +		memory_object_release(object->pager, +					     object->pager_request, +					     object->pager_name); +	} else if (object->pager_name != IP_NULL) { +		/* consumes our right for pager_name */ +		ipc_port_dealloc_kernel(object->pager_name); +	} + +#if	MACH_PAGEMAP +	vm_external_destroy(object->existence_info); +#endif	/* MACH_PAGEMAP */ + +	/* +	 *	Free the space for the object. +	 */ + +	kmem_cache_free(&vm_object_cache, (vm_offset_t) object); +} + +/* + *	Routine:	vm_object_pager_wakeup + *	Purpose:	Wake up anyone waiting for IKOT_PAGER_TERMINATING + */ + +void +vm_object_pager_wakeup( +	ipc_port_t	pager) +{ +	boolean_t someone_waiting; + +	/* +	 *	If anyone was waiting for the memory_object_terminate +	 *	to be queued, wake them up now. 
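When vm_object_terminate above disposes of the resident pages, the choice is: free everything if the object is temporary or has no pager, free absent and private pages, push dirty or precious pages to the memory manager, and free clean ones (after re-checking the pmap modify bit, which this sketch omits). Reduced to a pure decision function over a hypothetical page descriptor:

#include <stdbool.h>
#include <stdio.h>

struct tpage { bool absent, private_, dirty, precious; };

enum disposition { DISPOSE_FREE, DISPOSE_FLUSH };

static enum disposition terminate_disposition(const struct tpage *p,
					      bool object_temporary,
					      bool has_pager)
{
	if (object_temporary || !has_pager)
		return DISPOSE_FREE;		/* nothing to write back to */
	if (p->absent || p->private_)
		return DISPOSE_FREE;		/* no data of ours to save */
	if (p->dirty || p->precious)
		return DISPOSE_FLUSH;		/* push to the memory manager */
	return DISPOSE_FREE;			/* clean and reconstructible */
}

int main(void)
{
	struct tpage clean = {0}, dirty = { .dirty = true };
	printf("%d %d %d\n",
	       terminate_disposition(&clean, true,  false),	/* 0: free  */
	       terminate_disposition(&clean, false, true),	/* 0: free  */
	       terminate_disposition(&dirty, false, true));	/* 1: flush */
	return 0;
}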
+	 */ +	vm_object_cache_lock(); +	assert(ip_kotype(pager) == IKOT_PAGER_TERMINATING); +	someone_waiting = (pager->ip_kobject != IKO_NULL); +	if (ip_active(pager)) +		ipc_kobject_set(pager, IKO_NULL, IKOT_NONE); +	vm_object_cache_unlock(); +	if (someone_waiting) { +		thread_wakeup((event_t) pager); +	} +} + +/* + *	Routine:	memory_object_release + *	Purpose:	Terminate the pager and release port rights, + *			just like memory_object_terminate, except + *			that we wake up anyone blocked in vm_object_enter + *			waiting for termination message to be queued + *			before calling memory_object_init. + */ +void memory_object_release( +	ipc_port_t	pager, +	pager_request_t	pager_request, +	ipc_port_t	pager_name) +{ + +	/* +	 *	Keep a reference to pager port; +	 *	the terminate might otherwise release all references. +	 */ +	ip_reference(pager); + +	/* +	 *	Terminate the pager. +	 */ +	(void) memory_object_terminate(pager, pager_request, pager_name); + +	/* +	 *	Wakeup anyone waiting for this terminate +	 */ +	vm_object_pager_wakeup(pager); + +	/* +	 *	Release reference to pager port. +	 */ +	ip_release(pager); +} + +/* + *	Routine:	vm_object_abort_activity [internal use only] + *	Purpose: + *		Abort paging requests pending on this object. + *	In/out conditions: + *		The object is locked on entry and exit. + */ +static void vm_object_abort_activity( +	vm_object_t	object) +{ +	vm_page_t	p; +	vm_page_t	next; + +	/* +	 *	Abort all activity that would be waiting +	 *	for a result on this memory object. +	 * +	 *	We could also choose to destroy all pages +	 *	that we have in memory for this object, but +	 *	we don't. +	 */ + +	p = (vm_page_t) queue_first(&object->memq); +	while (!queue_end(&object->memq, (queue_entry_t) p)) { +		next = (vm_page_t) queue_next(&p->listq); + +		/* +		 *	If it's being paged in, destroy it. +		 *	If an unlock has been requested, start it again. +		 */ + +		if (p->busy && p->absent) { +			VM_PAGE_FREE(p); +		} +		 else { +		 	if (p->unlock_request != VM_PROT_NONE) +			 	p->unlock_request = VM_PROT_NONE; +			PAGE_WAKEUP(p); +		} +		 +		p = next; +	} + +	/* +	 *	Wake up threads waiting for the memory object to +	 *	become ready. +	 */ + +	object->pager_ready = TRUE; +	vm_object_wakeup(object, VM_OBJECT_EVENT_PAGER_READY); +} + +/* + *	Routine:	memory_object_destroy [user interface] + *	Purpose: + *		Shut down a memory object, despite the + *		presence of address map (or other) references + *		to the vm_object. + *	Note: + *		This routine may be called either from the user interface, + *		or from port destruction handling (via vm_object_destroy). + */ +kern_return_t memory_object_destroy( +	vm_object_t	object, +	kern_return_t	reason) +{ +	ipc_port_t	old_object,  old_name; +	pager_request_t	old_control; + +	if (object == VM_OBJECT_NULL) +		return KERN_SUCCESS; + +	/* +	 *	Remove the port associations immediately. +	 * +	 *	This will prevent the memory manager from further +	 *	meddling.  [If it wanted to flush data or make +	 *	other changes, it should have done so before performing +	 *	the destroy call.] +	 */ + +	vm_object_cache_lock(); +	vm_object_lock(object); +	vm_object_remove(object); +	object->can_persist = FALSE; +	vm_object_cache_unlock(); + +	/* +	 *	Rip out the ports from the vm_object now... this +	 *	will prevent new memory_object calls from succeeding. 
+	 */ + +	old_object = object->pager; +	object->pager = IP_NULL; +	 +	old_control = object->pager_request; +	object->pager_request = PAGER_REQUEST_NULL; + +	old_name = object->pager_name; +	object->pager_name = IP_NULL; + + +	/* +	 *	Wait for existing paging activity (that might +	 *	have the old ports) to subside. +	 */ + +	vm_object_paging_wait(object, FALSE); +	vm_object_unlock(object); + +	/* +	 *	Shut down the ports now. +	 * +	 *	[Paging operations may be proceeding concurrently -- +	 *	they'll get the null values established above.] +	 */ + +	if (old_object != IP_NULL) { +		/* consumes our rights for object, control, name */ +		memory_object_release(old_object, old_control, +					     old_name); +	} else if (old_name != IP_NULL) { +		/* consumes our right for name */ +		ipc_port_dealloc_kernel(object->pager_name); +	} + +	/* +	 *	Lose the reference that was donated for this routine +	 */ + +	vm_object_deallocate(object); + +	return KERN_SUCCESS; +} + +/* + *	Routine:	vm_object_pmap_protect + * + *	Purpose: + *		Reduces the permission for all physical + *		pages in the specified object range. + * + *		If removing write permission only, it is + *		sufficient to protect only the pages in + *		the top-level object; only those pages may + *		have write permission. + * + *		If removing all access, we must follow the + *		shadow chain from the top-level object to + *		remove access to all pages in shadowed objects. + * + *		The object must *not* be locked.  The object must + *		be temporary/internal.   + * + *              If pmap is not NULL, this routine assumes that + *              the only mappings for the pages are in that + *              pmap. + */ +boolean_t vm_object_pmap_protect_by_page = FALSE; + +void vm_object_pmap_protect( +	vm_object_t		object, +	vm_offset_t		offset, +	vm_size_t		size, +	pmap_t			pmap, +	vm_offset_t		pmap_start, +	vm_prot_t		prot) +{ +	if (object == VM_OBJECT_NULL) +	    return; + +	vm_object_lock(object); + +	assert(object->temporary && object->internal); + +	while (TRUE) { +	    if (object->resident_page_count > atop(size) / 2 && +		    pmap != PMAP_NULL) { +		vm_object_unlock(object); +		pmap_protect(pmap, pmap_start, pmap_start + size, prot); +		return; +	    } + +	    { +		vm_page_t	p; +		vm_offset_t	end; + +		end = offset + size; + +		queue_iterate(&object->memq, p, vm_page_t, listq) { +		    if (!p->fictitious && +			(offset <= p->offset) && +			(p->offset < end)) { +			if ((pmap == PMAP_NULL) || +			    vm_object_pmap_protect_by_page) { +			    pmap_page_protect(p->phys_addr, +					      prot & ~p->page_lock); +			} else { +			    vm_offset_t	start = +					pmap_start + +					(p->offset - offset); + +			    pmap_protect(pmap, +					 start, +					 start + PAGE_SIZE, +					 prot); +			} +		    } +		} +	    } + +	    if (prot == VM_PROT_NONE) { +		/* +		 * Must follow shadow chain to remove access +		 * to pages in shadowed objects. +		 */ +		vm_object_t	next_object; + +		next_object = object->shadow; +		if (next_object != VM_OBJECT_NULL) { +		    offset += object->shadow_offset; +		    vm_object_lock(next_object); +		    vm_object_unlock(object); +		    object = next_object; +		} +		else { +		    /* +		     * End of chain - we are done. +		     */ +		    break; +		} +	    } +	    else { +		/* +		 * Pages in shadowed objects may never have +		 * write permission - we may stop here. 
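vm_object_pmap_protect above chooses between two strategies: when more than half of the covered pages are resident and a single pmap is known, one pmap_protect over the whole virtual range is cheaper; otherwise it visits only the resident pages. A reduced sketch of that choice, with the protect operations replaced by prints and hypothetical types:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define atop(x)   ((x) / PAGE_SIZE)

struct rpage { unsigned long offset; };	/* a resident page of the object */

static void protect_range(const struct rpage *pages, int resident_count,
			  unsigned long offset, unsigned long size,
			  unsigned long pmap_start)
{
	if ((unsigned long) resident_count > atop(size) / 2) {
		/* dense: a single call over the whole mapped range wins */
		printf("pmap_protect [0x%lx, 0x%lx)\n",
		       pmap_start, pmap_start + size);
		return;
	}
	/* sparse: touch only pages that are actually resident */
	for (int i = 0; i < resident_count; i++) {
		unsigned long off = pages[i].offset;
		if (offset <= off && off < offset + size)
			printf("pmap_page_protect page at offset 0x%lx\n", off);
	}
}

int main(void)
{
	struct rpage pages[] = { {0x0000}, {0x3000} };
	protect_range(pages, 2, 0, 16 * PAGE_SIZE, 0x10000000UL);	/* sparse path */
	protect_range(pages, 2, 0,  2 * PAGE_SIZE, 0x10000000UL);	/* dense path  */
	return 0;
}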
+		 */ +		break; +	    } +	} + +	vm_object_unlock(object); +} + +/* + *	vm_object_pmap_remove: + * + *	Removes all physical pages in the specified + *	object range from all physical maps. + * + *	The object must *not* be locked. + */ +void vm_object_pmap_remove( +	vm_object_t	object, +	vm_offset_t	start, +	vm_offset_t	end) +{ +	vm_page_t	p; + +	if (object == VM_OBJECT_NULL) +		return; + +	vm_object_lock(object); +	queue_iterate(&object->memq, p, vm_page_t, listq) { +		if (!p->fictitious && +		    (start <= p->offset) && +		    (p->offset < end)) +			pmap_page_protect(p->phys_addr, VM_PROT_NONE); +	} +	vm_object_unlock(object); +} + +/* + *	Routine:	vm_object_copy_slowly + * + *	Description: + *		Copy the specified range of the source + *		virtual memory object without using + *		protection-based optimizations (such + *		as copy-on-write).  The pages in the + *		region are actually copied. + * + *	In/out conditions: + *		The caller must hold a reference and a lock + *		for the source virtual memory object.  The source + *		object will be returned *unlocked*. + * + *	Results: + *		If the copy is completed successfully, KERN_SUCCESS is + *		returned.  If the caller asserted the interruptible + *		argument, and an interruption occurred while waiting + *		for a user-generated event, MACH_SEND_INTERRUPTED is + *		returned.  Other values may be returned to indicate + *		hard errors during the copy operation. + * + *		A new virtual memory object is returned in a + *		parameter (_result_object).  The contents of this + *		new object, starting at a zero offset, are a copy + *		of the source memory region.  In the event of + *		an error, this parameter will contain the value + *		VM_OBJECT_NULL. + */ +kern_return_t vm_object_copy_slowly( +	vm_object_t	src_object, +	vm_offset_t	src_offset, +	vm_size_t	size, +	boolean_t	interruptible, +	vm_object_t	*_result_object)	/* OUT */ +{ +	vm_object_t	new_object; +	vm_offset_t	new_offset; + +	if (size == 0) { +		vm_object_unlock(src_object); +		*_result_object = VM_OBJECT_NULL; +		return KERN_INVALID_ARGUMENT; +	} + +	/* +	 *	Prevent destruction of the source object while we copy. +	 */ + +	assert(src_object->ref_count > 0); +	src_object->ref_count++; +	vm_object_unlock(src_object); + +	/* +	 *	Create a new object to hold the copied pages. +	 *	A few notes: +	 *		We fill the new object starting at offset 0, +	 *		 regardless of the input offset. +	 *		We don't bother to lock the new object within +	 *		 this routine, since we have the only reference. +	 */ + +	new_object = vm_object_allocate(size); +	new_offset = 0; + +	assert(size == trunc_page(size));	/* Will the loop terminate? */ + +	for ( ; +	    size != 0 ; +	    src_offset += PAGE_SIZE, new_offset += PAGE_SIZE, size -= PAGE_SIZE +	    ) { +		vm_page_t	new_page; +		vm_fault_return_t result; + +		while ((new_page = vm_page_alloc(new_object, new_offset)) +				== VM_PAGE_NULL) { +			VM_PAGE_WAIT((void (*)()) 0); +		} + +		do { +			vm_prot_t	prot = VM_PROT_READ; +			vm_page_t	_result_page; +			vm_page_t	top_page; +			vm_page_t	result_page; + +			vm_object_lock(src_object); +			src_object->paging_in_progress++; + +			result = vm_fault_page(src_object, src_offset, +				VM_PROT_READ, FALSE, interruptible, +				&prot, &_result_page, &top_page, +				FALSE, (void (*)()) 0); + +			switch(result) { +				case VM_FAULT_SUCCESS: +					result_page = _result_page; + +					/* +					 *	We don't need to hold the object +					 *	lock -- the busy page will be enough. 
+					 *	[We don't care about picking up any +					 *	new modifications.] +					 * +					 *	Copy the page to the new object. +					 * +					 *	POLICY DECISION: +					 *		If result_page is clean, +					 *		we could steal it instead +					 *		of copying. +					 */ + +					vm_object_unlock(result_page->object); +					vm_page_copy(result_page, new_page); + +					/* +					 *	Let go of both pages (make them +					 *	not busy, perform wakeup, activate). +					 */ + +					new_page->busy = FALSE; +					new_page->dirty = TRUE; +					vm_object_lock(result_page->object); +					PAGE_WAKEUP_DONE(result_page); + +					vm_page_lock_queues(); +					if (!result_page->active && +					    !result_page->inactive) +						vm_page_activate(result_page); +					vm_page_activate(new_page); +					vm_page_unlock_queues(); + +					/* +					 *	Release paging references and +					 *	top-level placeholder page, if any. +					 */ + +					vm_fault_cleanup(result_page->object, +							top_page); + +					break; +				 +				case VM_FAULT_RETRY: +					break; + +				case VM_FAULT_MEMORY_SHORTAGE: +					VM_PAGE_WAIT((void (*)()) 0); +					break; + +				case VM_FAULT_FICTITIOUS_SHORTAGE: +					vm_page_more_fictitious(); +					break; + +				case VM_FAULT_INTERRUPTED: +					vm_page_free(new_page); +					vm_object_deallocate(new_object); +					vm_object_deallocate(src_object); +					*_result_object = VM_OBJECT_NULL; +					return MACH_SEND_INTERRUPTED; + +				case VM_FAULT_MEMORY_ERROR: +					/* +					 * A policy choice: +					 *	(a) ignore pages that we can't +					 *	    copy +					 *	(b) return the null object if +					 *	    any page fails [chosen] +					 */ + +					vm_page_free(new_page); +					vm_object_deallocate(new_object); +					vm_object_deallocate(src_object); +					*_result_object = VM_OBJECT_NULL; +					return KERN_MEMORY_ERROR; +			} +		} while (result != VM_FAULT_SUCCESS); +	} + +	/* +	 *	Lose the extra reference, and return our object. +	 */ + +	vm_object_deallocate(src_object); +	*_result_object = new_object; +	return KERN_SUCCESS; +} + +/* + *	Routine:	vm_object_copy_temporary + * + *	Purpose: + *		Copy the specified range of the source virtual + *		memory object, if it can be done without blocking. + * + *	Results: + *		If the copy is successful, the copy is returned in + *		the arguments; otherwise, the arguments are not + *		affected. + * + *	In/out conditions: + *		The object should be unlocked on entry and exit. + */ + +boolean_t vm_object_copy_temporary( +	vm_object_t	*_object,		/* INOUT */ +	vm_offset_t	*_offset,		/* INOUT */ +	boolean_t	*_src_needs_copy,	/* OUT */ +	boolean_t	*_dst_needs_copy)	/* OUT */ +{ +	vm_object_t	object = *_object; + +	if (object == VM_OBJECT_NULL) { +		*_src_needs_copy = FALSE; +		*_dst_needs_copy = FALSE; +		return TRUE; +	} + +	/* +	 *	If the object is temporary, we can perform +	 *	a symmetric copy-on-write without asking. +	 */ + +	vm_object_lock(object); +	if (object->temporary) { + +		/* +		 *	Shared objects use delayed copy +		 */ +		if (object->use_shared_copy) { + +			/* +			 *	Asymmetric copy strategy.  Destination +			 *	must be copied (to allow copy object reuse). +			 *	Source is unaffected. +			 */ +			vm_object_unlock(object); +			object = vm_object_copy_delayed(object); +			*_object = object; +			*_src_needs_copy = FALSE; +			*_dst_needs_copy = TRUE; +			return TRUE; +		} + +		/* +		 *	Make another reference to the object. +		 * +		 *	Leave object/offset unchanged. 
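The inner do/while in vm_object_copy_slowly above treats vm_fault_page's transient results (retry, memory shortage, fictitious-page shortage) as reasons to wait and try again, and only an interruption or a hard memory error abandons the copy. A stripped-down retry skeleton with the fault replaced by a stub and illustrative names:

#include <stdio.h>

enum fault_result {
	FAULT_SUCCESS,
	FAULT_RETRY,
	FAULT_MEMORY_SHORTAGE,
	FAULT_INTERRUPTED,
	FAULT_MEMORY_ERROR,
};

static enum fault_result fake_fault(int *attempts)
{
	/* pretend the first two attempts hit transient conditions */
	switch ((*attempts)++) {
	case 0:  return FAULT_RETRY;
	case 1:  return FAULT_MEMORY_SHORTAGE;
	default: return FAULT_SUCCESS;
	}
}

static int copy_one_page(void)
{
	int attempts = 0;
	enum fault_result r;

	do {
		r = fake_fault(&attempts);
		switch (r) {
		case FAULT_SUCCESS:
			printf("page copied after %d attempts\n", attempts);
			break;
		case FAULT_RETRY:
			break;				/* just try again */
		case FAULT_MEMORY_SHORTAGE:
			/* kernel: VM_PAGE_WAIT(); here we simply retry */
			break;
		case FAULT_INTERRUPTED:
		case FAULT_MEMORY_ERROR:
			return -1;			/* abandon the copy */
		}
	} while (r != FAULT_SUCCESS);

	return 0;
}

int main(void)
{
	return copy_one_page() == 0 ? 0 : 1;
}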
+		 */ + +		assert(object->ref_count > 0); +		object->ref_count++; +		object->shadowed = TRUE; +		vm_object_unlock(object); + +		/* +		 *	Both source and destination must make +		 *	shadows, and the source must be made +		 *	read-only if not already. +		 */ + +		*_src_needs_copy = TRUE; +		*_dst_needs_copy = TRUE; +		return TRUE; +	} + +	if (object->pager_ready && +	    (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY)) { +	    	/* XXX Do something intelligent (see temporary code above) */ +	} +	vm_object_unlock(object); + +	return FALSE; +} + +/* + *	Routine:	vm_object_copy_call [internal] + * + *	Description: + *		Copy the specified (src_offset, size) portion + *		of the source object (src_object), using the + *		user-managed copy algorithm. + * + *	In/out conditions: + *		The source object must be locked on entry.  It + *		will be *unlocked* on exit. + * + *	Results: + *		If the copy is successful, KERN_SUCCESS is returned. + *		This routine is interruptible; if a wait for + *		a user-generated event is interrupted, MACH_SEND_INTERRUPTED + *		is returned.  Other return values indicate hard errors + *		in creating the user-managed memory object for the copy. + * + *		A new object that represents the copied virtual + *		memory is returned in a parameter (*_result_object). + *		If the return value indicates an error, this parameter + *		is not valid. + */ +static kern_return_t vm_object_copy_call( +	vm_object_t	src_object, +	vm_offset_t	src_offset, +	vm_size_t	size, +	vm_object_t	*_result_object)	/* OUT */ +{ +	vm_offset_t	src_end = src_offset + size; +	ipc_port_t	new_memory_object; +	vm_object_t	new_object; +	vm_page_t	p; + +	/* +	 *	Create a memory object port to be associated +	 *	with this new vm_object. +	 * +	 *	Since the kernel has the only rights to this +	 *	port, we need not hold the cache lock. +	 * +	 *	Since we have the only object reference, we +	 *	need not be worried about collapse operations. +	 * +	 */ + +	new_memory_object = ipc_port_alloc_kernel(); +	if (new_memory_object == IP_NULL) +		return KERN_RESOURCE_SHORTAGE; + +	/* +	 *	Set the backing object for the new +	 *	temporary object. +	 */ + +	assert(src_object->ref_count > 0); +	src_object->ref_count++; +	vm_object_paging_begin(src_object); +	vm_object_unlock(src_object); + +	/* we hold a naked receive right for new_memory_object */ +	(void) ipc_port_make_send(new_memory_object); +	/* now we also hold a naked send right for new_memory_object */ + +	/* +	 *	Let the memory manager know that a copy operation +	 *	is in progress.  Note that we're using the old +	 *	memory object's ports (for which we're holding +	 *	a paging reference)... the memory manager cannot +	 *	yet affect the new memory object. +	 */ + +	(void) memory_object_copy(src_object->pager, +				src_object->pager_request, +				src_offset, size, +				new_memory_object); +	/* no longer hold the naked receive right for new_memory_object */ + +	vm_object_lock(src_object); +	vm_object_paging_end(src_object); + +	/* +	 *	Remove write access from all of the pages of +	 *	the old memory object that we can. 
+	 */ + +	queue_iterate(&src_object->memq, p, vm_page_t, listq) { +	    if (!p->fictitious && +		(src_offset <= p->offset) && +		(p->offset < src_end) && +		!(p->page_lock & VM_PROT_WRITE)) { +		p->page_lock |= VM_PROT_WRITE; +		pmap_page_protect(p->phys_addr, VM_PROT_ALL & ~p->page_lock); +	    } +	} + +	vm_object_unlock(src_object); +		 +	/* +	 *	Initialize the rest of the paging stuff +	 */ + +	new_object = vm_object_enter(new_memory_object, size, FALSE); +	assert(new_object); +	new_object->shadow = src_object; +	new_object->shadow_offset = src_offset; + +	/* +	 *	Drop the reference for new_memory_object taken above. +	 */ + +	ipc_port_release_send(new_memory_object); +	/* no longer hold the naked send right for new_memory_object */ + +	*_result_object = new_object; +	return KERN_SUCCESS; +} + +/* + *	Routine:	vm_object_copy_delayed [internal] + * + *	Description: + *		Copy the specified virtual memory object, using + *		the asymmetric copy-on-write algorithm. + * + *	In/out conditions: + *		The object must be unlocked on entry. + * + *		This routine will not block waiting for user-generated + *		events.  It is not interruptible. + */ +vm_object_t vm_object_copy_delayed( +	vm_object_t	src_object) +{ +	vm_object_t	new_copy; +	vm_object_t	old_copy; +	vm_page_t	p; + +	/* +	 *	The user-level memory manager wants to see +	 *	all of the changes to this object, but it +	 *	has promised not to make any changes on its own. +	 * +	 *	Perform an asymmetric copy-on-write, as follows: +	 *		Create a new object, called a "copy object" +	 *		 to hold pages modified by the new mapping +	 *		 (i.e., the copy, not the original mapping). +	 *		Record the original object as the backing +	 *		 object for the copy object.  If the +	 *		 original mapping does not change a page, +	 *		 it may be used read-only by the copy. +	 *		Record the copy object in the original +	 *		 object.  When the original mapping causes +	 *		 a page to be modified, it must be copied +	 *		 to a new page that is "pushed" to the +	 *		 copy object. +	 *		Mark the new mapping (the copy object) +	 *		 copy-on-write.  This makes the copy +	 *		 object itself read-only, allowing it +	 *		 to be reused if the original mapping +	 *		 makes no changes, and simplifying the +	 *		 synchronization required in the "push" +	 *		 operation described above. +	 * +	 *	The copy-on-write is said to be asymmetric because +	 *	the original object is *not* marked copy-on-write. +	 *	A copied page is pushed to the copy object, regardless +	 *	which party attempted to modify the page. +	 * +	 *	Repeated asymmetric copy operations may be done. +	 *	If the original object has not been changed since +	 *	the last copy, its copy object can be reused. +	 *	Otherwise, a new copy object can be inserted +	 *	between the original object and its previous +	 *	copy object.  Since any copy object is read-only, +	 *	this cannot affect the contents of the previous copy +	 *	object. +	 * +	 *	Note that a copy object is higher in the object +	 *	tree than the original object; therefore, use of +	 *	the copy object recorded in the original object +	 *	must be done carefully, to avoid deadlock. +	 */ + +	/* +	 *	Allocate a new copy object before locking, even +	 *	though we may not need it later. +	 */ + +	new_copy = vm_object_allocate(src_object->size); + +	vm_object_lock(src_object); + +	/* +	 *	See whether we can reuse the result of a previous +	 *	copy operation. 
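+	 *
+	 *	[Editorial sketch, not part of the original source: when no
+	 *	 previous copy object can be reused, the relationships this
+	 *	 routine establishes on return amount to:
+	 *
+	 *		assert(new_copy->shadow == src_object);
+	 *		assert(new_copy->shadow_offset == 0);
+	 *		assert(src_object->copy == new_copy);
+	 *
+	 *	 and, if a modified older copy object existed, that old copy
+	 *	 now shadows new_copy instead of src_object.]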
+	 */ + Retry: +	old_copy = src_object->copy; +	if (old_copy != VM_OBJECT_NULL) { +		/* +		 *	Try to get the locks (out of order) +		 */ +		if (!vm_object_lock_try(old_copy)) { +			vm_object_unlock(src_object); + +			simple_lock_pause();	/* wait a bit */ + +			vm_object_lock(src_object); +			goto Retry; +		} + +		/* +		 *	Determine whether the old copy object has +		 *	been modified. +		 */ + +		if (old_copy->resident_page_count == 0 && +		    !old_copy->pager_created) { +			/* +			 *	It has not been modified. +			 * +			 *	Return another reference to +			 *	the existing copy-object. +			 */ +			assert(old_copy->ref_count > 0); +			old_copy->ref_count++; +			vm_object_unlock(old_copy); +			vm_object_unlock(src_object); + +			vm_object_deallocate(new_copy); + +			return old_copy; +		} + +		/* +		 *	The copy-object is always made large enough to +		 *	completely shadow the original object, since +		 *	it may have several users who want to shadow +		 *	the original object at different points. +		 */ + +		assert((old_copy->shadow == src_object) && +		    (old_copy->shadow_offset == (vm_offset_t) 0)); + +		/* +		 *	Make the old copy-object shadow the new one. +		 *	It will receive no more pages from the original +		 *	object. +		 */ + +		src_object->ref_count--;	/* remove ref. from old_copy */ +		assert(src_object->ref_count > 0); +		old_copy->shadow = new_copy; +		assert(new_copy->ref_count > 0); +		new_copy->ref_count++; +		vm_object_unlock(old_copy);	/* done with old_copy */ +	} + +	/* +	 *	Point the new copy at the existing object. +	 */ + +	new_copy->shadow = src_object; +	new_copy->shadow_offset = 0; +	new_copy->shadowed = TRUE;	/* caller must set needs_copy */ +	assert(src_object->ref_count > 0); +	src_object->ref_count++; +	src_object->copy = new_copy; + +	/* +	 *	Mark all pages of the existing object copy-on-write. +	 *	This object may have a shadow chain below it, but +	 *	those pages will already be marked copy-on-write. +	 */ + +	queue_iterate(&src_object->memq, p, vm_page_t, listq) { +	    if (!p->fictitious) +		pmap_page_protect(p->phys_addr,  +				  (VM_PROT_ALL & ~VM_PROT_WRITE & +				   ~p->page_lock)); +	} + +	vm_object_unlock(src_object); +	 +	return new_copy; +} + +/* + *	Routine:	vm_object_copy_strategically + * + *	Purpose: + *		Perform a copy according to the source object's + *		declared strategy.  This operation may block, + *		and may be interrupted. + */ +kern_return_t	vm_object_copy_strategically( +	vm_object_t	src_object, +	vm_offset_t	src_offset, +	vm_size_t	size, +	vm_object_t	*dst_object,	/* OUT */ +	vm_offset_t	*dst_offset,	/* OUT */ +	boolean_t	*dst_needs_copy) /* OUT */ +{ +	kern_return_t	result = KERN_SUCCESS;	/* to quiet gcc warnings */ +	boolean_t	interruptible = TRUE; /* XXX */ + +	assert(src_object != VM_OBJECT_NULL); + +	vm_object_lock(src_object); + +	/* XXX assert(!src_object->temporary);  JSB FIXME */ + +	/* +	 *	The copy strategy is only valid if the memory manager +	 *	is "ready". +	 */ + +	while (!src_object->pager_ready) { +		vm_object_wait(	src_object, +				VM_OBJECT_EVENT_PAGER_READY, +				interruptible); +		if (interruptible && +		    (current_thread()->wait_result != THREAD_AWAKENED)) { +			*dst_object = VM_OBJECT_NULL; +			*dst_offset = 0; +			*dst_needs_copy = FALSE; +			return MACH_SEND_INTERRUPTED; +		} +		vm_object_lock(src_object); +	} + +	/* +	 *	The object may be temporary (even though it is external). +	 *	If so, do a symmetric copy. +	 */ + +	if (src_object->temporary) { +		/* +		 *	XXX +		 *	This does not count as intelligent! 
+		 *	This buys us the object->temporary optimizations, +		 *	but we aren't using a symmetric copy, +		 *	which may confuse the vm code. The correct thing +		 *	to do here is to figure out what to call to get +		 *	a temporary shadowing set up. +		 */ +		src_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; +	} + +	/* +	 *	The object is permanent. Use the appropriate copy strategy. +	 */ + +	switch (src_object->copy_strategy) { +	    case MEMORY_OBJECT_COPY_NONE: +		if ((result = vm_object_copy_slowly( +					src_object, +					src_offset, +					size, +					interruptible, +					dst_object)) +		    == KERN_SUCCESS) { +			*dst_offset = 0; +			*dst_needs_copy = FALSE; +		} +		break; + +	    case MEMORY_OBJECT_COPY_CALL: +		if ((result = vm_object_copy_call(	 +				src_object, +				src_offset, +				size, +				dst_object)) +		    == KERN_SUCCESS) { +			*dst_offset = 0; +			*dst_needs_copy = FALSE; +		} +		break; + +	    case MEMORY_OBJECT_COPY_DELAY: +		vm_object_unlock(src_object); +		*dst_object = vm_object_copy_delayed(src_object); +		*dst_offset = src_offset; +		*dst_needs_copy = TRUE; + +		result = KERN_SUCCESS; +		break; +	} + +	return result; +} + +/* + *	vm_object_shadow: + * + *	Create a new object which is backed by the + *	specified existing object range.  The source + *	object reference is deallocated. + * + *	The new object and offset into that object + *	are returned in the source parameters. + */ + +void vm_object_shadow( +	vm_object_t	*object,	/* IN/OUT */ +	vm_offset_t	*offset,	/* IN/OUT */ +	vm_size_t	length) +{ +	vm_object_t	source; +	vm_object_t	result; + +	source = *object; + +	/* +	 *	Allocate a new object with the given length +	 */ + +	if ((result = vm_object_allocate(length)) == VM_OBJECT_NULL) +		panic("vm_object_shadow: no object for shadowing"); + +	/* +	 *	The new object shadows the source object, adding +	 *	a reference to it.  Our caller changes his reference +	 *	to point to the new object, removing a reference to +	 *	the source object.  Net result: no change of reference +	 *	count. +	 */ +	result->shadow = source; +	 +	/* +	 *	Store the offset into the source object, +	 *	and fix up the offset into the new object. +	 */ + +	result->shadow_offset = *offset; + +	/* +	 *	Return the new things +	 */ + +	*offset = 0; +	*object = result; +} + +/* + *	The relationship between vm_object structures and + *	the memory_object ports requires careful synchronization. + * + *	All associations are created by vm_object_enter.  All three + *	port fields are filled in, as follows: + *		pager:	the memory_object port itself, supplied by + *			the user requesting a mapping (or the kernel, + *			when initializing internal objects); the + *			kernel simulates holding send rights by keeping + *			a port reference; + *		pager_request: + *		pager_name: + *			the memory object control and name ports, + *			created by the kernel; the kernel holds + *			receive (and ownership) rights to these + *			ports, but no other references. + *	All of the ports are referenced by their global names. + * + *	When initialization is complete, the "initialized" field + *	is asserted.  Other mappings using a particular memory object, + *	and any references to the vm_object gained through the + *	port association must wait for this initialization to occur. + * + *	In order to allow the memory manager to set attributes before + *	requests (notably virtual copy operations, but also data or + *	unlock requests) are made, a "ready" attribute is made available. 
+ *	Only the memory manager may affect the value of this attribute. + *	Its value does not affect critical kernel functions, such as + *	internal object initialization or destruction.  [Furthermore, + *	memory objects created by the kernel are assumed to be ready + *	immediately; the default memory manager need not explicitly + *	set the "ready" attribute.] + * + *	[Both the "initialized" and "ready" attribute wait conditions + *	use the "pager" field as the wait event.] + * + *	The port associations can be broken down by any of the + *	following routines: + *		vm_object_terminate: + *			No references to the vm_object remain, and + *			the object cannot (or will not) be cached. + *			This is the normal case, and is done even + *			though one of the other cases has already been + *			done. + *		vm_object_destroy: + *			The memory_object port has been destroyed, + *			meaning that the kernel cannot flush dirty + *			pages or request new data or unlock existing + *			data. + *		memory_object_destroy: + *			The memory manager has requested that the + *			kernel relinquish rights to the memory object + *			port.  [The memory manager may not want to + *			destroy the port, but may wish to refuse or + *			tear down existing memory mappings.] + *	Each routine that breaks an association must break all of + *	them at once.  At some later time, that routine must clear + *	the vm_object port fields and release the port rights. + *	[Furthermore, each routine must cope with the simultaneous + *	or previous operations of the others.] + * + *	In addition to the lock on the object, the vm_object_cache_lock + *	governs the port associations.  References gained through the + *	port association require use of the cache lock. + * + *	Because the port fields may be cleared spontaneously, they + *	cannot be used to determine whether a memory object has + *	ever been associated with a particular vm_object.  [This + *	knowledge is important to the shadow object mechanism.] + *	For this reason, an additional "created" attribute is + *	provided. + * + *	During various paging operations, the port values found in the + *	vm_object must be valid.  To prevent these port rights from being + *	released, and to prevent the port associations from changing + *	(other than being removed, i.e., made null), routines may use + *	the vm_object_paging_begin/end routines [actually, macros]. + *	The implementation uses the "paging_in_progress" and "wanted" fields. + *	[Operations that alter the validity of the port values include the + *	termination routines and vm_object_collapse.] 
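+ *
+ *	[Editorial sketch, not part of the original source: the usual shape
+ *	 of code that needs the port values to stay valid, as described
+ *	 above and as used by vm_object_copy_call earlier in this file.]
+ *
+ *		vm_object_lock(object);
+ *		vm_object_paging_begin(object);
+ *		vm_object_unlock(object);
+ *
+ *		... use object->pager and object->pager_request ...
+ *
+ *		vm_object_lock(object);
+ *		vm_object_paging_end(object);
+ *		vm_object_unlock(object);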
+ */ + +vm_object_t vm_object_lookup( +	ipc_port_t	port) +{ +	vm_object_t	object = VM_OBJECT_NULL; + +	if (IP_VALID(port)) { +		ip_lock(port); +		if (ip_active(port) && +		    (ip_kotype(port) == IKOT_PAGING_REQUEST)) { +			vm_object_cache_lock(); +			object = (vm_object_t) port->ip_kobject; +			vm_object_lock(object); + +			assert(object->alive); + +			if (object->ref_count == 0) +				vm_object_cache_remove(object); + +			object->ref_count++; +			vm_object_unlock(object); +			vm_object_cache_unlock(); +		} +		ip_unlock(port); +	} + +	return object; +} + +vm_object_t vm_object_lookup_name( +	ipc_port_t	port) +{ +	vm_object_t	object = VM_OBJECT_NULL; + +	if (IP_VALID(port)) { +		ip_lock(port); +		if (ip_active(port) && +		    (ip_kotype(port) == IKOT_PAGING_NAME)) { +			vm_object_cache_lock(); +			object = (vm_object_t) port->ip_kobject; +			vm_object_lock(object); + +			assert(object->alive); + +			if (object->ref_count == 0) +				vm_object_cache_remove(object); + +			object->ref_count++; +			vm_object_unlock(object); +			vm_object_cache_unlock(); +		} +		ip_unlock(port); +	} + +	return object; +} + +void vm_object_destroy( +	ipc_port_t	pager) +{ +	vm_object_t	object; +	pager_request_t	old_request; +	ipc_port_t	old_name; + +	/* +	 *	Perform essentially the same operations as in vm_object_lookup, +	 *	except that this time we look up based on the memory_object +	 *	port, not the control port. +	 */ +	vm_object_cache_lock(); +	if (ip_kotype(pager) != IKOT_PAGER) { +		vm_object_cache_unlock(); +		return; +	} + +	object = (vm_object_t) pager->ip_kobject; +	vm_object_lock(object); +	if (object->ref_count == 0) +		vm_object_cache_remove(object); +	object->ref_count++; + +	object->can_persist = FALSE; + +	assert(object->pager == pager); + +	/* +	 *	Remove the port associations. +	 * +	 *	Note that the memory_object itself is dead, so +	 *	we don't bother with it. +	 */ + +	object->pager = IP_NULL; +	vm_object_remove(object); + +	old_request = object->pager_request; +	object->pager_request = PAGER_REQUEST_NULL; + +	old_name = object->pager_name; +	object->pager_name = IP_NULL; + +	vm_object_unlock(object); +	vm_object_cache_unlock(); + +	/* +	 *	Clean up the port references.  Note that there's no +	 *	point in trying the memory_object_terminate call +	 *	because the memory_object itself is dead. +	 */ + +	ipc_port_release_send(pager); +	if (old_request != IP_NULL) +		ipc_port_dealloc_kernel(old_request); +	if (old_name != IP_NULL) +		ipc_port_dealloc_kernel(old_name); + +	/* +	 *	Restart pending page requests +	 */ + +	vm_object_abort_activity(object); + +	/* +	 *	Lose the object reference. +	 */ + +	vm_object_deallocate(object); +} + +/* + *	Routine:	vm_object_enter + *	Purpose: + *		Find a VM object corresponding to the given + *		pager; if no such object exists, create one, + *		and initialize the pager. + */ +vm_object_t vm_object_enter( +	ipc_port_t	pager, +	vm_size_t	size, +	boolean_t	internal) +{ +	vm_object_t	object; +	vm_object_t	new_object; +	boolean_t	must_init; +	ipc_kobject_type_t po; + +restart: +	if (!IP_VALID(pager)) +		return vm_object_allocate(size); + +	new_object = VM_OBJECT_NULL; +	must_init = FALSE; + +	/* +	 *	Look for an object associated with this port. +	 */ + +	vm_object_cache_lock(); +	for (;;) { +		po = ip_kotype(pager); + +		/* +		 *	If a previous object is being terminated, +		 *	we must wait for the termination message +		 *	to be queued. +		 * +		 *	We set kobject to a non-null value to let the +		 *	terminator know that someone is waiting. 
+		 *	Among the possibilities is that the port +		 *	could die while we're waiting.  Must restart +		 *	instead of continuing the loop. +		 */ + +		if (po == IKOT_PAGER_TERMINATING) { +			pager->ip_kobject = (ipc_kobject_t) pager; +			assert_wait((event_t) pager, FALSE); +			vm_object_cache_unlock(); +			thread_block((void (*)()) 0); +			goto restart; +		} + +		/* +		 *	Bail if there is already a kobject associated +		 *	with the pager port. +		 */ +		if (po != IKOT_NONE) { +			break; +		} + +		/* +		 *	We must unlock to create a new object; +		 *	if we do so, we must try the lookup again. +		 */ + +		if (new_object == VM_OBJECT_NULL) { +			vm_object_cache_unlock(); +			new_object = vm_object_allocate(size); +			vm_object_cache_lock(); +		} else { +			/* +			 *	Lookup failed twice, and we have something +			 *	to insert; set the object. +			 */ + +			ipc_kobject_set(pager, +					(ipc_kobject_t) new_object, +					IKOT_PAGER); +			new_object = VM_OBJECT_NULL; +			must_init = TRUE; +		} +	} + +	if (internal) +		must_init = TRUE; + +	/* +	 *	It's only good if it's a VM object! +	 */ + +	object = (po == IKOT_PAGER) ? (vm_object_t) pager->ip_kobject +				    : VM_OBJECT_NULL; + +	if ((object != VM_OBJECT_NULL) && !must_init) { +		vm_object_lock(object); +		if (object->ref_count == 0) +			vm_object_cache_remove(object); +		object->ref_count++; +		vm_object_unlock(object); + +		vm_stat.hits++; +	} +	assert((object == VM_OBJECT_NULL) || (object->ref_count > 0) || +		((object->paging_in_progress != 0) && internal)); + +	vm_stat.lookups++; + +	vm_object_cache_unlock(); + +	/* +	 *	If we raced to create a vm_object but lost, let's +	 *	throw away ours. +	 */ + +	if (new_object != VM_OBJECT_NULL) +		vm_object_deallocate(new_object); + +	if (object == VM_OBJECT_NULL) +		return(object); + +	if (must_init) { +		/* +		 *	Copy the naked send right we were given. +		 */ + +		pager = ipc_port_copy_send(pager); +		if (!IP_VALID(pager)) +			panic("vm_object_enter: port died"); /* XXX */ + +		object->pager_created = TRUE; +		object->pager = pager; + +		/* +		 *	Allocate request port. +		 */ + +		object->pager_request = ipc_port_alloc_kernel(); +		if (object->pager_request == IP_NULL) +			panic("vm_object_enter: pager request alloc"); + +		ipc_kobject_set(object->pager_request, +				(ipc_kobject_t) object, +				IKOT_PAGING_REQUEST); + +		/* +		 *	Let the pager know we're using it. 
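+		 *
+		 *	[Editorial note, not part of the original source: in
+		 *	 the external case below, the manager is expected to
+		 *	 answer memory_object_init() by marking the object
+		 *	 ready on the control port, e.g. (arguments: control
+		 *	 port, object_ready, may_cache, copy strategy; shown
+		 *	 as an assumption about the external interface):
+		 *
+		 *		memory_object_set_attributes(memory_control,
+		 *			TRUE, FALSE, MEMORY_OBJECT_COPY_NONE);
+		 *
+		 *	 That reply is what eventually sets pager_ready.]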
+		 */ + +		if (internal) { +			/* acquire a naked send right for the DMM */ +			ipc_port_t DMM = memory_manager_default_reference(); + +			/* mark the object internal */ +			object->internal = TRUE; +			assert(object->temporary); + +			/* default-pager objects are ready immediately */ +			object->pager_ready = TRUE; + +			/* consumes the naked send right for DMM */ +			(void) memory_object_create(DMM, +				pager, +				object->size, +				object->pager_request, +				object->pager_name, +				PAGE_SIZE); +		} else { +			/* the object is external and not temporary */ +			object->internal = FALSE; +			object->temporary = FALSE; + +			assert(object->resident_page_count == 0); +			vm_object_external_count++; + +			/* user pager objects are not ready until marked so */ +			object->pager_ready = FALSE; + +			(void) memory_object_init(pager, +				object->pager_request, +				object->pager_name, +				PAGE_SIZE); + +		} + +		vm_object_lock(object); +		object->pager_initialized = TRUE; + +		vm_object_wakeup(object, VM_OBJECT_EVENT_INITIALIZED); +	} else { +		vm_object_lock(object); +	} +	/* +	 *	[At this point, the object must be locked] +	 */ + +	/* +	 *	Wait for the work above to be done by the first +	 *	thread to map this object. +	 */ + +	while (!object->pager_initialized) { +		vm_object_wait(	object, +				VM_OBJECT_EVENT_INITIALIZED, +				FALSE); +		vm_object_lock(object); +	} +	vm_object_unlock(object); + +	return object; +} + +/* + *	Routine:	vm_object_pager_create + *	Purpose: + *		Create a memory object for an internal object. + *	In/out conditions: + *		The object is locked on entry and exit; + *		it may be unlocked within this call. + *	Limitations: + *		Only one thread may be performing a + *		vm_object_pager_create on an object at + *		a time.  Presumably, only the pageout + *		daemon will be using this routine. + */ +void vm_object_pager_create( +	vm_object_t	object) +{ +	ipc_port_t	pager; + +	if (object->pager_created) { +		/* +		 *	Someone else got to it first... +		 *	wait for them to finish initializing +		 */ + +		while (!object->pager_initialized) { +			vm_object_wait(	object, +					VM_OBJECT_EVENT_PAGER_READY, +					FALSE); +			vm_object_lock(object); +		} +		return; +	} + +	/* +	 *	Indicate that a memory object has been assigned +	 *	before dropping the lock, to prevent a race. +	 */ + +	object->pager_created = TRUE; +		 +	/* +	 *	Prevent collapse or termination by +	 *	holding a paging reference +	 */ + +	vm_object_paging_begin(object); +	vm_object_unlock(object); + +#if	MACH_PAGEMAP +	object->existence_info = vm_external_create( +					object->size + +					object->paging_offset); +	assert((object->size + object->paging_offset) >= +		object->size); +#endif	/* MACH_PAGEMAP */ + +	/* +	 *	Create the pager, and associate with it +	 *	this object. +	 * +	 *	Note that we only make the port association +	 *	so that vm_object_enter can properly look up +	 *	the object to complete the initialization... +	 *	we do not expect any user to ever map this +	 *	object. +	 * +	 *	Since the kernel has the only rights to the +	 *	port, it's safe to install the association +	 *	without holding the cache lock. 
+	 */ + +	pager = ipc_port_alloc_kernel(); +	if (pager == IP_NULL) +		panic("vm_object_pager_create: allocate pager port"); + +	(void) ipc_port_make_send(pager); +	ipc_kobject_set(pager, (ipc_kobject_t) object, IKOT_PAGER); + +	/* +	 *	Initialize the rest of the paging stuff +	 */ + +	if (vm_object_enter(pager, object->size, TRUE) != object) +		panic("vm_object_pager_create: mismatch"); + +	/* +	 *	Drop the naked send right taken above. +	 */ + +	ipc_port_release_send(pager); + +	/* +	 *	Release the paging reference +	 */ + +	vm_object_lock(object); +	vm_object_paging_end(object); +} + +/* + *	Routine:	vm_object_remove + *	Purpose: + *		Eliminate the pager/object association + *		for this pager. + *	Conditions: + *		The object cache must be locked. + */ +void vm_object_remove( +	vm_object_t	object) +{ +	ipc_port_t port; + +	if ((port = object->pager) != IP_NULL) { +		if (ip_kotype(port) == IKOT_PAGER) +			ipc_kobject_set(port, IKO_NULL, +					IKOT_PAGER_TERMINATING); +		 else if (ip_kotype(port) != IKOT_NONE) +			panic("vm_object_remove: bad object port"); +	} +	if ((port = object->pager_request) != IP_NULL) { +		if (ip_kotype(port) == IKOT_PAGING_REQUEST) +			ipc_kobject_set(port, IKO_NULL, IKOT_NONE); +		 else if (ip_kotype(port) != IKOT_NONE) +			panic("vm_object_remove: bad request port"); +	} +	if ((port = object->pager_name) != IP_NULL) { +		if (ip_kotype(port) == IKOT_PAGING_NAME) +			ipc_kobject_set(port, IKO_NULL, IKOT_NONE); +		 else if (ip_kotype(port) != IKOT_NONE) +			panic("vm_object_remove: bad name port"); +	} +} + +/* + *	Global variables for vm_object_collapse(): + * + *		Counts for normal collapses and bypasses. + *		Debugging variables, to watch or disable collapse. + */ +long	object_collapses = 0; +long	object_bypasses  = 0; + +int		vm_object_collapse_debug = 0; +boolean_t	vm_object_collapse_allowed = TRUE; +boolean_t	vm_object_collapse_bypass_allowed = TRUE; + +/* + *	vm_object_collapse: + * + *	Collapse an object with the object backing it. + *	Pages in the backing object are moved into the + *	parent, and the backing object is deallocated. + * + *	Requires that the object be locked and the page + *	queues be unlocked.  May unlock/relock the object, + *	so the caller should hold a reference for the object. + */ +void vm_object_collapse( +	vm_object_t	object) +{ +	vm_object_t	backing_object; +	vm_offset_t	backing_offset; +	vm_size_t	size; +	vm_offset_t	new_offset; +	vm_page_t	p, pp; +	ipc_port_t 	old_name_port; + +	if (!vm_object_collapse_allowed) +		return; + +	while (TRUE) { +		/* +		 *	Verify that the conditions are right for collapse: +		 * +		 *	The object exists and no pages in it are currently +		 *	being paged out (or have ever been paged out). +		 * +		 *	This check is probably overkill -- if a memory +		 *	object has not been created, the fault handler +		 *	shouldn't release the object lock while paging +		 *	is in progress or absent pages exist. +		 */ +		if (object == VM_OBJECT_NULL || +		    object->pager_created || +		    object->paging_in_progress != 0 || +		    object->absent_count != 0) +			return; + +		/* +		 *		There is a backing object, and +		 */ +	 +		if ((backing_object = object->shadow) == VM_OBJECT_NULL) +			return; +	 +		vm_object_lock(backing_object); +		/* +		 *	... +		 *		The backing object is not read_only, +		 *		and no pages in the backing object are +		 *		currently being paged out. +		 *		The backing object is internal. +		 * +		 *	XXX It may be sufficient for the backing +		 *	XXX object to be temporary. 
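+		 *
+		 *	[Editorial summary, not part of the original source:
+		 *	 taken together, the tests above and below only let
+		 *	 the collapse/bypass logic proceed when, roughly:
+		 *
+		 *		!object->pager_created &&
+		 *		object->paging_in_progress == 0 &&
+		 *		object->absent_count == 0 &&
+		 *		backing_object != VM_OBJECT_NULL &&
+		 *		backing_object->internal &&
+		 *		backing_object->paging_in_progress == 0 &&
+		 *		(backing_object->shadow == VM_OBJECT_NULL ||
+		 *		 backing_object->shadow->copy == VM_OBJECT_NULL)
+		 *	]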
+		 */ +	 +		if (!backing_object->internal || +		    backing_object->paging_in_progress != 0) { +			vm_object_unlock(backing_object); +			return; +		} +	 +		/* +		 *	The backing object can't be a copy-object: +		 *	the shadow_offset for the copy-object must stay +		 *	as 0.  Furthermore (for the 'we have all the +		 *	pages' case), if we bypass backing_object and +		 *	just shadow the next object in the chain, old +		 *	pages from that object would then have to be copied +		 *	BOTH into the (former) backing_object and into the +		 *	parent object. +		 */ +		if (backing_object->shadow != VM_OBJECT_NULL && +		    backing_object->shadow->copy != VM_OBJECT_NULL) { +			vm_object_unlock(backing_object); +			return; +		} + +		/* +		 *	We know that we can either collapse the backing +		 *	object (if the parent is the only reference to +		 *	it) or (perhaps) remove the parent's reference +		 *	to it. +		 */ + +		backing_offset = object->shadow_offset; +		size = object->size; + +		/* +		 *	If there is exactly one reference to the backing +		 *	object, we can collapse it into the parent. +		 */ +	 +		if (backing_object->ref_count == 1) { +			if (!vm_object_cache_lock_try()) { +				vm_object_unlock(backing_object); +				return; +			} + +			/* +			 *	We can collapse the backing object. +			 * +			 *	Move all in-memory pages from backing_object +			 *	to the parent.  Pages that have been paged out +			 *	will be overwritten by any of the parent's +			 *	pages that shadow them. +			 */ + +			while (!queue_empty(&backing_object->memq)) { + +				p = (vm_page_t) +					queue_first(&backing_object->memq); + +				new_offset = (p->offset - backing_offset); + +				assert(!p->busy || p->absent); + +				/* +				 *	If the parent has a page here, or if +				 *	this page falls outside the parent, +				 *	dispose of it. +				 * +				 *	Otherwise, move it as planned. +				 */ + +				if (p->offset < backing_offset || +				    new_offset >= size) { +					VM_PAGE_FREE(p); +				} else { +				    pp = vm_page_lookup(object, new_offset); +				    if (pp != VM_PAGE_NULL && !pp->absent) { +					/* +					 *	Parent object has a real page. +					 *	Throw away the backing object's +					 *	page. +					 */ +					VM_PAGE_FREE(p); +				    } +				    else { +					assert(pp == VM_PAGE_NULL || ! +					       "vm_object_collapse: bad case"); + +					/* +					 *	Parent now has no page. +					 *	Move the backing object's page up. +					 */ +					vm_page_rename(p, object, new_offset); +				    } +				} +			} + +			/* +			 *	Move the pager from backing_object to object. +			 * +			 *	XXX We're only using part of the paging space +			 *	for keeps now... we ought to discard the +			 *	unused portion. +			 */ + +			switch (vm_object_collapse_debug) { +			    case 0: +			    	break; +			    case 1: +				if ((backing_object->pager == IP_NULL) && +				    (backing_object->pager_request == +				     PAGER_REQUEST_NULL)) +				    break; +				/* Fall through to... 
*/ + +			    default: +				printf("vm_object_collapse: %p (pager %p, request %p) up to %p\n", +					backing_object, backing_object->pager, backing_object->pager_request, +					object); +				if (vm_object_collapse_debug > 2) +				    SoftDebugger("vm_object_collapse"); +			} + +			object->pager = backing_object->pager; +			if (object->pager != IP_NULL) +				ipc_kobject_set(object->pager, +						(ipc_kobject_t) object, +						IKOT_PAGER); +			object->pager_initialized = backing_object->pager_initialized; +			object->pager_ready = backing_object->pager_ready; +			object->pager_created = backing_object->pager_created; + +			object->pager_request = backing_object->pager_request; +			if (object->pager_request != IP_NULL) +				ipc_kobject_set(object->pager_request, +						(ipc_kobject_t) object, +						IKOT_PAGING_REQUEST); +			old_name_port = object->pager_name; +			if (old_name_port != IP_NULL) +				ipc_kobject_set(old_name_port, +						IKO_NULL, IKOT_NONE); +			object->pager_name = backing_object->pager_name; +			if (object->pager_name != IP_NULL) +				ipc_kobject_set(object->pager_name, +						(ipc_kobject_t) object, +						IKOT_PAGING_NAME); + +			vm_object_cache_unlock(); + +			/* +			 * If there is no pager, leave paging-offset alone. +			 */ +			if (object->pager != IP_NULL) +				object->paging_offset = +					backing_object->paging_offset + +						backing_offset; + +#if	MACH_PAGEMAP +			assert(object->existence_info == VM_EXTERNAL_NULL); +			object->existence_info = backing_object->existence_info; +#endif	/* MACH_PAGEMAP */ + +			/* +			 *	Object now shadows whatever backing_object did. +			 *	Note that the reference to backing_object->shadow +			 *	moves from within backing_object to within object. +			 */ + +			object->shadow = backing_object->shadow; +			object->shadow_offset += backing_object->shadow_offset; +			if (object->shadow != VM_OBJECT_NULL && +			    object->shadow->copy != VM_OBJECT_NULL) { +				panic("vm_object_collapse: we collapsed a copy-object!"); +			} +			/* +			 *	Discard backing_object. +			 * +			 *	Since the backing object has no pages, no +			 *	pager left, and no object references within it, +			 *	all that is necessary is to dispose of it. +			 */ + +			assert( +				(backing_object->ref_count == 1) && +				(backing_object->resident_page_count == 0) && +				(backing_object->paging_in_progress == 0) +			); + +			assert(backing_object->alive); +			assert(!backing_object->cached); +			backing_object->alive = FALSE; +			vm_object_unlock(backing_object); + +			vm_object_unlock(object); +			if (old_name_port != IP_NULL) +				ipc_port_dealloc_kernel(old_name_port); +			kmem_cache_free(&vm_object_cache, (vm_offset_t) backing_object); +			vm_object_lock(object); + +			object_collapses++; +		} +		else { +			if (!vm_object_collapse_bypass_allowed) { +				vm_object_unlock(backing_object); +				return; +			} + +			/* +			 *	If all of the pages in the backing object are +			 *	shadowed by the parent object, the parent +			 *	object no longer has to shadow the backing +			 *	object; it can shadow the next one in the +			 *	chain. +			 * +			 *	The backing object must not be paged out - we'd +			 *	have to check all of the paged-out pages, as +			 *	well. +			 */ + +			if (backing_object->pager_created) { +				vm_object_unlock(backing_object); +				return; +			} + +			/* +			 *	Should have a check for a 'small' number +			 *	of pages here. 
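+			 *
+			 *	[Editorial sketch, not part of the original
+			 *	 source: such a check might look like this;
+			 *	 the threshold name is hypothetical.]
+			 *
+			 *		if (backing_object->resident_page_count >
+			 *		    VM_OBJECT_BYPASS_MAX_PAGES) {
+			 *			vm_object_unlock(backing_object);
+			 *			return;
+			 *		}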
+			 */ + +			queue_iterate(&backing_object->memq, p, +				      vm_page_t, listq) +			{ +				new_offset = (p->offset - backing_offset); + +				/* +				 *	If the parent has a page here, or if +				 *	this page falls outside the parent, +				 *	keep going. +				 * +				 *	Otherwise, the backing_object must be +				 *	left in the chain. +				 */ + +				if (p->offset >= backing_offset && +				    new_offset <= size && +				    (pp = vm_page_lookup(object, new_offset)) +				      == VM_PAGE_NULL) { +					/* +					 *	Page still needed. +					 *	Can't go any further. +					 */ +					vm_object_unlock(backing_object); +					return; +				} +			} + +			/* +			 *	Make the parent shadow the next object +			 *	in the chain.  Deallocating backing_object +			 *	will not remove it, since its reference +			 *	count is at least 2. +			 */ + +			vm_object_reference(object->shadow = backing_object->shadow); +			object->shadow_offset += backing_object->shadow_offset; + +			/* +			 *	Backing object might have had a copy pointer +			 *	to us.  If it did, clear it.  +			 */ +			if (backing_object->copy == object) +				backing_object->copy = VM_OBJECT_NULL; + +			/* +			 *	Drop the reference count on backing_object. +			 *	Since its ref_count was at least 2, it +			 *	will not vanish; so we don't need to call +			 *	vm_object_deallocate. +			 */ +			backing_object->ref_count--; +			assert(backing_object->ref_count > 0); +			vm_object_unlock(backing_object); + +			object_bypasses ++; + +		} + +		/* +		 *	Try again with this object's new backing object. +		 */ +	} +} + +/* + *	Routine:	vm_object_page_remove: [internal] + *	Purpose: + *		Removes all physical pages in the specified + *		object range from the object's list of pages. + * + *	In/out conditions: + *		The object must be locked. + */ +unsigned int vm_object_page_remove_lookup = 0; +unsigned int vm_object_page_remove_iterate = 0; + +void vm_object_page_remove( +	vm_object_t	object, +	vm_offset_t	start, +	vm_offset_t	end) +{ +	vm_page_t	p, next; + +	/* +	 *	One and two page removals are most popular. +	 *	The factor of 16 here is somewhat arbitrary. +	 *	It balances vm_object_lookup vs iteration. +	 */ + +	if (atop(end - start) < object->resident_page_count/16) { +		vm_object_page_remove_lookup++; + +		for (; start < end; start += PAGE_SIZE) { +			p = vm_page_lookup(object, start); +			if (p != VM_PAGE_NULL) { +				if (!p->fictitious) +					pmap_page_protect(p->phys_addr, +							  VM_PROT_NONE); +				VM_PAGE_FREE(p); +			} +		} +	} else { +		vm_object_page_remove_iterate++; + +		p = (vm_page_t) queue_first(&object->memq); +		while (!queue_end(&object->memq, (queue_entry_t) p)) { +			next = (vm_page_t) queue_next(&p->listq); +			if ((start <= p->offset) && (p->offset < end)) { +				if (!p->fictitious) +				    pmap_page_protect(p->phys_addr, +						      VM_PROT_NONE); +				VM_PAGE_FREE(p); +			} +			p = next; +		} +	} +} + +/* + *	Routine:	vm_object_coalesce + *	Purpose: + *		Tries to coalesce two objects backing up adjoining + *		regions	of memory into a single object. + * + *		NOTE: Only works at the moment if one of the objects + *		is NULL	or if the objects are the same - otherwise, + *		which object do we lock first? + *	Returns: + *		TRUE	if objects have been coalesced. + *		FALSE	the objects could not be coalesced. 
+ *	Parameters: + *		prev_object	First object to coalesce + *		prev_offset	Offset into prev_object + *		next_object	Second object into coalesce + *		next_offset	Offset into next_object + * + *		prev_size	Size of reference to prev_object + *		next_size	Size of reference to next_object + * + *		new_object	Resulting colesced object + *		new_offset	Offset into the resulting object + *	Conditions: + *		The objects must *not* be locked. + * + *		If the objects are coalesced successfully, the caller's + *		references for both objects are consumed, and the caller + *		gains a reference for the new object. + */ + +boolean_t vm_object_coalesce( +	vm_object_t 	prev_object, +	vm_object_t	next_object, +	vm_offset_t	prev_offset, +	vm_offset_t	next_offset, +	vm_size_t	prev_size, +	vm_size_t	next_size, +	vm_object_t	*new_object,	/* OUT */ +	vm_offset_t	*new_offset)	/* OUT */ +{ +	vm_object_t	object; +	vm_size_t	newsize; + +	if (prev_object == next_object) { +		/* +		 *	If neither object actually exists, +		 *	the offsets don't matter. +		 */ +		if (prev_object == VM_OBJECT_NULL) { +			*new_object = VM_OBJECT_NULL; +			*new_offset = 0; +			return TRUE; +		} + +		if (prev_offset + prev_size == next_offset) { +			*new_object = prev_object; +			*new_offset = prev_offset; +			/* +			 *	Deallocate one of the two references. +			 */ +			vm_object_deallocate(prev_object); +			return TRUE; +		} + +		return FALSE; +	} + +	if (next_object != VM_OBJECT_NULL) { +		/* +		 *	Don't know how to merge two different +		 *	objects yet. +		 */ +		if (prev_object != VM_OBJECT_NULL) +			return FALSE; + +		object = next_object; +	} else { +		object = prev_object; +	} + +	vm_object_lock(object); + +	/* +	 *	Try to collapse the object first +	 */ +	vm_object_collapse(object); + +	/* +	 *	Can't coalesce if pages not mapped to +	 *	the object may be in use anyway: +	 *	. more than one reference +	 *	. paged out +	 *	. shadows another object +	 *	. has a copy elsewhere +	 *	. paging references (pages might be in page-list) +	 */ + +	if ((object->ref_count > 1) || +	    object->pager_created || +	    object->used_for_pageout || +	    (object->shadow != VM_OBJECT_NULL) || +	    (object->copy != VM_OBJECT_NULL) || +	    (object->paging_in_progress != 0)) { +		vm_object_unlock(object); +		return FALSE; +	} + +	if (object == prev_object) { +		/* +		 *	Remove any pages that may still be in +		 *	the object from a previous deallocation. +		 */ +		vm_object_page_remove(object, +			prev_offset + prev_size, +			prev_offset + prev_size + next_size); +		/* +		 *	Extend the object if necessary. +		 */ +		newsize = prev_offset + prev_size + next_size; +		if (newsize > object->size) +			object->size = newsize; + +		*new_offset = prev_offset; +	} else { +		/* +		 *	Check if we have enough space in the object +		 *	offset space to insert the new mapping before +		 *	the existing one. +		 */ +		if (next_offset < prev_size) { +			vm_object_unlock(object); +			return FALSE; +		} +		/* +		 *	Remove any pages that may still be in +		 *	the object from a previous deallocation. +		 */ +		vm_object_page_remove(object, +			next_offset - prev_size, +			next_offset); + +		*new_offset = next_offset - prev_size; +	} + +	vm_object_unlock(object); +	*new_object = object; +	return TRUE; +} + +vm_object_t	vm_object_request_object( +	ipc_port_t	p) +{ +	return vm_object_lookup(p); +} + +/* + *	Routine:	vm_object_name + *	Purpose: + *		Returns a naked send right to the "name" port associated + *		with this object. 
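+ *
+ *	[Editorial aside on vm_object_coalesce() above, not part of the
+ *	 original source: a typical call when extending an existing
+ *	 allocation looks roughly like this; "entry", "new_end",
+ *	 "new_object" and "new_offset" are hypothetical map-level names.]
+ *
+ *		if (vm_object_coalesce(entry->object.vm_object,
+ *				VM_OBJECT_NULL,
+ *				entry->offset,
+ *				(vm_offset_t) 0,
+ *				(vm_size_t) (entry->vme_end - entry->vme_start),
+ *				(vm_size_t) (new_end - entry->vme_end),
+ *				&new_object, &new_offset)) {
+ *			entry->object.vm_object = new_object;
+ *			entry->offset = new_offset;
+ *			entry->vme_end = new_end;
+ *		}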
+ */ +ipc_port_t	vm_object_name( +	vm_object_t	object) +{ +	ipc_port_t	p; + +	if (object == VM_OBJECT_NULL) +		return IP_NULL; + +	vm_object_lock(object); + +	while (object->shadow != VM_OBJECT_NULL) { +		vm_object_t	new_object = object->shadow; +		vm_object_lock(new_object); +		vm_object_unlock(object); +		object = new_object; +	} + +	p = object->pager_name; +	if (p != IP_NULL) +		p = ipc_port_make_send(p); +	vm_object_unlock(object); + +	return p; +} + +/* + *	Attach a set of physical pages to an object, so that they can + *	be mapped by mapping the object.  Typically used to map IO memory. + * + *	The mapping function and its private data are used to obtain the + *	physical addresses for each page to be mapped. + */ +kern_return_t +vm_object_page_map( +	vm_object_t	object, +	vm_offset_t	offset, +	vm_size_t	size, +	phys_addr_t	(*map_fn)(void *, vm_offset_t), +	void *		map_fn_data)	/* private to map_fn */ +{ +	int	num_pages; +	int	i; +	vm_page_t	m; +	vm_page_t	old_page; +	phys_addr_t	addr; + +	num_pages = atop(size); + +	for (i = 0; i < num_pages; i++, offset += PAGE_SIZE) { + +	    addr = (*map_fn)(map_fn_data, offset); +	    if (addr == vm_page_fictitious_addr) +		return KERN_NO_ACCESS; + +	    while ((m = vm_page_grab_fictitious()) == VM_PAGE_NULL) +		vm_page_more_fictitious(); + +	    vm_object_lock(object); +	    if ((old_page = vm_page_lookup(object, offset)) +			!= VM_PAGE_NULL) +	    { +		VM_PAGE_FREE(old_page); +	    } + +	    vm_page_init(m); +	    m->phys_addr = addr; +	    m->private = TRUE;		/* don`t free page */ +	    m->wire_count = 1; +	    vm_page_lock_queues(); +	    vm_page_insert(m, object, offset); +	    vm_page_unlock_queues(); + +	    PAGE_WAKEUP_DONE(m); +	    vm_object_unlock(object); +	} +	return KERN_SUCCESS; +} + + +#if	MACH_KDB +#include <vm/vm_print.h> +#define printf	kdbprintf + +boolean_t	vm_object_print_pages = FALSE; + +/* + *	vm_object_print:	[ debug ] + */ +void vm_object_print( +	vm_object_t	object) +{ +	vm_page_t	p; + +	int 		count; + +	if (object == VM_OBJECT_NULL) +		return; + +	iprintf("Object 0x%X: size=0x%X, %d references", +		(vm_offset_t) object, (vm_offset_t) object->size, +		object->ref_count); +	printf("\n"); +	iprintf("%lu resident pages,", object->resident_page_count); +	 printf(" %d absent pages,", object->absent_count); +	 printf(" %d paging ops\n", object->paging_in_progress); +	indent += 1; +	iprintf("memory object=0x%X (offset=0x%X),", +		 (vm_offset_t) object->pager, (vm_offset_t) object->paging_offset); +	 printf("control=0x%X, name=0x%X\n", +	 	(vm_offset_t) object->pager_request, (vm_offset_t) object->pager_name); +	iprintf("%s%s", +	 	object->pager_ready ? " ready" : "", +	 	object->pager_created ? " created" : ""); +	 printf("%s,%s ", +	 	object->pager_initialized ? "" : "uninitialized", +		object->temporary ? "temporary" : "permanent"); +	 printf("%s%s,", +		object->internal ? "internal" : "external", +	 	object->can_persist ? 
" cacheable" : ""); +	 printf("copy_strategy=%d\n", (vm_offset_t)object->copy_strategy); +	iprintf("shadow=0x%X (offset=0x%X),", +		(vm_offset_t) object->shadow, (vm_offset_t) object->shadow_offset); +	 printf("copy=0x%X\n", (vm_offset_t) object->copy); + +	indent += 1; + +	if (vm_object_print_pages) { +		count = 0; +		p = (vm_page_t) queue_first(&object->memq); +		while (!queue_end(&object->memq, (queue_entry_t) p)) { +			if (count == 0) iprintf("memory:="); +			else if (count == 4) {printf("\n"); iprintf(" ..."); count = 0;} +			else printf(","); +			count++; + +			printf("(off=0x%X,page=0x%X)", p->offset, (vm_offset_t) p); +			p = (vm_page_t) queue_next(&p->listq); +		} +		if (count != 0) +			printf("\n"); +	} +	indent -= 2; +} + +#endif	/* MACH_KDB */ diff --git a/vm/vm_object.h b/vm/vm_object.h new file mode 100644 index 0000000..9c17541 --- /dev/null +++ b/vm/vm_object.h @@ -0,0 +1,415 @@ +/* + * Mach Operating System + * Copyright (c) 1993-1987 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	File:	vm_object.h + *	Author:	Avadis Tevanian, Jr., Michael Wayne Young + *	Date:	1985 + * + *	Virtual memory object module definitions. + */ + +#ifndef	_VM_VM_OBJECT_H_ +#define _VM_VM_OBJECT_H_ + +#include <sys/types.h> +#include <mach/kern_return.h> +#include <mach/boolean.h> +#include <mach/memory_object.h> +#include <mach/port.h> +#include <mach/vm_prot.h> +#include <mach/machine/vm_types.h> +#include <kern/queue.h> +#include <kern/lock.h> +#include <kern/assert.h> +#include <kern/debug.h> +#include <kern/macros.h> +#include <vm/pmap.h> +#include <ipc/ipc_types.h> + +#if	MACH_PAGEMAP +#include <vm/vm_external.h> +#endif	/* MACH_PAGEMAP */ + +typedef struct ipc_port *	pager_request_t; +#define	PAGER_REQUEST_NULL	((pager_request_t) 0) + +/* + *	We use "struct ipc_port *" instead of "ipc_port_t" + *	to avoid include file circularities. 
+ */ + +struct vm_object { +	queue_head_t		memq;		/* Resident memory */ +	decl_simple_lock_data(,	Lock)		/* Synchronization */ +#if	VM_OBJECT_DEBUG +	thread_t		LockHolder;	/* Thread holding Lock */ +#endif	/* VM_OBJECT_DEBUG */ +	vm_size_t		size;		/* Object size (only valid +						 * if internal) +						 */ + +	int			ref_count;	/* Number of references */ +	unsigned long		resident_page_count; +						/* number of resident pages */ + +	struct vm_object	*copy;		/* Object that should receive +						 * a copy of my changed pages +						 */ +	struct vm_object	*shadow;	/* My shadow */ +	vm_offset_t		shadow_offset;	/* Offset into shadow */ + +	struct ipc_port		*pager;		/* Where to get data */ +	vm_offset_t		paging_offset;	/* Offset into memory object */ +	pager_request_t		pager_request;	/* Where data comes back */ +	struct ipc_port		*pager_name;	/* How to identify region */ + +	memory_object_copy_strategy_t +				copy_strategy;	/* How to handle data copy */ + +	unsigned int +				absent_count;	/* The number of pages that +						 * have been requested but +						 * not filled.  That is, the +						 * number of pages for which +						 * the "absent" attribute is +						 * asserted. +						 */ + +	unsigned int /* boolean_t array */ +				all_wanted;	/* Bit array of "want to be +						 * awakened" notations.  See +						 * VM_OBJECT_EVENT_* items +						 * below +						 */ + +	unsigned int +				paging_in_progress:16, +						/* The memory object ports are +						 * being used (e.g., for pagein +						 * or pageout) -- don't change any +						 * of these fields (i.e., don't +						 * collapse, destroy or terminate) +						 */ +	/* boolean_t */		used_for_pageout:1,/* The object carries data sent to +						 * a memory manager, which signals +						 * it's done by releasing memory. +						 * This flag prevents coalescing so +						 * that unmapping memory immediately +						 * results in object termination. +						 */ +	/* boolean_t */		pager_created:1,/* Has pager ever been created? */ +	/* boolean_t */		pager_initialized:1,/* Are fields ready to use? */ +	/* boolean_t */		pager_ready:1,	/* Will manager take requests? */ + +	/* boolean_t */		can_persist:1,	/* The kernel may keep the data +						 * for this object (and rights to +						 * the memory object) after all +						 * address map references are +						 * deallocated? +						 */ +	/* boolean_t */		internal:1,	/* Created by the kernel (and +						 * therefore, managed by the +						 * default memory manger) +						 */ +	/* boolean_t */		temporary:1,	/* Permanent objects may be changed +						 * externally by the memory manager, +						 * and changes made in memory must +						 * be reflected back to the memory +						 * manager.  Temporary objects lack +						 * both of these characteristics. +						 */ +	/* boolean_t */		alive:1,	/* Not yet terminated (debug) */ +	/* boolean_t */		lock_in_progress : 1, +						/* Is a multi-page lock +						 * request in progress? +						 */ +	/* boolean_t */		lock_restart : 1, +						/* Should lock request in +						 * progress restart search? 
+						 */ +	/* boolean_t */		use_shared_copy : 1,/* Use shared (i.e., +						 * delayed) copy on write */ +	/* boolean_t */		shadowed: 1,	/* Shadow may exist */ + +	/* boolean_t */		cached: 1;	/* Object is cached */ +	queue_chain_t		cached_list;	/* Attachment point for the list +						 * of objects cached as a result +						 * of their can_persist value +						 */ +	vm_offset_t		last_alloc;	/* last allocation offset */ +#if	MACH_PAGEMAP +	vm_external_t		existence_info; +#endif	/* MACH_PAGEMAP */ +}; + +extern +vm_object_t	kernel_object;		/* the single kernel object */ + +/* + *	Declare procedures that operate on VM objects. + */ + +extern void		vm_object_bootstrap(void); +extern void		vm_object_init(void); +extern void		vm_object_collect(vm_object_t); +extern void		vm_object_terminate(vm_object_t); +extern vm_object_t	vm_object_allocate(vm_size_t); +extern void		vm_object_reference(vm_object_t); +extern void		vm_object_deallocate(vm_object_t); +extern void		vm_object_pmap_protect( +	vm_object_t	object, +	vm_offset_t	offset, +	vm_size_t	size, +	pmap_t		pmap, +	vm_offset_t	pmap_start, +	vm_prot_t	prot); +extern void		vm_object_pmap_remove( +	vm_object_t	object, +	vm_offset_t	start, +	vm_offset_t	end); +extern void		vm_object_page_remove( +	vm_object_t	object, +	vm_offset_t	start, +	vm_offset_t	end); +extern void		vm_object_shadow( +	vm_object_t	*object,	/* in/out */ +	vm_offset_t	*offset,	/* in/out */ +	vm_size_t	length); +extern void		vm_object_collapse(vm_object_t); +extern vm_object_t	vm_object_lookup(struct ipc_port *); +extern vm_object_t	vm_object_lookup_name(struct ipc_port *); +extern struct ipc_port	*vm_object_name(vm_object_t); +extern void		vm_object_remove(vm_object_t); + +extern boolean_t	vm_object_copy_temporary( +	vm_object_t	*_object,		/* in/out */ +	vm_offset_t	*_offset,		/* in/out */ +	boolean_t	*_src_needs_copy,	/* out */ +	boolean_t	*_dst_needs_copy);	/* out */ +extern kern_return_t	vm_object_copy_strategically( +	vm_object_t	src_object, +	vm_offset_t	src_offset, +	vm_size_t	size, +	vm_object_t	*dst_object,		/* out */ +	vm_offset_t	*dst_offset,		/* out */ +	boolean_t	*dst_needs_copy);	/* out */ +extern kern_return_t	vm_object_copy_slowly( +	vm_object_t	src_object, +	vm_offset_t	src_offset, +	vm_size_t	size, +	boolean_t	interruptible, +	vm_object_t	*_result_object);	/* out */ + +extern vm_object_t	vm_object_enter( +	struct ipc_port	*pager, +	vm_size_t	size, +	boolean_t	internal); +extern void		vm_object_pager_create( +	vm_object_t	object); +extern void		vm_object_destroy( +	struct ipc_port	*pager); + +extern kern_return_t vm_object_page_map( +	vm_object_t, +	vm_offset_t, +	vm_size_t, +	phys_addr_t	(*)(void *, vm_offset_t), +	void *); + +extern vm_object_t	vm_object_request_object(struct ipc_port *); + +extern boolean_t vm_object_coalesce( +   vm_object_t prev_object, +   vm_object_t next_object, +   vm_offset_t prev_offset, +   vm_offset_t next_offset, +   vm_size_t   prev_size, +   vm_size_t   next_size, +   vm_object_t *new_object,	/* OUT */ +   vm_offset_t *new_offset);	/* OUT */ + +extern void vm_object_pager_wakeup(ipc_port_t  pager); + +void memory_object_release( +	ipc_port_t	pager, +	pager_request_t	pager_request, +	ipc_port_t	pager_name); + +void vm_object_deactivate_pages(vm_object_t); + +vm_object_t vm_object_copy_delayed( +	vm_object_t	src_object); + +/* + *	Event waiting handling + */ + +#define	VM_OBJECT_EVENT_INITIALIZED		0 +#define	VM_OBJECT_EVENT_PAGER_READY		1 +#define	VM_OBJECT_EVENT_PAGING_IN_PROGRESS	2 +#define	VM_OBJECT_EVENT_ABSENT_COUNT		3 +#define	
VM_OBJECT_EVENT_LOCK_IN_PROGRESS	4 + +#define	vm_object_wait(object, event, interruptible)			\ +	MACRO_BEGIN							\ +	(object)->all_wanted |= 1 << (event);				\ +	vm_object_sleep(((vm_offset_t) object) + (event),		\ +			(object),					\ +			(interruptible));				\ +	MACRO_END + +#define	vm_object_assert_wait(object, event, interruptible)		\ +	MACRO_BEGIN							\ +	(object)->all_wanted |= 1 << (event);				\ +	assert_wait((event_t)(((vm_offset_t) object) + (event)), (interruptible));	\ +	MACRO_END + +#define	vm_object_wakeup(object, event)					\ +	MACRO_BEGIN							\ +	if ((object)->all_wanted & (1 << (event)))			\ +		thread_wakeup((event_t)(((vm_offset_t) object) + (event)));	\ +	(object)->all_wanted &= ~(1 << (event));			\ +	MACRO_END + +/* + *	Routines implemented as macros + */ + +#define vm_object_collectable(object)					\ +	(((object)->ref_count == 0)					\ +	&& ((object)->resident_page_count == 0)) + +#define	vm_object_paging_begin(object) 					\ +	((object)->paging_in_progress++) + +#define	vm_object_paging_end(object) 					\ +	MACRO_BEGIN							\ +	assert((object)->paging_in_progress != 0);			\ +	if (--(object)->paging_in_progress == 0) {			\ +		vm_object_wakeup(object,				\ +			VM_OBJECT_EVENT_PAGING_IN_PROGRESS);		\ +	}								\ +	MACRO_END + +#define	vm_object_paging_wait(object, interruptible)			\ +	MACRO_BEGIN							\ +	while ((object)->paging_in_progress != 0) {			\ +		vm_object_wait(	(object),				\ +				VM_OBJECT_EVENT_PAGING_IN_PROGRESS,	\ +				(interruptible));			\ +		vm_object_lock(object);					\ +									\ +	  /*XXX if ((interruptible) &&	*/				\ +	    /*XXX (current_thread()->wait_result != THREAD_AWAKENED))*/ \ +		  /*XXX break; */					\ +	}								\ +	MACRO_END + +#define	vm_object_absent_assert_wait(object, interruptible)		\ +	MACRO_BEGIN							\ +	vm_object_assert_wait(	(object),				\ +			VM_OBJECT_EVENT_ABSENT_COUNT,			\ +			(interruptible));				\ +	MACRO_END + + +#define	vm_object_absent_release(object)				\ +	MACRO_BEGIN							\ +	(object)->absent_count--;					\ +	vm_object_wakeup((object),					\ +			 VM_OBJECT_EVENT_ABSENT_COUNT);			\ +	MACRO_END + +/* + *	Object locking macros (with and without debugging) + */ + +#if	VM_OBJECT_DEBUG +#define vm_object_lock_init(object) \ +MACRO_BEGIN \ +	simple_lock_init(&(object)->Lock); \ +	(object)->LockHolder = 0; \ +MACRO_END +#define vm_object_lock(object) \ +MACRO_BEGIN \ +	simple_lock(&(object)->Lock); \ +	(object)->LockHolder = current_thread(); \ +MACRO_END +#define vm_object_unlock(object) \ +MACRO_BEGIN \ +	if ((object)->LockHolder != current_thread()) \ +	    panic("vm_object_unlock 0x%x", (object)); \ +	(object)->LockHolder = 0; \ +	simple_unlock(&(object)->Lock); \ +MACRO_END +#define vm_object_lock_try(object) \ +	(simple_lock_try(&(object)->Lock) \ +	    ? 
( ((object)->LockHolder = current_thread()) , TRUE) \ +	    : FALSE) +#define vm_object_sleep(event, object, interruptible) \ +MACRO_BEGIN \ +	if ((object)->LockHolder != current_thread()) \ +	    panic("vm_object_sleep %#x", (object)); \ +	(object)->LockHolder = 0; \ +	thread_sleep((event_t)(event), simple_lock_addr((object)->Lock), \ +		(interruptible)); \ +MACRO_END +#define	vm_object_lock_taken(object)	\ +		((object)->LockHolder == current_thread()) +#else	/* VM_OBJECT_DEBUG */ +#define vm_object_lock_init(object)	simple_lock_init(&(object)->Lock) +#define vm_object_lock(object)		simple_lock(&(object)->Lock) +#define vm_object_unlock(object)	simple_unlock(&(object)->Lock) +#define vm_object_lock_try(object)	simple_lock_try(&(object)->Lock) +#define vm_object_sleep(event, object, interruptible)			\ +		thread_sleep((event_t)(event), simple_lock_addr((object)->Lock), \ +			     (interruptible)) +#define	vm_object_lock_taken(object)	simple_lock_taken(&(object)->Lock) +#endif	/* VM_OBJECT_DEBUG */ + +/* + *	Page cache accounting. + * + *	The page queues must be locked when changing these counters. + */ +extern int	vm_object_external_count; +extern int	vm_object_external_pages; + +/* Add a reference to a locked VM object. */ +static inline int +vm_object_reference_locked (vm_object_t obj) +{ +  return (++obj->ref_count); +} + +/* Remove a reference from a locked VM object. */ +static inline int +vm_object_unreference_locked (vm_object_t obj) +{ +  return (--obj->ref_count); +} + +#endif	/* _VM_VM_OBJECT_H_ */ diff --git a/vm/vm_page.c b/vm/vm_page.c new file mode 100644 index 0000000..04decbb --- /dev/null +++ b/vm/vm_page.c @@ -0,0 +1,2164 @@ +/* + * Copyright (c) 2010-2014 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + * + * + * This implementation uses the binary buddy system to manage its heap. + * Descriptions of the buddy system can be found in the following works : + * - "UNIX Internals: The New Frontiers", by Uresh Vahalia. + * - "Dynamic Storage Allocation: A Survey and Critical Review", + *    by Paul R. Wilson, Mark S. Johnstone, Michael Neely, and David Boles. + * + * In addition, this allocator uses per-CPU pools of pages for order 0 + * (i.e. single page) allocations. These pools act as caches (but are named + * differently to avoid confusion with CPU caches) that reduce contention on + * multiprocessor systems. When a pool is empty and cannot provide a page, + * it is filled by transferring multiple pages from the backend buddy system. + * The symmetric case is handled likewise. + * + * TODO Limit number of dirty pages, block allocations above a top limit. 
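+ *
+ * As a purely illustrative example (assuming the usual 4 KiB page size):
+ * a free order-2 block (four pages, 16 KiB) starting at physical address
+ * 0x10000 has its buddy at 0x10000 ^ 0x4000 = 0x14000; when both blocks
+ * are free, vm_page_seg_free_to_buddy() merges them into a single
+ * order-3 block starting at 0x10000.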
+ */ + +#include <string.h> +#include <kern/assert.h> +#include <kern/counters.h> +#include <kern/cpu_number.h> +#include <kern/debug.h> +#include <kern/list.h> +#include <kern/lock.h> +#include <kern/macros.h> +#include <kern/printf.h> +#include <kern/thread.h> +#include <mach/vm_param.h> +#include <machine/pmap.h> +#include <sys/types.h> +#include <vm/memory_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> + +#define DEBUG 0 + +#define __init +#define __initdata +#define __read_mostly + +#define thread_pin() +#define thread_unpin() + +/* + * Number of free block lists per segment. + */ +#define VM_PAGE_NR_FREE_LISTS 11 + +/* + * The size of a CPU pool is computed by dividing the number of pages in its + * containing segment by this value. + */ +#define VM_PAGE_CPU_POOL_RATIO 1024 + +/* + * Maximum number of pages in a CPU pool. + */ +#define VM_PAGE_CPU_POOL_MAX_SIZE 128 + +/* + * The transfer size of a CPU pool is computed by dividing the pool size by + * this value. + */ +#define VM_PAGE_CPU_POOL_TRANSFER_RATIO 2 + +/* + * Per-processor cache of pages. + */ +struct vm_page_cpu_pool { +    simple_lock_data_t lock; +    int size; +    int transfer_size; +    int nr_pages; +    struct list pages; +} __aligned(CPU_L1_SIZE); + +/* + * Special order value for pages that aren't in a free list. Such pages are + * either allocated, or part of a free block of pages but not the head page. + */ +#define VM_PAGE_ORDER_UNLISTED (VM_PAGE_NR_FREE_LISTS + 1) + +/* + * Doubly-linked list of free blocks. + */ +struct vm_page_free_list { +    unsigned long size; +    struct list blocks; +}; + +/* + * XXX Because of a potential deadlock involving the default pager (see + * vm_map_lock()), it's currently impossible to reliably determine the + * minimum number of free pages required for successful pageout. Since + * that process is dependent on the amount of physical memory, we scale + * the minimum number of free pages from it, in the hope that memory + * exhaustion happens as rarely as possible... + */ + +/* + * Ratio used to compute the minimum number of pages in a segment. + */ +#define VM_PAGE_SEG_THRESHOLD_MIN_NUM   5 +#define VM_PAGE_SEG_THRESHOLD_MIN_DENOM 100 + +/* + * Number of pages reserved for privileged allocations in a segment. + */ +#define VM_PAGE_SEG_THRESHOLD_MIN 500 + +/* + * Ratio used to compute the threshold below which pageout is started. + */ +#define VM_PAGE_SEG_THRESHOLD_LOW_NUM   6 +#define VM_PAGE_SEG_THRESHOLD_LOW_DENOM 100 + +/* + * Minimum value the low threshold can have for a segment. + */ +#define VM_PAGE_SEG_THRESHOLD_LOW 600 + +#if VM_PAGE_SEG_THRESHOLD_LOW <= VM_PAGE_SEG_THRESHOLD_MIN +#error VM_PAGE_SEG_THRESHOLD_LOW invalid +#endif /* VM_PAGE_SEG_THRESHOLD_LOW >= VM_PAGE_SEG_THRESHOLD_MIN */ + +/* + * Ratio used to compute the threshold above which pageout is stopped. + */ +#define VM_PAGE_SEG_THRESHOLD_HIGH_NUM      10 +#define VM_PAGE_SEG_THRESHOLD_HIGH_DENOM    100 + +/* + * Minimum value the high threshold can have for a segment. + */ +#define VM_PAGE_SEG_THRESHOLD_HIGH 1000 + +#if VM_PAGE_SEG_THRESHOLD_HIGH <= VM_PAGE_SEG_THRESHOLD_LOW +#error VM_PAGE_SEG_THRESHOLD_HIGH invalid +#endif /* VM_PAGE_SEG_THRESHOLD_HIGH <= VM_PAGE_SEG_THRESHOLD_LOW */ + +/* + * Minimum number of pages allowed for a segment. 
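+ * A smaller segment could not satisfy the thresholds above;
+ * vm_page_seg_compute_pageout_thresholds() panics on such a segment
+ * at boot time.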
+ */ +#define VM_PAGE_SEG_MIN_PAGES 2000 + +#if VM_PAGE_SEG_MIN_PAGES <= VM_PAGE_SEG_THRESHOLD_HIGH +#error VM_PAGE_SEG_MIN_PAGES invalid +#endif /* VM_PAGE_SEG_MIN_PAGES <= VM_PAGE_SEG_THRESHOLD_HIGH */ + +/* + * Ratio used to compute the threshold of active pages beyond which + * to refill the inactive queue. + */ +#define VM_PAGE_HIGH_ACTIVE_PAGE_NUM    1 +#define VM_PAGE_HIGH_ACTIVE_PAGE_DENOM  3 + +/* + * Page cache queue. + * + * XXX The current implementation hardcodes a preference to evict external + * pages first and keep internal ones as much as possible. This is because + * the Hurd default pager implementation suffers from bugs that can easily + * cause the system to freeze. + */ +struct vm_page_queue { +    struct list internal_pages; +    struct list external_pages; +}; + +/* + * Segment name buffer size. + */ +#define VM_PAGE_NAME_SIZE 16 + +/* + * Segment of contiguous memory. + * + * XXX Per-segment locking is probably useless, since one or both of the + * page queues lock and the free page queue lock is held on any access. + * However it should first be made clear which lock protects access to + * which members of a segment. + */ +struct vm_page_seg { +    struct vm_page_cpu_pool cpu_pools[NCPUS]; + +    phys_addr_t start; +    phys_addr_t end; +    struct vm_page *pages; +    struct vm_page *pages_end; +    simple_lock_data_t lock; +    struct vm_page_free_list free_lists[VM_PAGE_NR_FREE_LISTS]; +    unsigned long nr_free_pages; + +    /* Free memory thresholds */ +    unsigned long min_free_pages; /* Privileged allocations only */ +    unsigned long low_free_pages; /* Pageout daemon starts scanning */ +    unsigned long high_free_pages; /* Pageout daemon stops scanning, +                                      unprivileged allocations resume */ + +    /* Page cache related data */ +    struct vm_page_queue active_pages; +    unsigned long nr_active_pages; +    unsigned long high_active_pages; +    struct vm_page_queue inactive_pages; +    unsigned long nr_inactive_pages; +}; + +/* + * Bootstrap information about a segment. + */ +struct vm_page_boot_seg { +    phys_addr_t start; +    phys_addr_t end; +    boolean_t heap_present; +    phys_addr_t avail_start; +    phys_addr_t avail_end; +}; + +static int vm_page_is_ready __read_mostly; + +/* + * Segment table. + * + * The system supports a maximum of 4 segments : + *  - DMA: suitable for DMA + *  - DMA32: suitable for DMA when devices support 32-bits addressing + *  - DIRECTMAP: direct physical mapping, allows direct access from + *    the kernel with a simple offset translation + *  - HIGHMEM: must be mapped before it can be accessed + * + * Segments are ordered by priority, 0 being the lowest priority. Their + * relative priorities are DMA < DMA32 < DIRECTMAP < HIGHMEM or + * DMA < DIRECTMAP < DMA32 < HIGHMEM. + * Some segments may actually be aliases for others, e.g. if DMA is always + * possible from the direct physical mapping, DMA and DMA32 are aliases for + * DIRECTMAP, in which case the segment table contains DIRECTMAP and HIGHMEM + * only. + */ +static struct vm_page_seg vm_page_segs[VM_PAGE_MAX_SEGS]; + +/* + * Bootstrap segment table. + */ +static struct vm_page_boot_seg vm_page_boot_segs[VM_PAGE_MAX_SEGS] __initdata; + +/* + * Number of loaded segments. + */ +static unsigned int vm_page_segs_size __read_mostly; + +/* + * If true, unprivileged allocations are blocked, disregarding any other + * condition. + * + * This variable is also used to resume clients once pages are available. 
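+ * (Blocked threads sleep on the address of this variable in
+ * vm_page_wait(); vm_page_check_usable() wakes them up once all
+ * segments are usable again.)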
+ * + * The free page queue lock must be held when accessing this variable. + */ +static boolean_t vm_page_alloc_paused; + +static void __init +vm_page_init_pa(struct vm_page *page, unsigned short seg_index, phys_addr_t pa) +{ +    memset(page, 0, sizeof(*page)); +    vm_page_init(page); /* vm_resident members */ +    page->type = VM_PT_RESERVED; +    page->seg_index = seg_index; +    page->order = VM_PAGE_ORDER_UNLISTED; +    page->priv = NULL; +    page->phys_addr = pa; +} + +void +vm_page_set_type(struct vm_page *page, unsigned int order, unsigned short type) +{ +    unsigned int i, nr_pages; + +    nr_pages = 1 << order; + +    for (i = 0; i < nr_pages; i++) +        page[i].type = type; +} + +static boolean_t +vm_page_pageable(const struct vm_page *page) +{ +    return (page->object != NULL) +           && (page->wire_count == 0) +           && (page->active || page->inactive); +} + +static boolean_t +vm_page_can_move(const struct vm_page *page) +{ +    /* +     * This function is called on pages pulled from the page queues, +     * implying they're pageable, which is why the wire count isn't +     * checked here. +     */ + +    return !page->busy +           && !page->wanted +           && !page->absent +           && page->object->alive; +} + +static void +vm_page_remove_mappings(struct vm_page *page) +{ +    page->busy = TRUE; +    pmap_page_protect(page->phys_addr, VM_PROT_NONE); + +    if (!page->dirty) { +        page->dirty = pmap_is_modified(page->phys_addr); +    } +} + +static void __init +vm_page_free_list_init(struct vm_page_free_list *free_list) +{ +    free_list->size = 0; +    list_init(&free_list->blocks); +} + +static inline void +vm_page_free_list_insert(struct vm_page_free_list *free_list, +                         struct vm_page *page) +{ +    assert(page->order == VM_PAGE_ORDER_UNLISTED); + +    free_list->size++; +    list_insert_head(&free_list->blocks, &page->node); +} + +static inline void +vm_page_free_list_remove(struct vm_page_free_list *free_list, +                         struct vm_page *page) +{ +    assert(page->order != VM_PAGE_ORDER_UNLISTED); + +    free_list->size--; +    list_remove(&page->node); +} + +static struct vm_page * +vm_page_seg_alloc_from_buddy(struct vm_page_seg *seg, unsigned int order) +{ +    struct vm_page_free_list *free_list = free_list; +    struct vm_page *page, *buddy; +    unsigned int i; + +    assert(order < VM_PAGE_NR_FREE_LISTS); + +    if (vm_page_alloc_paused && current_thread() +        && !current_thread()->vm_privilege) { +        return NULL; +    } else if (seg->nr_free_pages <= seg->low_free_pages) { +        vm_pageout_start(); + +        if ((seg->nr_free_pages <= seg->min_free_pages) +            && current_thread() && !current_thread()->vm_privilege) { +            vm_page_alloc_paused = TRUE; +            return NULL; +        } +    } + +    for (i = order; i < VM_PAGE_NR_FREE_LISTS; i++) { +        free_list = &seg->free_lists[i]; + +        if (free_list->size != 0) +            break; +    } + +    if (i == VM_PAGE_NR_FREE_LISTS) +        return NULL; + +    page = list_first_entry(&free_list->blocks, struct vm_page, node); +    vm_page_free_list_remove(free_list, page); +    page->order = VM_PAGE_ORDER_UNLISTED; + +    while (i > order) { +        i--; +        buddy = &page[1 << i]; +        vm_page_free_list_insert(&seg->free_lists[i], buddy); +        buddy->order = i; +    } + +    seg->nr_free_pages -= (1 << order); + +    if (seg->nr_free_pages < seg->min_free_pages) { +        vm_page_alloc_paused 
= TRUE; +    } + +    return page; +} + +static void +vm_page_seg_free_to_buddy(struct vm_page_seg *seg, struct vm_page *page, +                          unsigned int order) +{ +    struct vm_page *buddy; +    phys_addr_t pa, buddy_pa; +    unsigned int nr_pages; + +    assert(page >= seg->pages); +    assert(page < seg->pages_end); +    assert(page->order == VM_PAGE_ORDER_UNLISTED); +    assert(order < VM_PAGE_NR_FREE_LISTS); + +    nr_pages = (1 << order); +    pa = page->phys_addr; + +    while (order < (VM_PAGE_NR_FREE_LISTS - 1)) { +        buddy_pa = pa ^ vm_page_ptoa(1ULL << order); + +        if ((buddy_pa < seg->start) || (buddy_pa >= seg->end)) +            break; + +        buddy = &seg->pages[vm_page_atop(buddy_pa - seg->start)]; + +        if (buddy->order != order) +            break; + +        vm_page_free_list_remove(&seg->free_lists[order], buddy); +        buddy->order = VM_PAGE_ORDER_UNLISTED; +        order++; +        pa &= -vm_page_ptoa(1ULL << order); +        page = &seg->pages[vm_page_atop(pa - seg->start)]; +    } + +    vm_page_free_list_insert(&seg->free_lists[order], page); +    page->order = order; +    seg->nr_free_pages += nr_pages; +} + +static void __init +vm_page_cpu_pool_init(struct vm_page_cpu_pool *cpu_pool, int size) +{ +    simple_lock_init(&cpu_pool->lock); +    cpu_pool->size = size; +    cpu_pool->transfer_size = (size + VM_PAGE_CPU_POOL_TRANSFER_RATIO - 1) +                              / VM_PAGE_CPU_POOL_TRANSFER_RATIO; +    cpu_pool->nr_pages = 0; +    list_init(&cpu_pool->pages); +} + +static inline struct vm_page_cpu_pool * +vm_page_cpu_pool_get(struct vm_page_seg *seg) +{ +    return &seg->cpu_pools[cpu_number()]; +} + +static inline struct vm_page * +vm_page_cpu_pool_pop(struct vm_page_cpu_pool *cpu_pool) +{ +    struct vm_page *page; + +    assert(cpu_pool->nr_pages != 0); +    cpu_pool->nr_pages--; +    page = list_first_entry(&cpu_pool->pages, struct vm_page, node); +    list_remove(&page->node); +    return page; +} + +static inline void +vm_page_cpu_pool_push(struct vm_page_cpu_pool *cpu_pool, struct vm_page *page) +{ +    assert(cpu_pool->nr_pages < cpu_pool->size); +    cpu_pool->nr_pages++; +    list_insert_head(&cpu_pool->pages, &page->node); +} + +static int +vm_page_cpu_pool_fill(struct vm_page_cpu_pool *cpu_pool, +                      struct vm_page_seg *seg) +{ +    struct vm_page *page; +    int i; + +    assert(cpu_pool->nr_pages == 0); + +    simple_lock(&seg->lock); + +    for (i = 0; i < cpu_pool->transfer_size; i++) { +        page = vm_page_seg_alloc_from_buddy(seg, 0); + +        if (page == NULL) +            break; + +        vm_page_cpu_pool_push(cpu_pool, page); +    } + +    simple_unlock(&seg->lock); + +    return i; +} + +static void +vm_page_cpu_pool_drain(struct vm_page_cpu_pool *cpu_pool, +                       struct vm_page_seg *seg) +{ +    struct vm_page *page; +    int i; + +    assert(cpu_pool->nr_pages == cpu_pool->size); + +    simple_lock(&seg->lock); + +    for (i = cpu_pool->transfer_size; i > 0; i--) { +        page = vm_page_cpu_pool_pop(cpu_pool); +        vm_page_seg_free_to_buddy(seg, page, 0); +    } + +    simple_unlock(&seg->lock); +} + +static void +vm_page_queue_init(struct vm_page_queue *queue) +{ +    list_init(&queue->internal_pages); +    list_init(&queue->external_pages); +} + +static void +vm_page_queue_push(struct vm_page_queue *queue, struct vm_page *page) +{ +    if (page->external) { +        list_insert_tail(&queue->external_pages, &page->node); +    } else { +        
list_insert_tail(&queue->internal_pages, &page->node); +    } +} + +static void +vm_page_queue_remove(struct vm_page_queue *queue, struct vm_page *page) +{ +    (void)queue; +    list_remove(&page->node); +} + +static struct vm_page * +vm_page_queue_first(struct vm_page_queue *queue, boolean_t external_only) +{ +    struct vm_page *page; + +    if (!list_empty(&queue->external_pages)) { +        page = list_first_entry(&queue->external_pages, struct vm_page, node); +        return page; +    } + +    if (!external_only && !list_empty(&queue->internal_pages)) { +        page = list_first_entry(&queue->internal_pages, struct vm_page, node); +        return page; +    } + +    return NULL; +} + +static struct vm_page_seg * +vm_page_seg_get(unsigned short index) +{ +    assert(index < vm_page_segs_size); +    return &vm_page_segs[index]; +} + +static unsigned int +vm_page_seg_index(const struct vm_page_seg *seg) +{ +    unsigned int index; + +    index = seg - vm_page_segs; +    assert(index < vm_page_segs_size); +    return index; +} + +static phys_addr_t __init +vm_page_seg_size(struct vm_page_seg *seg) +{ +    return seg->end - seg->start; +} + +static int __init +vm_page_seg_compute_pool_size(struct vm_page_seg *seg) +{ +    phys_addr_t size; + +    size = vm_page_atop(vm_page_seg_size(seg)) / VM_PAGE_CPU_POOL_RATIO; + +    if (size == 0) +        size = 1; +    else if (size > VM_PAGE_CPU_POOL_MAX_SIZE) +        size = VM_PAGE_CPU_POOL_MAX_SIZE; + +    return size; +} + +static void __init +vm_page_seg_compute_pageout_thresholds(struct vm_page_seg *seg) +{ +    unsigned long nr_pages; + +    nr_pages = vm_page_atop(vm_page_seg_size(seg)); + +    if (nr_pages < VM_PAGE_SEG_MIN_PAGES) { +        panic("vm_page: segment too small"); +    } + +    seg->min_free_pages = nr_pages * VM_PAGE_SEG_THRESHOLD_MIN_NUM +                          / VM_PAGE_SEG_THRESHOLD_MIN_DENOM; + +    if (seg->min_free_pages < VM_PAGE_SEG_THRESHOLD_MIN) { +        seg->min_free_pages = VM_PAGE_SEG_THRESHOLD_MIN; +    } + +    seg->low_free_pages = nr_pages * VM_PAGE_SEG_THRESHOLD_LOW_NUM +                          / VM_PAGE_SEG_THRESHOLD_LOW_DENOM; + +    if (seg->low_free_pages < VM_PAGE_SEG_THRESHOLD_LOW) { +        seg->low_free_pages = VM_PAGE_SEG_THRESHOLD_LOW; +    } + +    seg->high_free_pages = nr_pages * VM_PAGE_SEG_THRESHOLD_HIGH_NUM +                           / VM_PAGE_SEG_THRESHOLD_HIGH_DENOM; + +    if (seg->high_free_pages < VM_PAGE_SEG_THRESHOLD_HIGH) { +        seg->high_free_pages = VM_PAGE_SEG_THRESHOLD_HIGH; +    } +} + +static void __init +vm_page_seg_init(struct vm_page_seg *seg, phys_addr_t start, phys_addr_t end, +                 struct vm_page *pages) +{ +    phys_addr_t pa; +    int pool_size; +    unsigned int i; + +    seg->start = start; +    seg->end = end; +    pool_size = vm_page_seg_compute_pool_size(seg); + +    for (i = 0; i < ARRAY_SIZE(seg->cpu_pools); i++) +        vm_page_cpu_pool_init(&seg->cpu_pools[i], pool_size); + +    seg->pages = pages; +    seg->pages_end = pages + vm_page_atop(vm_page_seg_size(seg)); +    simple_lock_init(&seg->lock); + +    for (i = 0; i < ARRAY_SIZE(seg->free_lists); i++) +        vm_page_free_list_init(&seg->free_lists[i]); + +    seg->nr_free_pages = 0; + +    vm_page_seg_compute_pageout_thresholds(seg); + +    vm_page_queue_init(&seg->active_pages); +    seg->nr_active_pages = 0; +    vm_page_queue_init(&seg->inactive_pages); +    seg->nr_inactive_pages = 0; + +    i = vm_page_seg_index(seg); + +    for (pa = seg->start; pa < seg->end; pa += 
PAGE_SIZE) +        vm_page_init_pa(&pages[vm_page_atop(pa - seg->start)], i, pa); +} + +static struct vm_page * +vm_page_seg_alloc(struct vm_page_seg *seg, unsigned int order, +                  unsigned short type) +{ +    struct vm_page_cpu_pool *cpu_pool; +    struct vm_page *page; +    int filled; + +    assert(order < VM_PAGE_NR_FREE_LISTS); + +    if (order == 0) { +        thread_pin(); +        cpu_pool = vm_page_cpu_pool_get(seg); +        simple_lock(&cpu_pool->lock); + +        if (cpu_pool->nr_pages == 0) { +            filled = vm_page_cpu_pool_fill(cpu_pool, seg); + +            if (!filled) { +                simple_unlock(&cpu_pool->lock); +                thread_unpin(); +                return NULL; +            } +        } + +        page = vm_page_cpu_pool_pop(cpu_pool); +        simple_unlock(&cpu_pool->lock); +        thread_unpin(); +    } else { +        simple_lock(&seg->lock); +        page = vm_page_seg_alloc_from_buddy(seg, order); +        simple_unlock(&seg->lock); + +        if (page == NULL) +            return NULL; +    } + +    assert(page->type == VM_PT_FREE); +    vm_page_set_type(page, order, type); +    return page; +} + +static void +vm_page_seg_free(struct vm_page_seg *seg, struct vm_page *page, +                 unsigned int order) +{ +    struct vm_page_cpu_pool *cpu_pool; + +    assert(page->type != VM_PT_FREE); +    assert(order < VM_PAGE_NR_FREE_LISTS); + +    vm_page_set_type(page, order, VM_PT_FREE); + +    if (order == 0) { +        thread_pin(); +        cpu_pool = vm_page_cpu_pool_get(seg); +        simple_lock(&cpu_pool->lock); + +        if (cpu_pool->nr_pages == cpu_pool->size) +            vm_page_cpu_pool_drain(cpu_pool, seg); + +        vm_page_cpu_pool_push(cpu_pool, page); +        simple_unlock(&cpu_pool->lock); +        thread_unpin(); +    } else { +        simple_lock(&seg->lock); +        vm_page_seg_free_to_buddy(seg, page, order); +        simple_unlock(&seg->lock); +    } +} + +static void +vm_page_seg_add_active_page(struct vm_page_seg *seg, struct vm_page *page) +{ +    assert(page->object != NULL); +    assert(page->seg_index == vm_page_seg_index(seg)); +    assert(page->type != VM_PT_FREE); +    assert(page->order == VM_PAGE_ORDER_UNLISTED); +    assert(!page->free && !page->active && !page->inactive); +    page->active = TRUE; +    page->reference = TRUE; +    vm_page_queue_push(&seg->active_pages, page); +    seg->nr_active_pages++; +    vm_page_active_count++; +} + +static void +vm_page_seg_remove_active_page(struct vm_page_seg *seg, struct vm_page *page) +{ +    assert(page->object != NULL); +    assert(page->seg_index == vm_page_seg_index(seg)); +    assert(page->type != VM_PT_FREE); +    assert(page->order == VM_PAGE_ORDER_UNLISTED); +    assert(!page->free && page->active && !page->inactive); +    page->active = FALSE; +    vm_page_queue_remove(&seg->active_pages, page); +    seg->nr_active_pages--; +    vm_page_active_count--; +} + +static void +vm_page_seg_add_inactive_page(struct vm_page_seg *seg, struct vm_page *page) +{ +    assert(page->object != NULL); +    assert(page->seg_index == vm_page_seg_index(seg)); +    assert(page->type != VM_PT_FREE); +    assert(page->order == VM_PAGE_ORDER_UNLISTED); +    assert(!page->free && !page->active && !page->inactive); +    page->inactive = TRUE; +    vm_page_queue_push(&seg->inactive_pages, page); +    seg->nr_inactive_pages++; +    vm_page_inactive_count++; +} + +static void +vm_page_seg_remove_inactive_page(struct vm_page_seg *seg, struct vm_page *page) +{ +    
assert(page->object != NULL); +    assert(page->seg_index == vm_page_seg_index(seg)); +    assert(page->type != VM_PT_FREE); +    assert(page->order == VM_PAGE_ORDER_UNLISTED); +    assert(!page->free && !page->active && page->inactive); +    page->inactive = FALSE; +    vm_page_queue_remove(&seg->inactive_pages, page); +    seg->nr_inactive_pages--; +    vm_page_inactive_count--; +} + +/* + * Attempt to pull an active page. + * + * If successful, the object containing the page is locked. + */ +static struct vm_page * +vm_page_seg_pull_active_page(struct vm_page_seg *seg, boolean_t external_only) +{ +    struct vm_page *page, *first; +    boolean_t locked; + +    first = NULL; + +    for (;;) { +        page = vm_page_queue_first(&seg->active_pages, external_only); + +        if (page == NULL) { +            break; +        } else if (first == NULL) { +            first = page; +        } else if (first == page) { +            break; +        } + +        vm_page_seg_remove_active_page(seg, page); +        locked = vm_object_lock_try(page->object); + +        if (!locked) { +            vm_page_seg_add_active_page(seg, page); +            continue; +        } + +        if (!vm_page_can_move(page)) { +            vm_page_seg_add_active_page(seg, page); +            vm_object_unlock(page->object); +            continue; +        } + +        return page; +    } + +    return NULL; +} + +/* + * Attempt to pull an inactive page. + * + * If successful, the object containing the page is locked. + * + * XXX See vm_page_seg_pull_active_page (duplicated code). + */ +static struct vm_page * +vm_page_seg_pull_inactive_page(struct vm_page_seg *seg, boolean_t external_only) +{ +    struct vm_page *page, *first; +    boolean_t locked; + +    first = NULL; + +    for (;;) { +        page = vm_page_queue_first(&seg->inactive_pages, external_only); + +        if (page == NULL) { +            break; +        } else if (first == NULL) { +            first = page; +        } else if (first == page) { +            break; +        } + +        vm_page_seg_remove_inactive_page(seg, page); +        locked = vm_object_lock_try(page->object); + +        if (!locked) { +            vm_page_seg_add_inactive_page(seg, page); +            continue; +        } + +        if (!vm_page_can_move(page)) { +            vm_page_seg_add_inactive_page(seg, page); +            vm_object_unlock(page->object); +            continue; +        } + +        return page; +    } + +    return NULL; +} + +/* + * Attempt to pull a page cache page. + * + * If successful, the object containing the page is locked. 
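+ * Inactive pages are tried first; *was_active tells the caller which
+ * queue the page was taken from.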
+ */ +static struct vm_page * +vm_page_seg_pull_cache_page(struct vm_page_seg *seg, +                            boolean_t external_only, +                            boolean_t *was_active) +{ +    struct vm_page *page; + +    page = vm_page_seg_pull_inactive_page(seg, external_only); + +    if (page != NULL) { +        *was_active = FALSE; +        return page; +    } + +    page = vm_page_seg_pull_active_page(seg, external_only); + +    if (page != NULL) { +        *was_active = TRUE; +        return page; +    } + +    return NULL; +} + +static boolean_t +vm_page_seg_page_available(const struct vm_page_seg *seg) +{ +    return (seg->nr_free_pages > seg->high_free_pages); +} + +static boolean_t +vm_page_seg_usable(const struct vm_page_seg *seg) +{ +    if ((seg->nr_active_pages + seg->nr_inactive_pages) == 0) { +        /* Nothing to page out, assume segment is usable */ +        return TRUE; +    } + +    return (seg->nr_free_pages >= seg->high_free_pages); +} + +static void +vm_page_seg_double_lock(struct vm_page_seg *seg1, struct vm_page_seg *seg2) +{ +    assert(seg1 != seg2); + +    if (seg1 < seg2) { +        simple_lock(&seg1->lock); +        simple_lock(&seg2->lock); +    } else { +        simple_lock(&seg2->lock); +        simple_lock(&seg1->lock); +    } +} + +static void +vm_page_seg_double_unlock(struct vm_page_seg *seg1, struct vm_page_seg *seg2) +{ +    simple_unlock(&seg1->lock); +    simple_unlock(&seg2->lock); +} + +/* + * Attempt to balance a segment by moving one page to another segment. + * + * Return TRUE if a page was actually moved. + */ +static boolean_t +vm_page_seg_balance_page(struct vm_page_seg *seg, +                         struct vm_page_seg *remote_seg) +{ +    struct vm_page *src, *dest; +    vm_object_t object; +    vm_offset_t offset; +    boolean_t was_active; + +    vm_page_lock_queues(); +    simple_lock(&vm_page_queue_free_lock); +    vm_page_seg_double_lock(seg, remote_seg); + +    if (vm_page_seg_usable(seg) +        || !vm_page_seg_page_available(remote_seg)) { +        goto error; +    } + +    src = vm_page_seg_pull_cache_page(seg, FALSE, &was_active); + +    if (src == NULL) { +        goto error; +    } + +    assert(src->object != NULL); +    assert(!src->fictitious && !src->private); +    assert(src->wire_count == 0); +    assert(src->type != VM_PT_FREE); +    assert(src->order == VM_PAGE_ORDER_UNLISTED); + +    dest = vm_page_seg_alloc_from_buddy(remote_seg, 0); +    assert(dest != NULL); + +    vm_page_seg_double_unlock(seg, remote_seg); +    simple_unlock(&vm_page_queue_free_lock); + +    if (!was_active && !src->reference && pmap_is_referenced(src->phys_addr)) { +        src->reference = TRUE; +    } + +    object = src->object; +    offset = src->offset; +    vm_page_remove(src); + +    vm_page_remove_mappings(src); + +    vm_page_set_type(dest, 0, src->type); +    memcpy(&dest->vm_page_header, &src->vm_page_header, +           VM_PAGE_BODY_SIZE); +    vm_page_copy(src, dest); + +    if (!src->dirty) { +        pmap_clear_modify(dest->phys_addr); +    } + +    dest->busy = FALSE; + +    simple_lock(&vm_page_queue_free_lock); +    vm_page_init(src); +    src->free = TRUE; +    simple_lock(&seg->lock); +    vm_page_set_type(src, 0, VM_PT_FREE); +    vm_page_seg_free_to_buddy(seg, src, 0); +    simple_unlock(&seg->lock); +    simple_unlock(&vm_page_queue_free_lock); + +    vm_object_lock(object); +    vm_page_insert(dest, object, offset); +    vm_object_unlock(object); + +    if (was_active) { +        vm_page_activate(dest); +    } else 
{ +        vm_page_deactivate(dest); +    } + +    vm_page_unlock_queues(); + +    return TRUE; + +error: +    vm_page_seg_double_unlock(seg, remote_seg); +    simple_unlock(&vm_page_queue_free_lock); +    vm_page_unlock_queues(); +    return FALSE; +} + +static boolean_t +vm_page_seg_balance(struct vm_page_seg *seg) +{ +    struct vm_page_seg *remote_seg; +    unsigned int i; +    boolean_t balanced; + +    /* +     * It's important here that pages are moved to lower priority +     * segments first. +     */ + +    for (i = vm_page_segs_size - 1; i < vm_page_segs_size; i--) { +        remote_seg = vm_page_seg_get(i); + +        if (remote_seg == seg) { +            continue; +        } + +        balanced = vm_page_seg_balance_page(seg, remote_seg); + +        if (balanced) { +            return TRUE; +        } +    } + +    return FALSE; +} + +static boolean_t +vm_page_seg_evict(struct vm_page_seg *seg, boolean_t external_only, +                  boolean_t alloc_paused) +{ +    struct vm_page *page; +    boolean_t reclaim, double_paging; +    vm_object_t object; +    boolean_t was_active; + +    page = NULL; +    object = NULL; +    double_paging = FALSE; + +restart: +    vm_page_lock_queues(); +    simple_lock(&seg->lock); + +    if (page != NULL) { +        vm_object_lock(page->object); +    } else { +        page = vm_page_seg_pull_cache_page(seg, external_only, &was_active); + +        if (page == NULL) { +            goto out; +        } +    } + +    assert(page->object != NULL); +    assert(!page->fictitious && !page->private); +    assert(page->wire_count == 0); +    assert(page->type != VM_PT_FREE); +    assert(page->order == VM_PAGE_ORDER_UNLISTED); + +    object = page->object; + +    if (!was_active +        && (page->reference || pmap_is_referenced(page->phys_addr))) { +        vm_page_seg_add_active_page(seg, page); +        simple_unlock(&seg->lock); +        vm_object_unlock(object); +        vm_stat.reactivations++; +        current_task()->reactivations++; +        vm_page_unlock_queues(); +        page = NULL; +        goto restart; +    } + +    vm_page_remove_mappings(page); + +    if (!page->dirty && !page->precious) { +        reclaim = TRUE; +        goto out; +    } + +    reclaim = FALSE; + +    /* +     * If we are very low on memory, then we can't rely on an external +     * pager to clean a dirty page, because external pagers are not +     * vm-privileged. +     * +     * The laundry bit tells vm_pageout_setup not to do any special +     * processing of this page since it's immediately going to be +     * double paged out to the default pager. The laundry bit is +     * reset and the page is inserted into an internal object by +     * vm_pageout_setup before the second double paging pass. +     * +     * There is one important special case: the default pager can +     * back external memory objects. When receiving the first +     * pageout request, where the page is no longer present, a +     * fault could occur, during which the map would be locked. +     * This fault would cause a new paging request to the default +     * pager. Receiving that request would deadlock when trying to +     * lock the map again. Instead, the page isn't double paged +     * and vm_pageout_setup wires the page down, trusting the +     * default pager as for internal pages. 
+     */ + +    assert(!page->laundry); +    assert(!(double_paging && page->external)); + +    if (object->internal || !alloc_paused || +        memory_manager_default_port(object->pager)) { +        double_paging = FALSE; +    } else { +        double_paging = page->laundry = TRUE; +    } + +out: +    simple_unlock(&seg->lock); + +    if (object == NULL) { +        vm_page_unlock_queues(); +        return FALSE; +    } + +    if (reclaim) { +        vm_page_free(page); +        vm_page_unlock_queues(); + +        if (vm_object_collectable(object)) { +            vm_object_collect(object); +        } else { +            vm_object_unlock(object); +        } + +        return TRUE; +    } + +    vm_page_unlock_queues(); + +    /* +     * If there is no memory object for the page, create one and hand it +     * to the default pager. First try to collapse, so we don't create +     * one unnecessarily. +     */ + +    if (!object->pager_initialized) { +        vm_object_collapse(object); +    } + +    if (!object->pager_initialized) { +        vm_object_pager_create(object); +    } + +    if (!object->pager_initialized) { +        panic("vm_page_seg_evict"); +    } + +    vm_pageout_page(page, FALSE, TRUE); /* flush it */ +    vm_object_unlock(object); + +    if (double_paging) { +        goto restart; +    } + +    return TRUE; +} + +static void +vm_page_seg_compute_high_active_page(struct vm_page_seg *seg) +{ +    unsigned long nr_pages; + +    nr_pages = seg->nr_active_pages + seg->nr_inactive_pages; +    seg->high_active_pages = nr_pages * VM_PAGE_HIGH_ACTIVE_PAGE_NUM +                             / VM_PAGE_HIGH_ACTIVE_PAGE_DENOM; +} + +static void +vm_page_seg_refill_inactive(struct vm_page_seg *seg) +{ +    struct vm_page *page; + +    simple_lock(&seg->lock); + +    vm_page_seg_compute_high_active_page(seg); + +    while (seg->nr_active_pages > seg->high_active_pages) { +        page = vm_page_seg_pull_active_page(seg, FALSE); + +        if (page == NULL) { +            break; +        } + +        page->reference = FALSE; +        pmap_clear_reference(page->phys_addr); +        vm_page_seg_add_inactive_page(seg, page); +        vm_object_unlock(page->object); +    } + +    simple_unlock(&seg->lock); +} + +void __init +vm_page_load(unsigned int seg_index, phys_addr_t start, phys_addr_t end) +{ +    struct vm_page_boot_seg *seg; + +    assert(seg_index < ARRAY_SIZE(vm_page_boot_segs)); +    assert(vm_page_aligned(start)); +    assert(vm_page_aligned(end)); +    assert(start < end); +    assert(vm_page_segs_size < ARRAY_SIZE(vm_page_boot_segs)); + +    seg = &vm_page_boot_segs[seg_index]; +    seg->start = start; +    seg->end = end; +    seg->heap_present = FALSE; + +#if DEBUG +    printf("vm_page: load: %s: %llx:%llx\n", +           vm_page_seg_name(seg_index), +           (unsigned long long)start, (unsigned long long)end); +#endif + +    vm_page_segs_size++; +} + +void +vm_page_load_heap(unsigned int seg_index, phys_addr_t start, phys_addr_t end) +{ +    struct vm_page_boot_seg *seg; + +    assert(seg_index < ARRAY_SIZE(vm_page_boot_segs)); +    assert(vm_page_aligned(start)); +    assert(vm_page_aligned(end)); + +    seg = &vm_page_boot_segs[seg_index]; + +    assert(seg->start <= start); +    assert(end <= seg-> end); + +    seg->avail_start = start; +    seg->avail_end = end; +    seg->heap_present = TRUE; + +#if DEBUG +    printf("vm_page: heap: %s: %llx:%llx\n", +           vm_page_seg_name(seg_index), +           (unsigned long long)start, (unsigned long long)end); +#endif +} + 
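+/*
+ * Purely illustrative sketch, not part of this change: one way a
+ * platform's boot code might hand physical memory to this module,
+ * assuming a hypothetical machine whose entire 16 MiB of RAM fall into
+ * the lowest (DMA) segment and whose kernel image occupies the first
+ * 4 MiB. Real boundaries come from the boot memory map, and this
+ * hypothetical helper is not called anywhere.
+ */
+#if 0	/* example only */
+static void __init
+example_load_phys_mem(void)
+{
+    phys_addr_t start = 0;                      /* hypothetical */
+    phys_addr_t end = 16 * 1024 * 1024;         /* hypothetical */
+    phys_addr_t heap_start = 4 * 1024 * 1024;   /* past the kernel image */
+
+    /* Declare the whole segment, then the part usable as heap. */
+    vm_page_load(VM_PAGE_SEG_DMA, start, end);
+    vm_page_load_heap(VM_PAGE_SEG_DMA, heap_start, end);
+
+    /* Build the struct vm_page table and populate the free lists. */
+    vm_page_setup();
+}
+#endif	/* example only */
+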
+int +vm_page_ready(void) +{ +    return vm_page_is_ready; +} + +static unsigned int +vm_page_select_alloc_seg(unsigned int selector) +{ +    unsigned int seg_index; + +    switch (selector) { +    case VM_PAGE_SEL_DMA: +        seg_index = VM_PAGE_SEG_DMA; +        break; +    case VM_PAGE_SEL_DMA32: +        seg_index = VM_PAGE_SEG_DMA32; +        break; +    case VM_PAGE_SEL_DIRECTMAP: +        seg_index = VM_PAGE_SEG_DIRECTMAP; +        break; +    case VM_PAGE_SEL_HIGHMEM: +        seg_index = VM_PAGE_SEG_HIGHMEM; +        break; +    default: +        panic("vm_page: invalid selector"); +    } + +    return MIN(vm_page_segs_size - 1, seg_index); +} + +static int __init +vm_page_boot_seg_loaded(const struct vm_page_boot_seg *seg) +{ +    return (seg->end != 0); +} + +static void __init +vm_page_check_boot_segs(void) +{ +    unsigned int i; +    int expect_loaded; + +    if (vm_page_segs_size == 0) +        panic("vm_page: no physical memory loaded"); + +    for (i = 0; i < ARRAY_SIZE(vm_page_boot_segs); i++) { +        expect_loaded = (i < vm_page_segs_size); + +        if (vm_page_boot_seg_loaded(&vm_page_boot_segs[i]) == expect_loaded) +            continue; + +        panic("vm_page: invalid boot segment table"); +    } +} + +static phys_addr_t __init +vm_page_boot_seg_size(struct vm_page_boot_seg *seg) +{ +    return seg->end - seg->start; +} + +static phys_addr_t __init +vm_page_boot_seg_avail_size(struct vm_page_boot_seg *seg) +{ +    return seg->avail_end - seg->avail_start; +} + +phys_addr_t __init +vm_page_bootalloc(size_t size) +{ +    struct vm_page_boot_seg *seg; +    phys_addr_t pa; +    unsigned int i; + +    for (i = vm_page_select_alloc_seg(VM_PAGE_SEL_DIRECTMAP); +         i < vm_page_segs_size; +         i--) { +        seg = &vm_page_boot_segs[i]; + +        if (size <= vm_page_boot_seg_avail_size(seg)) { +            pa = seg->avail_start; +            seg->avail_start += vm_page_round(size); +            return pa; +        } +    } + +    panic("vm_page: no physical memory available"); +} + +void __init +vm_page_setup(void) +{ +    struct vm_page_boot_seg *boot_seg; +    struct vm_page_seg *seg; +    struct vm_page *table, *page, *end; +    size_t nr_pages, table_size; +    unsigned long va; +    unsigned int i; +    phys_addr_t pa; + +    vm_page_check_boot_segs(); + +    /* +     * Compute the page table size. +     */ +    nr_pages = 0; + +    for (i = 0; i < vm_page_segs_size; i++) +        nr_pages += vm_page_atop(vm_page_boot_seg_size(&vm_page_boot_segs[i])); + +    table_size = vm_page_round(nr_pages * sizeof(struct vm_page)); +    printf("vm_page: page table size: %lu entries (%luk)\n", nr_pages, +           table_size >> 10); +    table = (struct vm_page *)pmap_steal_memory(table_size); +    va = (unsigned long)table; + +    /* +     * Initialize the segments, associating them to the page table. When +     * the segments are initialized, all their pages are set allocated. +     * Pages are then released, which populates the free lists. 
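+     * Pages backing the struct vm_page table itself are then marked
+     * VM_PT_TABLE in the loop that follows the segment initialization.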
+     */
+    for (i = 0; i < vm_page_segs_size; i++) {
+        seg = &vm_page_segs[i];
+        boot_seg = &vm_page_boot_segs[i];
+        vm_page_seg_init(seg, boot_seg->start, boot_seg->end, table);
+        page = seg->pages + vm_page_atop(boot_seg->avail_start
+                                         - boot_seg->start);
+        end = seg->pages + vm_page_atop(boot_seg->avail_end
+                                        - boot_seg->start);
+
+        while (page < end) {
+            page->type = VM_PT_FREE;
+            vm_page_seg_free_to_buddy(seg, page, 0);
+            page++;
+        }
+
+        table += vm_page_atop(vm_page_seg_size(seg));
+    }
+
+    while (va < (unsigned long)table) {
+        pa = pmap_extract(kernel_pmap, va);
+        page = vm_page_lookup_pa(pa);
+        assert((page != NULL) && (page->type == VM_PT_RESERVED));
+        page->type = VM_PT_TABLE;
+        va += PAGE_SIZE;
+    }
+
+    vm_page_is_ready = 1;
+}
+
+void __init
+vm_page_manage(struct vm_page *page)
+{
+    assert(page->seg_index < ARRAY_SIZE(vm_page_segs));
+    assert(page->type == VM_PT_RESERVED);
+
+    vm_page_set_type(page, 0, VM_PT_FREE);
+    vm_page_seg_free_to_buddy(&vm_page_segs[page->seg_index], page, 0);
+}
+
+struct vm_page *
+vm_page_lookup_pa(phys_addr_t pa)
+{
+    struct vm_page_seg *seg;
+    unsigned int i;
+
+    for (i = 0; i < vm_page_segs_size; i++) {
+        seg = &vm_page_segs[i];
+
+        if ((pa >= seg->start) && (pa < seg->end))
+            return &seg->pages[vm_page_atop(pa - seg->start)];
+    }
+
+    return NULL;
+}
+
+static struct vm_page_seg *
+vm_page_lookup_seg(const struct vm_page *page)
+{
+    struct vm_page_seg *seg;
+    unsigned int i;
+
+    for (i = 0; i < vm_page_segs_size; i++) {
+        seg = &vm_page_segs[i];
+
+        if ((page->phys_addr >= seg->start) && (page->phys_addr < seg->end)) {
+            return seg;
+        }
+    }
+
+    return NULL;
+}
+
+void vm_page_check(const struct vm_page *page)
+{
+    if (page->fictitious) {
+        if (page->private) {
+            panic("vm_page: page both fictitious and private");
+        }
+
+        if (page->phys_addr != vm_page_fictitious_addr) {
+            panic("vm_page: invalid fictitious page");
+        }
+    } else {
+        struct vm_page_seg *seg;
+
+        if (page->phys_addr == vm_page_fictitious_addr) {
+            panic("vm_page: real page has fictitious address");
+        }
+
+        seg = vm_page_lookup_seg(page);
+
+        if (seg == NULL) {
+            if (!page->private) {
+                panic("vm_page: page claims it's managed but not in any segment");
+            }
+        } else {
+            if (page->private) {
+                struct vm_page *real_page;
+
+                if (vm_page_pageable(page)) {
+                    panic("vm_page: private page is pageable");
+                }
+
+                real_page = vm_page_lookup_pa(page->phys_addr);
+
+                if (vm_page_pageable(real_page)) {
+                    panic("vm_page: page underlying private page is pageable");
+                }
+
+                if ((real_page->type == VM_PT_FREE)
+                    || (real_page->order != VM_PAGE_ORDER_UNLISTED)) {
+                    panic("vm_page: page underlying private page is free");
+                }
+            } else {
+                unsigned int index;
+
+                index = vm_page_seg_index(seg);
+
+                if (index != page->seg_index) {
+                    panic("vm_page: page segment mismatch");
+    
            } +            } +        } +    } +} + +struct vm_page * +vm_page_alloc_pa(unsigned int order, unsigned int selector, unsigned short type) +{ +    struct vm_page *page; +    unsigned int i; + +    for (i = vm_page_select_alloc_seg(selector); i < vm_page_segs_size; i--) { +        page = vm_page_seg_alloc(&vm_page_segs[i], order, type); + +        if (page != NULL) +            return page; +    } + +    if (!current_thread() || current_thread()->vm_privilege) +        panic("vm_page: privileged thread unable to allocate page"); + +    return NULL; +} + +void +vm_page_free_pa(struct vm_page *page, unsigned int order) +{ +    assert(page != NULL); +    assert(page->seg_index < ARRAY_SIZE(vm_page_segs)); + +    vm_page_seg_free(&vm_page_segs[page->seg_index], page, order); +} + +const char * +vm_page_seg_name(unsigned int seg_index) +{ +    /* Don't use a switch statement since segments can be aliased */ +    if (seg_index == VM_PAGE_SEG_HIGHMEM) +        return "HIGHMEM"; +    else if (seg_index == VM_PAGE_SEG_DIRECTMAP) +        return "DIRECTMAP"; +    else if (seg_index == VM_PAGE_SEG_DMA32) +        return "DMA32"; +    else if (seg_index == VM_PAGE_SEG_DMA) +        return "DMA"; +    else +        panic("vm_page: invalid segment index"); +} + +void +vm_page_info_all(void) +{ +    struct vm_page_seg *seg; +    unsigned long pages; +    unsigned int i; + +    for (i = 0; i < vm_page_segs_size; i++) { +        seg = &vm_page_segs[i]; +        pages = (unsigned long)(seg->pages_end - seg->pages); +        printf("vm_page: %s: pages: %lu (%luM), free: %lu (%luM)\n", +               vm_page_seg_name(i), pages, pages >> (20 - PAGE_SHIFT), +               seg->nr_free_pages, seg->nr_free_pages >> (20 - PAGE_SHIFT)); +        printf("vm_page: %s: min:%lu low:%lu high:%lu\n", +               vm_page_seg_name(vm_page_seg_index(seg)), +               seg->min_free_pages, seg->low_free_pages, seg->high_free_pages); +    } +} + +phys_addr_t +vm_page_seg_end(unsigned int selector) +{ +    return vm_page_segs[vm_page_select_alloc_seg(selector)].end; +} + +static unsigned long +vm_page_boot_table_size(void) +{ +    unsigned long nr_pages; +    unsigned int i; + +    nr_pages = 0; + +    for (i = 0; i < vm_page_segs_size; i++) { +        nr_pages += vm_page_atop(vm_page_boot_seg_size(&vm_page_boot_segs[i])); +    } + +    return nr_pages; +} + +unsigned long +vm_page_table_size(void) +{ +    unsigned long nr_pages; +    unsigned int i; + +    if (!vm_page_is_ready) { +        return vm_page_boot_table_size(); +    } + +    nr_pages = 0; + +    for (i = 0; i < vm_page_segs_size; i++) { +        nr_pages += vm_page_atop(vm_page_seg_size(&vm_page_segs[i])); +    } + +    return nr_pages; +} + +unsigned long +vm_page_table_index(phys_addr_t pa) +{ +    struct vm_page_seg *seg; +    unsigned long index; +    unsigned int i; + +    index = 0; + +    for (i = 0; i < vm_page_segs_size; i++) { +        seg = &vm_page_segs[i]; + +        if ((pa >= seg->start) && (pa < seg->end)) { +            return index + vm_page_atop(pa - seg->start); +        } + +        index += vm_page_atop(vm_page_seg_size(seg)); +    } + +    panic("vm_page: invalid physical address"); +} + +phys_addr_t +vm_page_mem_size(void) +{ +    phys_addr_t total; +    unsigned int i; + +    total = 0; + +    for (i = 0; i < vm_page_segs_size; i++) { +        total += vm_page_seg_size(&vm_page_segs[i]); +    } + +    return total; +} + +unsigned long +vm_page_mem_free(void) +{ +    unsigned long total; +    unsigned int i; + +    
total = 0; + +    for (i = 0; i < vm_page_segs_size; i++) { +        total += vm_page_segs[i].nr_free_pages; +    } + +    return total; +} + +/* + * Mark this page as wired down by yet another map, removing it + * from paging queues as necessary. + * + * The page's object and the page queues must be locked. + */ +void +vm_page_wire(struct vm_page *page) +{ +    VM_PAGE_CHECK(page); + +    if (page->wire_count == 0) { +        vm_page_queues_remove(page); + +        if (!page->private && !page->fictitious) { +            vm_page_wire_count++; +        } +    } + +    page->wire_count++; +} + +/* + * Release one wiring of this page, potentially enabling it to be paged again. + * + * The page's object and the page queues must be locked. + */ +void +vm_page_unwire(struct vm_page *page) +{ +    struct vm_page_seg *seg; + +    VM_PAGE_CHECK(page); + +    assert(page->wire_count != 0); +    page->wire_count--; + +    if ((page->wire_count != 0) +        || page->fictitious +        || page->private) { +        return; +    } + +    seg = vm_page_seg_get(page->seg_index); + +    simple_lock(&seg->lock); +    vm_page_seg_add_active_page(seg, page); +    simple_unlock(&seg->lock); + +    vm_page_wire_count--; +} + +/* + * Returns the given page to the inactive list, indicating that + * no physical maps have access to this page. + * [Used by the physical mapping system.] + * + * The page queues must be locked. + */ +void +vm_page_deactivate(struct vm_page *page) +{ +    struct vm_page_seg *seg; + +    VM_PAGE_CHECK(page); + +    /* +     * This page is no longer very interesting.  If it was +     * interesting (active or inactive/referenced), then we +     * clear the reference bit and (re)enter it in the +     * inactive queue.  Note wired pages should not have +     * their reference bit cleared. +     */ + +    if (page->active || (page->inactive && page->reference)) { +        if (!page->fictitious && !page->private && !page->absent) { +            pmap_clear_reference(page->phys_addr); +        } + +        page->reference = FALSE; +        vm_page_queues_remove(page); +    } + +    if ((page->wire_count == 0) && !page->fictitious +        && !page->private && !page->inactive) { +        seg = vm_page_seg_get(page->seg_index); + +        simple_lock(&seg->lock); +        vm_page_seg_add_inactive_page(seg, page); +        simple_unlock(&seg->lock); +    } +} + +/* + * Put the specified page on the active list (if appropriate). + * + * The page queues must be locked. + */ +void +vm_page_activate(struct vm_page *page) +{ +    struct vm_page_seg *seg; + +    VM_PAGE_CHECK(page); + +    /* +     * Unconditionally remove so that, even if the page was already +     * active, it gets back to the end of the active queue. 
+     */ +    vm_page_queues_remove(page); + +    if ((page->wire_count == 0) && !page->fictitious && !page->private) { +        seg = vm_page_seg_get(page->seg_index); + +        if (page->active) +            panic("vm_page_activate: already active"); + +        simple_lock(&seg->lock); +        vm_page_seg_add_active_page(seg, page); +        simple_unlock(&seg->lock); +    } +} + +void +vm_page_queues_remove(struct vm_page *page) +{ +    struct vm_page_seg *seg; + +    assert(!page->active || !page->inactive); + +    if (!page->active && !page->inactive) { +        return; +    } + +    seg = vm_page_seg_get(page->seg_index); + +    simple_lock(&seg->lock); + +    if (page->active) { +        vm_page_seg_remove_active_page(seg, page); +    } else { +        vm_page_seg_remove_inactive_page(seg, page); +    } + +    simple_unlock(&seg->lock); +} + +/* + * Check whether segments are all usable for unprivileged allocations. + * + * If all segments are usable, resume pending unprivileged allocations + * and return TRUE. + * + * This function acquires vm_page_queue_free_lock, which is held on return. + */ +static boolean_t +vm_page_check_usable(void) +{ +    struct vm_page_seg *seg; +    boolean_t usable; +    unsigned int i; + +    simple_lock(&vm_page_queue_free_lock); + +    for (i = 0; i < vm_page_segs_size; i++) { +        seg = vm_page_seg_get(i); + +        simple_lock(&seg->lock); +        usable = vm_page_seg_usable(seg); +        simple_unlock(&seg->lock); + +        if (!usable) { +            return FALSE; +        } +    } + +    vm_page_external_laundry_count = -1; +    vm_page_alloc_paused = FALSE; +    thread_wakeup(&vm_page_alloc_paused); +    return TRUE; +} + +static boolean_t +vm_page_may_balance(void) +{ +    struct vm_page_seg *seg; +    boolean_t page_available; +    unsigned int i; + +    for (i = 0; i < vm_page_segs_size; i++) { +        seg = vm_page_seg_get(i); + +        simple_lock(&seg->lock); +        page_available = vm_page_seg_page_available(seg); +        simple_unlock(&seg->lock); + +        if (page_available) { +            return TRUE; +        } +    } + +    return FALSE; +} + +static boolean_t +vm_page_balance_once(void) +{ +    boolean_t balanced; +    unsigned int i; + +    /* +     * It's important here that pages are moved from higher priority +     * segments first. +     */ + +    for (i = 0; i < vm_page_segs_size; i++) { +        balanced = vm_page_seg_balance(vm_page_seg_get(i)); + +        if (balanced) { +            return TRUE; +        } +    } + +    return FALSE; +} + +boolean_t +vm_page_balance(void) +{ +    boolean_t balanced; + +    while (vm_page_may_balance()) { +        balanced = vm_page_balance_once(); + +        if (!balanced) { +            break; +        } +    } + +    return vm_page_check_usable(); +} + +static boolean_t +vm_page_evict_once(boolean_t external_only, boolean_t alloc_paused) +{ +    boolean_t evicted; +    unsigned int i; + +    /* +     * It's important here that pages are evicted from lower priority +     * segments first. 
+     */ + +    for (i = vm_page_segs_size - 1; i < vm_page_segs_size; i--) { +        evicted = vm_page_seg_evict(vm_page_seg_get(i), +                                    external_only, alloc_paused); + +        if (evicted) { +            return TRUE; +        } +    } + +    return FALSE; +} + +#define VM_PAGE_MAX_LAUNDRY   5 +#define VM_PAGE_MAX_EVICTIONS 5 + +boolean_t +vm_page_evict(boolean_t *should_wait) +{ +    boolean_t pause, evicted, external_only, alloc_paused; +    unsigned int i; + +    *should_wait = TRUE; +    external_only = TRUE; + +    simple_lock(&vm_page_queue_free_lock); +    vm_page_external_laundry_count = 0; +    alloc_paused = vm_page_alloc_paused; +    simple_unlock(&vm_page_queue_free_lock); + +again: +    vm_page_lock_queues(); +    pause = (vm_page_laundry_count >= VM_PAGE_MAX_LAUNDRY); +    vm_page_unlock_queues(); + +    if (pause) { +        simple_lock(&vm_page_queue_free_lock); +        return FALSE; +    } + +    for (i = 0; i < VM_PAGE_MAX_EVICTIONS; i++) { +        evicted = vm_page_evict_once(external_only, alloc_paused); + +        if (!evicted) { +            break; +        } +    } + +    simple_lock(&vm_page_queue_free_lock); + +    /* +     * Keep in mind eviction may not cause pageouts, since non-precious +     * clean pages are simply released. +     */ +    if ((vm_page_laundry_count == 0) && (vm_page_external_laundry_count == 0)) { +        /* +         * No pageout, but some clean pages were freed. Start a complete +         * scan again without waiting. +         */ +        if (evicted) { +            *should_wait = FALSE; +            return FALSE; +        } + +        /* +         * Eviction failed, consider pages from internal objects on the +         * next attempt. +         */ +        if (external_only) { +            simple_unlock(&vm_page_queue_free_lock); +            external_only = FALSE; +            goto again; +        } + +        /* +         * TODO Find out what could cause this and how to deal with it. +         * This will likely require an out-of-memory killer. 
+         */
+
+        {
+            static boolean_t warned = FALSE;
+
+            if (!warned) {
+                printf("vm_page warning: unable to recycle any page\n");
+                warned = 1;
+            }
+        }
+    }
+
+    simple_unlock(&vm_page_queue_free_lock);
+
+    return vm_page_check_usable();
+}
+
+void
+vm_page_refill_inactive(void)
+{
+    unsigned int i;
+
+    vm_page_lock_queues();
+
+    for (i = 0; i < vm_page_segs_size; i++) {
+        vm_page_seg_refill_inactive(vm_page_seg_get(i));
+    }
+
+    vm_page_unlock_queues();
+}
+
+void
+vm_page_wait(void (*continuation)(void))
+{
+    assert(!current_thread()->vm_privilege);
+
+    simple_lock(&vm_page_queue_free_lock);
+
+    if (!vm_page_alloc_paused) {
+        simple_unlock(&vm_page_queue_free_lock);
+        return;
+    }
+
+    assert_wait(&vm_page_alloc_paused, FALSE);
+
+    simple_unlock(&vm_page_queue_free_lock);
+
+    if (continuation != 0) {
+        counter(c_vm_page_wait_block_user++);
+        thread_block(continuation);
+    } else {
+        counter(c_vm_page_wait_block_kernel++);
+        thread_block((void (*)(void)) 0);
+    }
+}
+
+#if MACH_KDB
+#include <ddb/db_output.h>
+#define PAGES_PER_MB ((1<<20) / PAGE_SIZE)
+void db_show_vmstat(void)
+{
+	integer_t free_count = vm_page_mem_free();
+	unsigned i;
+
+	db_printf("%-20s %10uM\n", "size:",
+		(free_count + vm_page_active_count +
+		  vm_page_inactive_count + vm_page_wire_count)
+		 / PAGES_PER_MB);
+
+	db_printf("%-20s %10uM\n", "free:",
+		free_count / PAGES_PER_MB);
+	db_printf("%-20s %10uM\n", "active:",
+		vm_page_active_count / PAGES_PER_MB);
+	db_printf("%-20s %10uM\n", "inactive:",
+		vm_page_inactive_count / PAGES_PER_MB);
+	db_printf("%-20s %10uM\n", "wired:",
+		vm_page_wire_count / PAGES_PER_MB);
+
+	db_printf("%-20s %10uM\n", "zero filled:",
+		vm_stat.zero_fill_count / PAGES_PER_MB);
+	db_printf("%-20s %10uM\n", "reactivated:",
+		vm_stat.reactivations / PAGES_PER_MB);
+	db_printf("%-20s %10uM\n", "pageins:",
+		vm_stat.pageins / PAGES_PER_MB);
+	db_printf("%-20s %10uM\n", "pageouts:",
+		vm_stat.pageouts / PAGES_PER_MB);
+	db_printf("%-20s %10uM\n", "page faults:",
+		vm_stat.faults / PAGES_PER_MB);
+	db_printf("%-20s %10uM\n", "cow faults:",
+		vm_stat.cow_faults / PAGES_PER_MB);
+	db_printf("%-20s %10u%%\n", "memobj hit ratio:",
+		(vm_stat.hits * 100) / vm_stat.lookups);
+
+	db_printf("%-20s %10u\n", "cached_memobjs",
+		vm_object_external_count);
+	db_printf("%-20s %10uM\n", "cache",
+		vm_object_external_pages / PAGES_PER_MB);
+
+	for (i = 0; i < vm_page_segs_size; i++)
+	{
+		db_printf("\nSegment %s:\n", vm_page_seg_name(i));
+		db_printf("%-20s %10uM\n", "size:",
+			vm_page_seg_size(&vm_page_segs[i]) >> 20);
+		db_printf("%-20s %10uM\n", "free:",
+			vm_page_segs[i].nr_free_pages / PAGES_PER_MB);
+		db_printf("%-20s %10uM\n", "min_free:",
+			vm_page_segs[i].min_free_pages / PAGES_PER_MB);
+		db_printf("%-20s %10uM\n", "low_free:",
+			vm_page_segs[i].low_free_pages / PAGES_PER_MB);
+		db_printf("%-20s %10uM\n", "high_free:",
+			vm_page_segs[i].high_free_pages / PAGES_PER_MB);
+		db_printf("%-20s %10uM\n", "active:",
+			vm_page_segs[i].nr_active_pages / PAGES_PER_MB);
+		db_printf("%-20s %10uM\n", "high active:",
+			vm_page_segs[i].high_active_pages / PAGES_PER_MB);
+		db_printf("%-20s %10uM\n", "inactive:",
+			vm_page_segs[i].nr_inactive_pages / PAGES_PER_MB);
+	}
+}
+#endif /* MACH_KDB */
diff --git a/vm/vm_page.h b/vm/vm_page.h
new file mode 100644
index 0000000..3be75f1
--- /dev/null
+++ 
b/vm/vm_page.h @@ -0,0 +1,567 @@ +/*  + * Mach Operating System + * Copyright (c) 1993-1988 Carnegie Mellon University + * All Rights Reserved. + *  + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + *  + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + *  + * Carnegie Mellon requests users of this software to return to + *  + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + *  + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	File:	vm/vm_page.h + *	Author:	Avadis Tevanian, Jr., Michael Wayne Young + *	Date:	1985 + * + *	Resident memory system definitions. + */ + +#ifndef	_VM_VM_PAGE_H_ +#define _VM_VM_PAGE_H_ + +#include <mach/boolean.h> +#include <mach/vm_prot.h> +#include <machine/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_types.h> +#include <kern/queue.h> +#include <kern/list.h> +#include <kern/lock.h> +#include <kern/log2.h> + +#include <kern/macros.h> +#include <kern/sched_prim.h>	/* definitions of wait/wakeup */ + +#if	MACH_VM_DEBUG +#include <mach_debug/hash_info.h> +#endif + +/* + *	Management of resident (logical) pages. + * + *	A small structure is kept for each resident + *	page, indexed by page number.  Each structure + *	is an element of several lists: + * + *		A hash table bucket used to quickly + *		perform object/offset lookups + * + *		A list of all pages for a given object, + *		so they can be quickly deactivated at + *		time of deallocation. + * + *		An ordered list of pages due for pageout. + * + *	In addition, the structure contains the object + *	and offset to which this page belongs (for pageout), + *	and sundry status bits. + * + *	Fields in this structure are locked either by the lock on the + *	object that the page belongs to (O) or by the lock on the page + *	queues (P).  [Some fields require that both locks be held to + *	change that field; holding either lock is sufficient to read.] + */ + +struct vm_page { +	struct list node;		/* page queues or free list (P) */ +	void *priv; + +	/* +	 * This member is used throughout the code and may only change for +	 * fictitious pages. +	 */ +	phys_addr_t phys_addr; + +	queue_chain_t	listq;		/* all pages in same object (O) */ +	struct vm_page	*next;		/* VP bucket link (O) */ + +	/* We use an empty struct as the delimiter.  */ +	struct {} vm_page_header; + +	vm_object_t	object;		/* which object am I in (O,P) */ +	vm_offset_t	offset;		/* offset into that object (O,P) */ + +	unsigned int	wire_count:15,	/* how many wired down maps use me? 
+					   (O&P) */ +	/* boolean_t */	inactive:1,	/* page is in inactive list (P) */ +			active:1,	/* page is in active list (P) */ +			laundry:1,	/* page is being cleaned now (P)*/ +			external_laundry:1,	/* same as laundry for external pagers (P)*/ +			free:1,		/* page is on free list (P) */ +			reference:1,	/* page has been used (P) */ +			external:1,	/* page in external object (P) */ +			busy:1,		/* page is in transit (O) */ +			wanted:1,	/* someone is waiting for page (O) */ +			tabled:1,	/* page is in VP table (O) */ +			fictitious:1,	/* Physical page doesn't exist (O) */ +			private:1,	/* Page should not be returned to +					 *  the free list (O) */ +			absent:1,	/* Data has been requested, but is +					 *  not yet available (O) */ +			error:1,	/* Data manager was unable to provide +					 *  data due to error (O) */ +			dirty:1,	/* Page must be cleaned (O) */ +			precious:1,	/* Page is precious; data must be +					 *  returned even if clean (O) */ +			overwriting:1;	/* Request to unlock has been made +					 * without having data. (O) +					 * [See vm_object_overwrite] */ + +	vm_prot_t	page_lock:3;	/* Uses prohibited by data manager (O) */ +	vm_prot_t	unlock_request:3;	/* Outstanding unlock request (O) */ + +	struct {} vm_page_footer; + +	unsigned short type:2; +	unsigned short seg_index:2; +	unsigned short order:4; +}; + +#define VM_PAGE_BODY_SIZE					\ +		(offsetof(struct vm_page, vm_page_footer)	\ +		- offsetof(struct vm_page, vm_page_header)) + +/* + *	For debugging, this macro can be defined to perform + *	some useful check on a page structure. + */ + +#define VM_PAGE_CHECK(mem) vm_page_check(mem) + +void vm_page_check(const struct vm_page *page); + +/* + *	Each pageable resident page falls into one of three lists: + * + *	free	 + *		Available for allocation now. + *	inactive + *		Not referenced in any map, but still has an + *		object/offset-page mapping, and may be dirty. + *		This is the list of pages that should be + *		paged out next. + *	active + *		A list of pages which have been placed in + *		at least one physical map.  This list is + *		ordered, in LRU-like fashion. + */ + +#define VM_PAGE_DMA		0x01 +#if defined(VM_PAGE_DMA32_LIMIT) && VM_PAGE_DMA32_LIMIT > VM_PAGE_DIRECTMAP_LIMIT +#define VM_PAGE_DIRECTMAP      0x02 +#define VM_PAGE_DMA32          0x04 +#else +#define VM_PAGE_DMA32		0x02 +#define VM_PAGE_DIRECTMAP	0x04 +#endif +#define VM_PAGE_HIGHMEM		0x08 + +extern +int	vm_page_fictitious_count;/* How many fictitious pages are free? */ +extern +int	vm_page_active_count;	/* How many pages are active? */ +extern +int	vm_page_inactive_count;	/* How many pages are inactive? */ +extern +int	vm_page_wire_count;	/* How many pages are wired? */ +extern +int	vm_page_laundry_count;	/* How many pages being laundered? */ +extern +int	vm_page_external_laundry_count;	/* How many external pages being paged out? 
*/ + +decl_simple_lock_data(extern,vm_page_queue_lock)/* lock on active and inactive +						   page queues */ +decl_simple_lock_data(extern,vm_page_queue_free_lock) +						/* lock on free page queue */ + +extern phys_addr_t	vm_page_fictitious_addr; +				/* (fake) phys_addr of fictitious pages */ + +extern void		vm_page_bootstrap( +	vm_offset_t	*startp, +	vm_offset_t	*endp); +extern void		vm_page_module_init(void); + +extern vm_page_t	vm_page_lookup( +	vm_object_t	object, +	vm_offset_t	offset); +extern vm_page_t	vm_page_grab_fictitious(void); +extern boolean_t	vm_page_convert(vm_page_t *); +extern void		vm_page_more_fictitious(void); +extern vm_page_t	vm_page_grab(unsigned flags); +extern void		vm_page_release(vm_page_t, boolean_t, boolean_t); +extern phys_addr_t	vm_page_grab_phys_addr(void); +extern vm_page_t	vm_page_grab_contig(vm_size_t, unsigned int); +extern void		vm_page_free_contig(vm_page_t, vm_size_t); +extern void		vm_page_wait(void (*)(void)); +extern vm_page_t	vm_page_alloc( +	vm_object_t	object, +	vm_offset_t	offset); +extern void		vm_page_init( +	vm_page_t	mem); +extern void		vm_page_free(vm_page_t); +extern void		vm_page_activate(vm_page_t); +extern void		vm_page_deactivate(vm_page_t); +extern void		vm_page_rename( +	vm_page_t	mem, +	vm_object_t	new_object, +	vm_offset_t	new_offset); +extern void		vm_page_insert( +	vm_page_t	mem, +	vm_object_t	object, +	vm_offset_t	offset); +extern void		vm_page_remove( +	vm_page_t	mem); + +extern void		vm_page_zero_fill(vm_page_t); +extern void		vm_page_copy(vm_page_t src_m, vm_page_t dest_m); + +extern void		vm_page_wire(vm_page_t); +extern void		vm_page_unwire(vm_page_t); + +#if	MACH_VM_DEBUG +extern unsigned int	vm_page_info( +	hash_info_bucket_t	*info, +	unsigned int		count); +#endif + +/* + *	Functions implemented as macros + */ + +#define PAGE_ASSERT_WAIT(m, interruptible)			\ +		MACRO_BEGIN					\ +		(m)->wanted = TRUE;				\ +		assert_wait((event_t) (m), (interruptible));	\ +		MACRO_END + +#define PAGE_WAKEUP_DONE(m)					\ +		MACRO_BEGIN					\ +		(m)->busy = FALSE;				\ +		if ((m)->wanted) {				\ +			(m)->wanted = FALSE;			\ +			thread_wakeup(((event_t) m));		\ +		}						\ +		MACRO_END + +#define PAGE_WAKEUP(m)						\ +		MACRO_BEGIN					\ +		if ((m)->wanted) {				\ +			(m)->wanted = FALSE;			\ +			thread_wakeup((event_t) (m));		\ +		}						\ +		MACRO_END + +#define VM_PAGE_FREE(p) 			\ +		MACRO_BEGIN			\ +		vm_page_lock_queues();		\ +		vm_page_free(p);		\ +		vm_page_unlock_queues();	\ +		MACRO_END + +/* + *	Macro to be used in place of pmap_enter() + */ + +#define PMAP_ENTER(pmap, virtual_address, page, protection, wired) \ +		MACRO_BEGIN					\ +		pmap_enter(					\ +			(pmap),					\ +			(virtual_address),			\ +			(page)->phys_addr,			\ +			(protection) & ~(page)->page_lock,	\ +			(wired)					\ +		 );						\ +		MACRO_END + +#define	VM_PAGE_WAIT(continuation)	vm_page_wait(continuation) + +#define vm_page_lock_queues()	simple_lock(&vm_page_queue_lock) +#define vm_page_unlock_queues()	simple_unlock(&vm_page_queue_lock) + +#define VM_PAGE_QUEUES_REMOVE(mem) vm_page_queues_remove(mem) + +/* + * Copyright (c) 2010-2014 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + * + * + * Physical page management. + */ + +/* + * Address/page conversion and rounding macros (not inline functions to + * be easily usable on both virtual and physical addresses, which may not + * have the same type size). + */ +#define vm_page_atop(addr)      ((addr) >> PAGE_SHIFT) +#define vm_page_ptoa(page)      ((page) << PAGE_SHIFT) +#define vm_page_trunc(addr)     P2ALIGN(addr, PAGE_SIZE) +#define vm_page_round(addr)     P2ROUND(addr, PAGE_SIZE) +#define vm_page_aligned(addr)   P2ALIGNED(addr, PAGE_SIZE) + +/* + * Segment selectors. + * + * Selector-to-segment-list translation table : + * DMA          DMA + * if 32bit PAE + * DIRECTMAP    DMA32 DMA + * DMA32        DMA32 DIRECTMAP DMA + * HIGHMEM      HIGHMEM DMA32 DIRECTMAP DMA + * else + * DMA32        DMA32 DMA + * DIRECTMAP    DIRECTMAP DMA32 DMA + * HIGHMEM      HIGHMEM DIRECTMAP DMA32 DMA + * endif + */ +#define VM_PAGE_SEL_DMA         0 +#if defined(VM_PAGE_DMA32_LIMIT) && VM_PAGE_DMA32_LIMIT > VM_PAGE_DIRECTMAP_LIMIT +#define VM_PAGE_SEL_DIRECTMAP   1 +#define VM_PAGE_SEL_DMA32       2 +#else +#define VM_PAGE_SEL_DMA32       1 +#define VM_PAGE_SEL_DIRECTMAP   2 +#endif +#define VM_PAGE_SEL_HIGHMEM     3 + +/* + * Page usage types. + */ +#define VM_PT_FREE          0   /* Page unused */ +#define VM_PT_RESERVED      1   /* Page reserved at boot time */ +#define VM_PT_TABLE         2   /* Page is part of the page table */ +#define VM_PT_KERNEL        3   /* Type for generic kernel allocations */ + +static inline unsigned short +vm_page_type(const struct vm_page *page) +{ +    return page->type; +} + +void vm_page_set_type(struct vm_page *page, unsigned int order, +                      unsigned short type); + +static inline unsigned int +vm_page_order(size_t size) +{ +    return iorder2(vm_page_atop(vm_page_round(size))); +} + +static inline phys_addr_t +vm_page_to_pa(const struct vm_page *page) +{ +    return page->phys_addr; +} + +/* + * Associate private data with a page. + */ +static inline void +vm_page_set_priv(struct vm_page *page, void *priv) +{ +    page->priv = priv; +} + +static inline void * +vm_page_get_priv(const struct vm_page *page) +{ +    return page->priv; +} + +/* + * Load physical memory into the vm_page module at boot time. + * + * All addresses must be page-aligned. Segments can be loaded in any order. + */ +void vm_page_load(unsigned int seg_index, phys_addr_t start, phys_addr_t end); + +/* + * Load available physical memory into the vm_page module at boot time. + * + * The segment referred to must have been loaded with vm_page_load + * before loading its heap. + */ +void vm_page_load_heap(unsigned int seg_index, phys_addr_t start, +                       phys_addr_t end); + +/* + * Return true if the vm_page module is completely initialized, false + * otherwise, in which case only vm_page_bootalloc() can be used for + * allocations. + */ +int vm_page_ready(void); + +/* + * Early allocation function. + * + * This function is used by the vm_resident module to implement + * pmap_steal_memory. It can be used after physical segments have been loaded + * and before the vm_page module is initialized. 
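+ *
+ * A minimal caller sketch (modelled on the pmap_steal_memory implementation
+ * in vm_resident.c; vaddr is a hypothetical page-aligned kernel virtual
+ * address):
+ *
+ *   paddr = vm_page_bootalloc(PAGE_SIZE);
+ *   pmap_enter(kernel_pmap, vaddr, paddr,
+ *              VM_PROT_READ|VM_PROT_WRITE, FALSE);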
+ */ +phys_addr_t vm_page_bootalloc(size_t size); + +/* + * Set up the vm_page module. + * + * Architecture-specific code must have loaded segments before calling this + * function. Segments must comply with the selector-to-segment-list table, + * e.g. HIGHMEM is loaded if and only if DIRECTMAP, DMA32 and DMA are loaded, + * notwithstanding segment aliasing. + * + * Once this function returns, the vm_page module is ready, and normal + * allocation functions can be used. + */ +void vm_page_setup(void); + +/* + * Make the given page managed by the vm_page module. + * + * If additional memory can be made usable after the VM system is initialized, + * it should be reported through this function. + */ +void vm_page_manage(struct vm_page *page); + +/* + * Return the page descriptor for the given physical address. + */ +struct vm_page * vm_page_lookup_pa(phys_addr_t pa); + +/* + * Allocate a block of 2^order physical pages. + * + * The selector is used to determine the segments from which allocation can + * be attempted. + * + * This function should only be used by the vm_resident module. + */ +struct vm_page * vm_page_alloc_pa(unsigned int order, unsigned int selector, +                                  unsigned short type); + +/* + * Release a block of 2^order physical pages. + * + * This function should only be used by the vm_resident module. + */ +void vm_page_free_pa(struct vm_page *page, unsigned int order); + +/* + * Return the name of the given segment. + */ +const char * vm_page_seg_name(unsigned int seg_index); + +/* + * Display internal information about the module. + */ +void vm_page_info_all(void); + +/* + * Return the maximum physical address for a given segment selector. + */ +phys_addr_t vm_page_seg_end(unsigned int selector); + +/* + * Return the total number of physical pages. + */ +unsigned long vm_page_table_size(void); + +/* + * Return the index of a page in the page table. + */ +unsigned long vm_page_table_index(phys_addr_t pa); + +/* + * Return the total amount of physical memory. + */ +phys_addr_t vm_page_mem_size(void); + +/* + * Return the amount of free (unused) pages. + * + * XXX This currently relies on the kernel being non preemptible and + * uniprocessor. + */ +unsigned long vm_page_mem_free(void); + +/* + * Remove the given page from any page queue it might be in. + */ +void vm_page_queues_remove(struct vm_page *page); + +/* + * Balance physical pages among segments. + * + * This function should be called first by the pageout daemon + * on memory pressure, since it may be unnecessary to perform any + * other operation, let alone shrink caches, if balancing is + * enough to make enough free pages. + * + * Return TRUE if balancing made enough free pages for unprivileged + * allocations to succeed, in which case pending allocations are resumed. + * + * This function acquires vm_page_queue_free_lock, which is held on return. + */ +boolean_t vm_page_balance(void); + +/* + * Evict physical pages. + * + * This function should be called by the pageout daemon after balancing + * the segments and shrinking kernel caches. + * + * Return TRUE if eviction made enough free pages for unprivileged + * allocations to succeed, in which case pending allocations are resumed. + * + * Otherwise, report whether the pageout daemon should wait (some pages + * have been paged out) or not (only clean pages have been released). + * + * This function acquires vm_page_queue_free_lock, which is held on return. 
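+ *
+ * Illustrative caller sketch (mirroring the pageout daemon in vm_pageout.c,
+ * not a complete pageout loop):
+ *
+ *   boolean_t should_wait;
+ *   boolean_t done = vm_page_evict(&should_wait);
+ *
+ * On return vm_page_queue_free_lock is held; when done is FALSE, the caller
+ * uses should_wait to decide whether to throttle before scanning again.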
+ */ +boolean_t vm_page_evict(boolean_t *should_wait); + +/* + * Turn active pages into inactive ones for second-chance LRU + * approximation. + * + * This function should be called by the pageout daemon on memory pressure, + * i.e. right before evicting pages. + * + * XXX This is probably not the best strategy, compared to keeping the + * active/inactive ratio in check at all times, but this means less + * frequent refills. + */ +void vm_page_refill_inactive(void); + +/* + * Print vmstat information + */ +void db_show_vmstat(void); + +#endif	/* _VM_VM_PAGE_H_ */ diff --git a/vm/vm_pageout.c b/vm/vm_pageout.c new file mode 100644 index 0000000..e2f4cf2 --- /dev/null +++ b/vm/vm_pageout.c @@ -0,0 +1,515 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	File:	vm/vm_pageout.c + *	Author:	Avadis Tevanian, Jr., Michael Wayne Young + *	Date:	1985 + * + *	The proverbial page-out daemon. + */ + +#include <device/net_io.h> +#include <mach/mach_types.h> +#include <mach/memory_object.h> +#include <vm/memory_object_default.user.h> +#include <vm/memory_object_user.user.h> +#include <mach/vm_param.h> +#include <mach/vm_statistics.h> +#include <kern/counters.h> +#include <kern/debug.h> +#include <kern/slab.h> +#include <kern/task.h> +#include <kern/thread.h> +#include <kern/printf.h> +#include <vm/memory_object.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> +#include <machine/locore.h> + +#define DEBUG 0 + +/* + * Maximum delay, in milliseconds, between two pageout scans. + */ +#define VM_PAGEOUT_TIMEOUT 50 + +/* + * Event placeholder for pageout requests, synchronized with + * the free page queue lock. + */ +static int vm_pageout_requested; + +/* + * Event placeholder for pageout throttling, synchronized with + * the free page queue lock. + */ +static int vm_pageout_continue; + +/* + *	Routine:	vm_pageout_setup + *	Purpose: + *		Set up a page for pageout. + * + *		Move or copy the page to a new object, as part + *		of which it will be sent to its memory manager + *		in a memory_object_data_return or memory_object_initialize + *		message. + * + *		The "paging_offset" argument specifies the offset + *		of the page within its external memory object. + * + *		The "new_object" and "new_offset" arguments + *		indicate where the page should be moved. 
+ * + *		The "flush" argument specifies whether the page + *		should be flushed from its object.  If not, a + *		copy of the page is moved to the new object. + * + *	In/Out conditions: + *		The page in question must not be on any pageout queues, + *		and must be busy.  The object to which it belongs + *		must be unlocked, and the caller must hold a paging + *		reference to it.  The new_object must not be locked. + * + *		If the page is flushed from its original object, + *		this routine returns a pointer to a place-holder page, + *		inserted at the same offset, to block out-of-order + *		requests for the page.  The place-holder page must + *		be freed after the data_return or initialize message + *		has been sent.  If the page is copied, + *		the holding page is VM_PAGE_NULL. + * + *		The original page is put on a paging queue and marked + *		not busy on exit. + */ +vm_page_t +vm_pageout_setup( +	vm_page_t		m, +	vm_offset_t		paging_offset, +	vm_object_t		new_object, +	vm_offset_t		new_offset, +	boolean_t		flush) +{ +	vm_object_t	old_object = m->object; +	vm_page_t	holding_page = 0; /*'=0'to quiet gcc warnings*/ +	vm_page_t	new_m; + +	assert(m->busy && !m->absent && !m->fictitious); + +	/* +	 *	If we are not flushing the page, allocate a +	 *	page in the object. +	 */ +	if (!flush) { +		for (;;) { +			vm_object_lock(new_object); +			new_m = vm_page_alloc(new_object, new_offset); +			vm_object_unlock(new_object); + +			if (new_m != VM_PAGE_NULL) { +				break; +			} + +			VM_PAGE_WAIT(NULL); +		} +	} + +	if (flush) { +		/* +		 *	Create a place-holder page where the old one was, +		 *	to prevent anyone from attempting to page in this +		 *	page while we`re unlocked. +		 */ +		while ((holding_page = vm_page_grab_fictitious()) +							== VM_PAGE_NULL) +			vm_page_more_fictitious(); + +		vm_object_lock(old_object); +		vm_page_lock_queues(); +		vm_page_remove(m); +		vm_page_unlock_queues(); +		PAGE_WAKEUP_DONE(m); + +		vm_page_lock_queues(); +		vm_page_insert(holding_page, old_object, m->offset); +		vm_page_unlock_queues(); + +		/* +		 *	Record that this page has been written out +		 */ +#if	MACH_PAGEMAP +		vm_external_state_set(old_object->existence_info, +					paging_offset, +					VM_EXTERNAL_STATE_EXISTS); +#endif	/* MACH_PAGEMAP */ + +		vm_object_unlock(old_object); + +		vm_object_lock(new_object); + +		/* +		 *	Move this page into the new object +		 */ + +		vm_page_lock_queues(); +		vm_page_insert(m, new_object, new_offset); +		vm_page_unlock_queues(); + +		m->dirty = TRUE; +		m->precious = FALSE; +		m->page_lock = VM_PROT_NONE; +		m->unlock_request = VM_PROT_NONE; +	} +	else { +		/* +		 *	Copy the data into the new page, +		 *	and mark the new page as clean. +		 */ +		vm_page_copy(m, new_m); + +		vm_object_lock(old_object); +		m->dirty = FALSE; +		pmap_clear_modify(m->phys_addr); + +		/* +		 *	Deactivate old page. +		 */ +		vm_page_lock_queues(); +		vm_page_deactivate(m); +		vm_page_unlock_queues(); + +		PAGE_WAKEUP_DONE(m); + +		/* +		 *	Record that this page has been written out +		 */ + +#if	MACH_PAGEMAP +		vm_external_state_set(old_object->existence_info, +					paging_offset, +					VM_EXTERNAL_STATE_EXISTS); +#endif	/* MACH_PAGEMAP */ + +		vm_object_unlock(old_object); + +		vm_object_lock(new_object); + +		/* +		 *	Use the new page below. 
+		 */ +		m = new_m; +		m->dirty = TRUE; +		assert(!m->precious); +		PAGE_WAKEUP_DONE(m); +	} + +	/* +	 *	Make the old page eligible for replacement again; if a +	 *	user-supplied memory manager fails to release the page, +	 *	it will be paged out again to the default memory manager. +	 * +	 *	Note that pages written to the default memory manager +	 *	must be wired down -- in return, it guarantees to free +	 *	this page, rather than reusing it. +	 */ + +	vm_page_lock_queues(); +	vm_stat.pageouts++; +	if (m->laundry) { + +		/* +		 *	The caller is telling us that it is going to +		 *	immediately double page this page to the default +		 *	pager. +		 */ + +		assert(!old_object->internal); +		m->laundry = FALSE; +	} else if (old_object->internal || +		   memory_manager_default_port(old_object->pager)) { +		m->laundry = TRUE; +		vm_page_laundry_count++; + +		vm_page_wire(m); +	} else { +		m->external_laundry = TRUE; + +		/* +		 *	If vm_page_external_laundry_count is negative, +		 *	the pageout daemon isn't expecting to be +		 *	notified. +		 */ + +		if (vm_page_external_laundry_count >= 0) { +			vm_page_external_laundry_count++; +		} + +		vm_page_activate(m); +	} +	vm_page_unlock_queues(); + +	/* +	 *	Since IPC operations may block, we drop locks now. +	 *	[The placeholder page is busy, and we still have +	 *	paging_in_progress incremented.] +	 */ + +	vm_object_unlock(new_object); + +	/* +	 *	Return the placeholder page to simplify cleanup. +	 */ +	return (flush ? holding_page : VM_PAGE_NULL); +} + +/* + *	Routine:	vm_pageout_page + *	Purpose: + *		Causes the specified page to be written back to + *		the appropriate memory object. + * + *		The "initial" argument specifies whether this + *		data is an initialization only, and should use + *		memory_object_data_initialize instead of + *		memory_object_data_return. + * + *		The "flush" argument specifies whether the page + *		should be flushed from the object.  If not, a + *		copy of the data is sent to the memory object. + * + *	In/out conditions: + *		The page in question must not be on any pageout queues. + *		The object to which it belongs must be locked. + *	Implementation: + *		Move this page to a completely new object, if flushing; + *		copy to a new page in a new object, if not. + */ +void +vm_pageout_page( +	vm_page_t		m, +	boolean_t		initial, +	boolean_t		flush) +{ +	vm_map_copy_t		copy; +	vm_object_t		old_object; +	vm_object_t		new_object; +	vm_page_t		holding_page; +	vm_offset_t		paging_offset; +	kern_return_t		rc; +	boolean_t		precious_clean; + +	assert(m->busy); + +	/* +	 *	Cleaning but not flushing a clean precious page is a +	 *	no-op.  Remember whether page is clean and precious now +	 *	because vm_pageout_setup will mark it dirty and not precious. +	 * +	 * XXX Check if precious_clean && !flush can really happen. +	 */ +	precious_clean = (!m->dirty) && m->precious; +	if (precious_clean && !flush) { +		PAGE_WAKEUP_DONE(m); +		return; +	} + +	/* +	 *	Verify that we really want to clean this page. +	 */ +	if (m->absent || m->error || (!m->dirty && !m->precious)) { +		VM_PAGE_FREE(m); +		return; +	} + +	/* +	 *	Create a paging reference to let us play with the object. +	 */ +	old_object = m->object; +	paging_offset = m->offset + old_object->paging_offset; +	vm_object_paging_begin(old_object); +	vm_object_unlock(old_object); + +	/* +	 *	Allocate a new object into which we can put the page. +	 */ +	new_object = vm_object_allocate(PAGE_SIZE); +	new_object->used_for_pageout = TRUE; + +	/* +	 *	Move the page into the new object. 
+	 */ +	holding_page = vm_pageout_setup(m, +				paging_offset, +				new_object, +				0,		/* new offset */ +				flush);		/* flush */ + +	rc = vm_map_copyin_object(new_object, 0, PAGE_SIZE, ©); +	assert(rc == KERN_SUCCESS); + +	if (initial) { +		rc = memory_object_data_initialize( +			 old_object->pager, +			 old_object->pager_request, +			 paging_offset, (pointer_t) copy, PAGE_SIZE); +	} +	else { +		rc = memory_object_data_return( +			 old_object->pager, +			 old_object->pager_request, +			 paging_offset, (pointer_t) copy, PAGE_SIZE, +			 !precious_clean, !flush); +	} + +	if (rc != KERN_SUCCESS) +		vm_map_copy_discard(copy); + +	/* +	 *	Clean up. +	 */ +	vm_object_lock(old_object); +	if (holding_page != VM_PAGE_NULL) +	    VM_PAGE_FREE(holding_page); +	vm_object_paging_end(old_object); +} + +/* + *	vm_pageout_scan does the dirty work for the pageout daemon. + * + *	Return TRUE if the pageout daemon is done for now, FALSE otherwise, + *	in which case should_wait indicates whether the pageout daemon + *	should wait to allow pagers to keep up. + * + *	It returns with vm_page_queue_free_lock held. + */ + +static boolean_t vm_pageout_scan(boolean_t *should_wait) +{ +	boolean_t done; + +	/* +	 *	Try balancing pages among segments first, since this +	 *	may be enough to resume unprivileged allocations. +	 */ + +	/* This function returns with vm_page_queue_free_lock held */ +	done = vm_page_balance(); + +	if (done) { +		return TRUE; +	} + +	simple_unlock(&vm_page_queue_free_lock); + +	/* +	 *	Balancing is not enough. Shrink caches and scan pages +	 *	for eviction. +	 */ + +	stack_collect(); +	net_kmsg_collect(); +	consider_task_collect(); +	if (0)	/* XXX: pcb_collect doesn't do anything yet, so it is +		   pointless to call consider_thread_collect.  */ +	consider_thread_collect(); + +	/* +	 *	slab_collect should be last, because the other operations +	 *	might return memory to caches. +	 */ +	slab_collect(); + +	vm_page_refill_inactive(); + +	/* This function returns with vm_page_queue_free_lock held */ +	return vm_page_evict(should_wait); +} + +void vm_pageout(void) +{ +	boolean_t done, should_wait; + +	current_thread()->vm_privilege = 1; +	stack_privilege(current_thread()); +	thread_set_own_priority(0); + +	for (;;) { +		done = vm_pageout_scan(&should_wait); +		/* we hold vm_page_queue_free_lock now */ + +		if (done) { +			thread_sleep(&vm_pageout_requested, +				     simple_lock_addr(vm_page_queue_free_lock), +				     FALSE); +		} else if (should_wait) { +			assert_wait(&vm_pageout_continue, FALSE); +			thread_set_timeout(VM_PAGEOUT_TIMEOUT * hz / 1000); +			simple_unlock(&vm_page_queue_free_lock); +			thread_block(NULL); + +#if DEBUG +			if (current_thread()->wait_result != THREAD_AWAKENED) { +				printf("vm_pageout: timeout," +				       " vm_page_laundry_count:%d" +				       " vm_page_external_laundry_count:%d\n", +				       vm_page_laundry_count, +				       vm_page_external_laundry_count); +			} +#endif +		} else { +			simple_unlock(&vm_page_queue_free_lock); +		} +	} +} + +/* + *	Start pageout + * + *	The free page queue lock must be held before calling this function. + */ +void vm_pageout_start(void) +{ +	if (!current_thread()) +		return; + +	thread_wakeup_one(&vm_pageout_requested); +} + +/* + *	Resume pageout + * + *	The free page queue lock must be held before calling this function. 
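+ *
+ *	Illustrative caller sketch (the pattern vm_page_release in
+ *	vm_resident.c follows when the last laundered page is returned;
+ *	the free page queue lock is already held at that point):
+ *
+ *		vm_page_laundry_count--;
+ *		if (vm_page_laundry_count == 0)
+ *			vm_pageout_resume();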
+ */ +void vm_pageout_resume(void) +{ +	thread_wakeup_one(&vm_pageout_continue); +} diff --git a/vm/vm_pageout.h b/vm/vm_pageout.h new file mode 100644 index 0000000..6ddd821 --- /dev/null +++ b/vm/vm_pageout.h @@ -0,0 +1,53 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	File:	vm/vm_pageout.h + *	Author:	Avadis Tevanian, Jr. + *	Date:	1986 + * + *	Declarations for the pageout daemon interface. + */ + +#ifndef	_VM_VM_PAGEOUT_H_ +#define _VM_VM_PAGEOUT_H_ + +#include <vm/vm_page.h> + +/* + *	Exported routines. + */ + +extern vm_page_t vm_pageout_setup(vm_page_t, vm_offset_t, vm_object_t, +				  vm_offset_t, boolean_t); +extern void vm_pageout_page(vm_page_t, boolean_t, boolean_t); + +extern void vm_pageout(void) __attribute__((noreturn)); + +extern void vm_pageout_start(void); + +extern void vm_pageout_resume(void); + +#endif	/* _VM_VM_PAGEOUT_H_ */ diff --git a/vm/vm_print.h b/vm/vm_print.h new file mode 100644 index 0000000..8a36d75 --- /dev/null +++ b/vm/vm_print.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2013 Free Software Foundation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef VM_PRINT_H +#define	VM_PRINT_H + +#include <vm/vm_map.h> +#include <machine/db_machdep.h> + +/* Debugging: print a map */ +extern void vm_map_print(db_expr_t addr, boolean_t have_addr, +                         db_expr_t count, const char *modif); + +/* Pretty-print a copy object for ddb. 
*/ +extern void vm_map_copy_print(const vm_map_copy_t); + +#include <vm/vm_object.h> + +extern void vm_object_print(vm_object_t); + +#include <vm/vm_page.h> + +extern void vm_page_print(const vm_page_t); + +#endif	/* VM_PRINT_H */ + diff --git a/vm/vm_resident.c b/vm/vm_resident.c new file mode 100644 index 0000000..3f0cc90 --- /dev/null +++ b/vm/vm_resident.c @@ -0,0 +1,1116 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	File:	vm/vm_resident.c + *	Author:	Avadis Tevanian, Jr., Michael Wayne Young + * + *	Resident memory management module. + */ + +#include <kern/printf.h> +#include <string.h> + +#include <mach/vm_prot.h> +#include <kern/counters.h> +#include <kern/debug.h> +#include <kern/list.h> +#include <kern/sched_prim.h> +#include <kern/task.h> +#include <kern/thread.h> +#include <mach/vm_statistics.h> +#include <machine/vm_param.h> +#include <kern/xpr.h> +#include <kern/slab.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> +#include <vm/vm_kern.h> +#include <vm/vm_resident.h> + +#if	MACH_VM_DEBUG +#include <mach/kern_return.h> +#include <mach_debug/hash_info.h> +#include <vm/vm_user.h> +#endif + +#if	MACH_KDB +#include <ddb/db_output.h> +#include <vm/vm_print.h> +#endif	/* MACH_KDB */ + + +/* + *	Associated with each page of user-allocatable memory is a + *	page structure. + */ + +/* + *	These variables record the values returned by vm_page_bootstrap, + *	for debugging purposes.  The implementation of pmap_steal_memory + *	here also uses them internally. + */ + +vm_offset_t virtual_space_start; +vm_offset_t virtual_space_end; + +/* + *	The vm_page_lookup() routine, which provides for fast + *	(virtual memory object, offset) to page lookup, employs + *	the following hash table.  The vm_page_{insert,remove} + *	routines install and remove associations in the table. + *	[This table is often called the virtual-to-physical, + *	or VP, table.] + */ +typedef struct { +	decl_simple_lock_data(,lock) +	vm_page_t pages; +} vm_page_bucket_t; + +vm_page_bucket_t *vm_page_buckets;		/* Array of buckets */ +unsigned long	vm_page_bucket_count = 0;	/* How big is array? 
*/ +unsigned long	vm_page_hash_mask;		/* Mask for hash function */ + +static struct list	vm_page_queue_fictitious; +def_simple_lock_data(,vm_page_queue_free_lock) +int		vm_page_fictitious_count; +int		vm_object_external_count; +int		vm_object_external_pages; + +/* + *	Occasionally, the virtual memory system uses + *	resident page structures that do not refer to + *	real pages, for example to leave a page with + *	important state information in the VP table. + * + *	These page structures are allocated the way + *	most other kernel structures are. + */ +struct kmem_cache	vm_page_cache; + +/* + *	Fictitious pages don't have a physical address, + *	but we must initialize phys_addr to something. + *	For debugging, this should be a strange value + *	that the pmap module can recognize in assertions. + */ +phys_addr_t vm_page_fictitious_addr = (phys_addr_t) -1; + +/* + *	Resident page structures are also chained on + *	queues that are used by the page replacement + *	system (pageout daemon).  These queues are + *	defined here, but are shared by the pageout + *	module. + */ +def_simple_lock_data(,vm_page_queue_lock) +int	vm_page_active_count; +int	vm_page_inactive_count; +int	vm_page_wire_count; + +/* + *	Several page replacement parameters are also + *	shared with this module, so that page allocation + *	(done here in vm_page_alloc) can trigger the + *	pageout daemon. + */ +int	vm_page_laundry_count = 0; +int	vm_page_external_laundry_count = 0; + + +/* + *	The VM system has a couple of heuristics for deciding + *	that pages are "uninteresting" and should be placed + *	on the inactive queue as likely candidates for replacement. + *	These variables let the heuristics be controlled at run-time + *	to make experimentation easier. + */ + +boolean_t vm_page_deactivate_behind = TRUE; +boolean_t vm_page_deactivate_hint = TRUE; + +/* + *	vm_page_bootstrap: + * + *	Initializes the resident memory module. + * + *	Allocates memory for the page cells, and + *	for the object/offset-to-page hash table headers. + *	Each page cell is initialized and placed on the free list. + *	Returns the range of available kernel virtual memory. + */ + +void vm_page_bootstrap( +	vm_offset_t *startp, +	vm_offset_t *endp) +{ +	int i; + +	/* +	 *	Initialize the page queues. +	 */ + +	simple_lock_init(&vm_page_queue_free_lock); +	simple_lock_init(&vm_page_queue_lock); + +	list_init(&vm_page_queue_fictitious); + +	/* +	 *	Allocate (and initialize) the virtual-to-physical +	 *	table hash buckets. +	 * +	 *	The number of buckets should be a power of two to +	 *	get a good hash function.  The following computation +	 *	chooses the first power of two that is greater +	 *	than the number of physical pages in the system. 
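+	 *
+	 *	(For example, on a machine with 49152 physical pages the
+	 *	loop below settles on 65536 buckets, giving a hash mask
+	 *	of 0xffff.)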
+	 */ + +	if (vm_page_bucket_count == 0) { +		unsigned long npages = vm_page_table_size(); + +		vm_page_bucket_count = 1; +		while (vm_page_bucket_count < npages) +			vm_page_bucket_count <<= 1; +	} + +	vm_page_hash_mask = vm_page_bucket_count - 1; + +	if (vm_page_hash_mask & vm_page_bucket_count) +		printf("vm_page_bootstrap: WARNING -- strange page hash\n"); + +	vm_page_buckets = (vm_page_bucket_t *) +		pmap_steal_memory(vm_page_bucket_count * +				  sizeof(vm_page_bucket_t)); + +	for (i = 0; i < vm_page_bucket_count; i++) { +		vm_page_bucket_t *bucket = &vm_page_buckets[i]; + +		bucket->pages = VM_PAGE_NULL; +		simple_lock_init(&bucket->lock); +	} + +	vm_page_setup(); + +	virtual_space_start = round_page(virtual_space_start); +	virtual_space_end = trunc_page(virtual_space_end); + +	*startp = virtual_space_start; +	*endp = virtual_space_end; +} + +#ifndef	MACHINE_PAGES +/* + *	We implement pmap_steal_memory with the help + *	of two simpler functions, pmap_virtual_space and vm_page_bootalloc. + */ + +vm_offset_t pmap_steal_memory( +	vm_size_t size) +{ +	vm_offset_t addr, vaddr; +	phys_addr_t paddr; + +	size = round_page(size); + +	/* +	 *	If this is the first call to pmap_steal_memory, +	 *	we have to initialize ourself. +	 */ + +	if (virtual_space_start == virtual_space_end) { +		pmap_virtual_space(&virtual_space_start, &virtual_space_end); + +		/* +		 *	The initial values must be aligned properly, and +		 *	we don't trust the pmap module to do it right. +		 */ + +		virtual_space_start = round_page(virtual_space_start); +		virtual_space_end = trunc_page(virtual_space_end); +	} + +	/* +	 *	Allocate virtual memory for this request. +	 */ + +	addr = virtual_space_start; +	virtual_space_start += size; + +	/* +	 *	Allocate and map physical pages to back new virtual pages. +	 */ + +	for (vaddr = round_page(addr); +	     vaddr < addr + size; +	     vaddr += PAGE_SIZE) { +		paddr = vm_page_bootalloc(PAGE_SIZE); + +		/* +		 *	XXX Logically, these mappings should be wired, +		 *	but some pmap modules barf if they are. +		 */ + +		pmap_enter(kernel_pmap, vaddr, paddr, +			   VM_PROT_READ|VM_PROT_WRITE, FALSE); +	} + +	return addr; +} +#endif	/* MACHINE_PAGES */ + +/* + *	Routine:	vm_page_module_init + *	Purpose: + *		Second initialization pass, to be done after + *		the basic VM system is ready. + */ +void		vm_page_module_init(void) +{ +	kmem_cache_init(&vm_page_cache, "vm_page", sizeof(struct vm_page), 0, +			NULL, 0); +} + +/* + *	vm_page_hash: + * + *	Distributes the object/offset key pair among hash buckets. + * + *	NOTE:	To get a good hash function, the bucket count should + *		be a power of two. + */ +#define vm_page_hash(object, offset) \ +	(((unsigned int)(vm_offset_t)object + (unsigned int)atop(offset)) \ +		& vm_page_hash_mask) + +/* + *	vm_page_insert:		[ internal use only ] + * + *	Inserts the given mem entry into the object/object-page + *	table and object list. + * + *	The object and page must be locked. + *	The free page queue must not be locked. 
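+ *
+ *	Typical caller sketch (the pattern used by vm_page_alloc and
+ *	vm_page_rename in this file; the page queues are locked because
+ *	insertion may deactivate the previously allocated page):
+ *
+ *		vm_page_lock_queues();
+ *		vm_page_insert(mem, object, offset);
+ *		vm_page_unlock_queues();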
+ */ + +void vm_page_insert( +	vm_page_t	mem, +	vm_object_t	object, +	vm_offset_t	offset) +{ +	vm_page_bucket_t *bucket; + +	VM_PAGE_CHECK(mem); + +	assert(!mem->active && !mem->inactive); +	assert(!mem->external); + +	if (!object->internal) { +		mem->external = TRUE; +		vm_object_external_pages++; +	} + +	if (mem->tabled) +		panic("vm_page_insert"); + +	/* +	 *	Record the object/offset pair in this page +	 */ + +	mem->object = object; +	mem->offset = offset; + +	/* +	 *	Insert it into the object_object/offset hash table +	 */ + +	bucket = &vm_page_buckets[vm_page_hash(object, offset)]; +	simple_lock(&bucket->lock); +	mem->next = bucket->pages; +	bucket->pages = mem; +	simple_unlock(&bucket->lock); + +	/* +	 *	Now link into the object's list of backed pages. +	 */ + +	queue_enter(&object->memq, mem, vm_page_t, listq); +	mem->tabled = TRUE; + +	/* +	 *	Show that the object has one more resident page. +	 */ + +	object->resident_page_count++; +	assert(object->resident_page_count != 0); + +	/* +	 *	Detect sequential access and inactivate previous page. +	 *	We ignore busy pages. +	 */ + +	if (vm_page_deactivate_behind && +	    (offset == object->last_alloc + PAGE_SIZE)) { +		vm_page_t	last_mem; + +		last_mem = vm_page_lookup(object, object->last_alloc); +		if ((last_mem != VM_PAGE_NULL) && !last_mem->busy) +			vm_page_deactivate(last_mem); +	} +	object->last_alloc = offset; +} + +/* + *	vm_page_replace: + * + *	Exactly like vm_page_insert, except that we first + *	remove any existing page at the given offset in object + *	and we don't do deactivate-behind. + * + *	The object and page must be locked. + *	The free page queue must not be locked. + */ + +void vm_page_replace( +	vm_page_t	mem, +	vm_object_t	object, +	vm_offset_t	offset) +{ +	vm_page_bucket_t *bucket; + +	VM_PAGE_CHECK(mem); + +	assert(!mem->active && !mem->inactive); +	assert(!mem->external); + +	if (!object->internal) { +		mem->external = TRUE; +		vm_object_external_pages++; +	} + +	if (mem->tabled) +		panic("vm_page_replace"); + +	/* +	 *	Record the object/offset pair in this page +	 */ + +	mem->object = object; +	mem->offset = offset; + +	/* +	 *	Insert it into the object_object/offset hash table, +	 *	replacing any page that might have been there. +	 */ + +	bucket = &vm_page_buckets[vm_page_hash(object, offset)]; +	simple_lock(&bucket->lock); +	if (bucket->pages) { +		vm_page_t *mp = &bucket->pages; +		vm_page_t m = *mp; +		do { +			if (m->object == object && m->offset == offset) { +				/* +				 * Remove page from bucket and from object, +				 * and return it to the free list. +				 */ +				*mp = m->next; +				queue_remove(&object->memq, m, vm_page_t, +					     listq); +				m->tabled = FALSE; +				object->resident_page_count--; +				VM_PAGE_QUEUES_REMOVE(m); + +				if (m->external) { +					m->external = FALSE; +					vm_object_external_pages--; +				} + +				/* +				 * Return page to the free list. +				 * Note the page is not tabled now, so this +				 * won't self-deadlock on the bucket lock. +				 */ + +				vm_page_free(m); +				break; +			} +			mp = &m->next; +		} while ((m = *mp) != 0); +		mem->next = bucket->pages; +	} else { +		mem->next = VM_PAGE_NULL; +	} +	bucket->pages = mem; +	simple_unlock(&bucket->lock); + +	/* +	 *	Now link into the object's list of backed pages. +	 */ + +	queue_enter(&object->memq, mem, vm_page_t, listq); +	mem->tabled = TRUE; + +	/* +	 *	And show that the object has one more resident +	 *	page. 
+	 */ + +	object->resident_page_count++; +	assert(object->resident_page_count != 0); +} + +/* + *	vm_page_remove:		[ internal use only ] + * + *	Removes the given mem entry from the object/offset-page + *	table, the object page list, and the page queues. + * + *	The object and page must be locked. + *	The free page queue must not be locked. + */ + +void vm_page_remove( +	vm_page_t		mem) +{ +	vm_page_bucket_t	*bucket; +	vm_page_t		this; + +	assert(mem->tabled); +	VM_PAGE_CHECK(mem); + +	/* +	 *	Remove from the object_object/offset hash table +	 */ + +	bucket = &vm_page_buckets[vm_page_hash(mem->object, mem->offset)]; +	simple_lock(&bucket->lock); +	if ((this = bucket->pages) == mem) { +		/* optimize for common case */ + +		bucket->pages = mem->next; +	} else { +		vm_page_t	*prev; + +		for (prev = &this->next; +		     (this = *prev) != mem; +		     prev = &this->next) +			continue; +		*prev = this->next; +	} +	simple_unlock(&bucket->lock); + +	/* +	 *	Now remove from the object's list of backed pages. +	 */ + +	queue_remove(&mem->object->memq, mem, vm_page_t, listq); + +	/* +	 *	And show that the object has one fewer resident +	 *	page. +	 */ + +	mem->object->resident_page_count--; + +	mem->tabled = FALSE; + +	VM_PAGE_QUEUES_REMOVE(mem); + +	if (mem->external) { +		mem->external = FALSE; +		vm_object_external_pages--; +	} +} + +/* + *	vm_page_lookup: + * + *	Returns the page associated with the object/offset + *	pair specified; if none is found, VM_PAGE_NULL is returned. + * + *	The object must be locked.  No side effects. + */ + +vm_page_t vm_page_lookup( +	vm_object_t		object, +	vm_offset_t		offset) +{ +	vm_page_t		mem; +	vm_page_bucket_t 	*bucket; + +	/* +	 *	Search the hash table for this object/offset pair +	 */ + +	bucket = &vm_page_buckets[vm_page_hash(object, offset)]; + +	simple_lock(&bucket->lock); +	for (mem = bucket->pages; mem != VM_PAGE_NULL; mem = mem->next) { +		VM_PAGE_CHECK(mem); +		if ((mem->object == object) && (mem->offset == offset)) +			break; +	} +	simple_unlock(&bucket->lock); +	return mem; +} + +/* + *	vm_page_rename: + * + *	Move the given memory entry from its + *	current object to the specified target object/offset. + * + *	The object must be locked. + */ +void vm_page_rename( +	vm_page_t	mem, +	vm_object_t	new_object, +	vm_offset_t	new_offset) +{ +	/* +	 *	Changes to mem->object require the page lock because +	 *	the pageout daemon uses that lock to get the object. +	 */ + +	vm_page_lock_queues(); +    	vm_page_remove(mem); +	vm_page_insert(mem, new_object, new_offset); +	vm_page_unlock_queues(); +} + +static void vm_page_init_template(vm_page_t m) +{ +	m->object = VM_OBJECT_NULL;	/* reset later */ +	m->offset = 0;			/* reset later */ +	m->wire_count = 0; + +	m->inactive = FALSE; +	m->active = FALSE; +	m->laundry = FALSE; +	m->external_laundry = FALSE; +	m->free = FALSE; +	m->external = FALSE; + +	m->busy = TRUE; +	m->wanted = FALSE; +	m->tabled = FALSE; +	m->fictitious = FALSE; +	m->private = FALSE; +	m->absent = FALSE; +	m->error = FALSE; +	m->dirty = FALSE; +	m->precious = FALSE; +	m->reference = FALSE; + +	m->page_lock = VM_PROT_NONE; +	m->unlock_request = VM_PROT_NONE; +} + +/* + *	vm_page_init: + * + *	Initialize the fields in a new page. + *	This takes a structure with random values and initializes it + *	so that it can be given to vm_page_release or vm_page_insert. + */ +void vm_page_init( +	vm_page_t	mem) +{ +	vm_page_init_template(mem); +} + +/* + *	vm_page_grab_fictitious: + * + *	Remove a fictitious page from the free list. 
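+ *	(When the list runs dry, callers such as vm_pageout_setup replenish
+ *	it and retry:
+ *
+ *		while ((m = vm_page_grab_fictitious()) == VM_PAGE_NULL)
+ *			vm_page_more_fictitious();
+ *
+ *	where m is the caller's vm_page_t.)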
+ *	Returns VM_PAGE_NULL if there are no free pages. + */ + +vm_page_t vm_page_grab_fictitious(void) +{ +	vm_page_t m; + +	simple_lock(&vm_page_queue_free_lock); +	if (list_empty(&vm_page_queue_fictitious)) { +		m = VM_PAGE_NULL; +	} else { +		m = list_first_entry(&vm_page_queue_fictitious, +				     struct vm_page, node); +		assert(m->fictitious); +		list_remove(&m->node); +		m->free = FALSE; +		vm_page_fictitious_count--; +	} +	simple_unlock(&vm_page_queue_free_lock); + +	return m; +} + +/* + *	vm_page_release_fictitious: + * + *	Release a fictitious page to the free list. + */ + +static void vm_page_release_fictitious( +	vm_page_t m) +{ +	simple_lock(&vm_page_queue_free_lock); +	if (m->free) +		panic("vm_page_release_fictitious"); +	m->free = TRUE; +	list_insert_head(&vm_page_queue_fictitious, &m->node); +	vm_page_fictitious_count++; +	simple_unlock(&vm_page_queue_free_lock); +} + +/* + *	vm_page_more_fictitious: + * + *	Add more fictitious pages to the free list. + *	Allowed to block. + */ + +int vm_page_fictitious_quantum = 5; + +void vm_page_more_fictitious(void) +{ +	vm_page_t m; +	int i; + +	for (i = 0; i < vm_page_fictitious_quantum; i++) { +		m = (vm_page_t) kmem_cache_alloc(&vm_page_cache); +		if (m == VM_PAGE_NULL) +			panic("vm_page_more_fictitious"); + +		vm_page_init(m); +		m->phys_addr = vm_page_fictitious_addr; +		m->fictitious = TRUE; +		vm_page_release_fictitious(m); +	} +} + +/* + *	vm_page_convert: + * + *	Attempt to convert a fictitious page into a real page. + * + *	The object referenced by *MP must be locked. + */ + +boolean_t vm_page_convert(struct vm_page **mp) +{ +	struct vm_page *real_m, *fict_m; +	vm_object_t object; +	vm_offset_t offset; + +	fict_m = *mp; + +	assert(fict_m->fictitious); +	assert(fict_m->phys_addr == vm_page_fictitious_addr); +	assert(!fict_m->active); +	assert(!fict_m->inactive); + +	real_m = vm_page_grab(VM_PAGE_HIGHMEM); +	if (real_m == VM_PAGE_NULL) +		return FALSE; + +	object = fict_m->object; +	offset = fict_m->offset; +	vm_page_remove(fict_m); + +	memcpy(&real_m->vm_page_header, +	       &fict_m->vm_page_header, +	       VM_PAGE_BODY_SIZE); +	real_m->fictitious = FALSE; + +	vm_page_insert(real_m, object, offset); + +	assert(real_m->phys_addr != vm_page_fictitious_addr); +	assert(fict_m->fictitious); +	assert(fict_m->phys_addr == vm_page_fictitious_addr); + +	vm_page_release_fictitious(fict_m); +	*mp = real_m; +	return TRUE; +} + +/* + *	vm_page_grab: + * + *	Remove a page from the free list. + *	Returns VM_PAGE_NULL if the free list is too small. + * + *	FLAGS specify which constraint should be enforced for the allocated + *	addresses. + */ + +vm_page_t vm_page_grab(unsigned flags) +{ +	unsigned selector; +	vm_page_t	mem; + +	if (flags & VM_PAGE_HIGHMEM) +		selector = VM_PAGE_SEL_HIGHMEM; +#if defined(VM_PAGE_DMA32_LIMIT) && VM_PAGE_DMA32_LIMIT > VM_PAGE_DIRECTMAP_LIMIT +       else if (flags & VM_PAGE_DMA32) +               selector = VM_PAGE_SEL_DMA32; +#endif +	else if (flags & VM_PAGE_DIRECTMAP) +		selector = VM_PAGE_SEL_DIRECTMAP; +#if defined(VM_PAGE_DMA32_LIMIT) && VM_PAGE_DMA32_LIMIT <= VM_PAGE_DIRECTMAP_LIMIT +	else if (flags & VM_PAGE_DMA32) +		selector = VM_PAGE_SEL_DMA32; +#endif +	else +		selector = VM_PAGE_SEL_DMA; + +	simple_lock(&vm_page_queue_free_lock); + +	/* +	 * XXX Mach has many modules that merely assume memory is +	 * directly mapped in kernel space. 
Instead of updating all +	 * users, we assume those which need specific physical memory +	 * properties will wire down their pages, either because +	 * they can't be paged (not part of an object), or with +	 * explicit VM calls. The strategy is then to let memory +	 * pressure balance the physical segments with pageable pages. +	 */ +	mem = vm_page_alloc_pa(0, selector, VM_PT_KERNEL); + +	if (mem == NULL) { +		simple_unlock(&vm_page_queue_free_lock); +		return NULL; +	} + +	mem->free = FALSE; +	simple_unlock(&vm_page_queue_free_lock); + +	return mem; +} + +phys_addr_t vm_page_grab_phys_addr(void) +{ +	vm_page_t p = vm_page_grab(VM_PAGE_DIRECTMAP); +	if (p == VM_PAGE_NULL) +		return -1; +	else +		return p->phys_addr; +} + +/* + *	vm_page_release: + * + *	Return a page to the free list. + */ + +void vm_page_release( +	vm_page_t	mem, +	boolean_t 	laundry, +	boolean_t 	external_laundry) +{ +	simple_lock(&vm_page_queue_free_lock); +	if (mem->free) +		panic("vm_page_release"); +	mem->free = TRUE; +	vm_page_free_pa(mem, 0); +	if (laundry) { +		vm_page_laundry_count--; + +		if (vm_page_laundry_count == 0) { +			vm_pageout_resume(); +		} +	} +	if (external_laundry) { + +		/* +		 *	If vm_page_external_laundry_count is negative, +		 *	the pageout daemon isn't expecting to be +		 *	notified. +		 */ + +		if (vm_page_external_laundry_count > 0) { +			vm_page_external_laundry_count--; + +			if (vm_page_external_laundry_count == 0) { +				vm_pageout_resume(); +			} +		} +	} + +	simple_unlock(&vm_page_queue_free_lock); +} + +/* + *	vm_page_grab_contig: + * + *	Remove a block of contiguous pages from the free list. + *	Returns VM_PAGE_NULL if the request fails. + */ + +vm_page_t vm_page_grab_contig( +	vm_size_t size, +	unsigned int selector) +{ +	unsigned int i, order, nr_pages; +	vm_page_t mem; + +	order = vm_page_order(size); +	nr_pages = 1 << order; + +	simple_lock(&vm_page_queue_free_lock); + +	/* TODO Allow caller to pass type */ +	mem = vm_page_alloc_pa(order, selector, VM_PT_KERNEL); + +	if (mem == NULL) { +		simple_unlock(&vm_page_queue_free_lock); +		return NULL; +	} + +	for (i = 0; i < nr_pages; i++) { +		mem[i].free = FALSE; +	} + +	simple_unlock(&vm_page_queue_free_lock); + +	return mem; +} + +/* + *	vm_page_free_contig: + * + *	Return a block of contiguous pages to the free list. + */ + +void vm_page_free_contig(vm_page_t mem, vm_size_t size) +{ +	unsigned int i, order, nr_pages; + +	order = vm_page_order(size); +	nr_pages = 1 << order; + +	simple_lock(&vm_page_queue_free_lock); + +	for (i = 0; i < nr_pages; i++) { +		if (mem[i].free) +			panic("vm_page_free_contig"); + +		mem[i].free = TRUE; +	} + +	vm_page_free_pa(mem, order); + +	simple_unlock(&vm_page_queue_free_lock); +} + +/* + *	vm_page_alloc: + * + *	Allocate and return a memory cell associated + *	with this VM object/offset pair. + * + *	Object must be locked. + */ + +vm_page_t vm_page_alloc( +	vm_object_t	object, +	vm_offset_t	offset) +{ +	vm_page_t	mem; + +	mem = vm_page_grab(VM_PAGE_HIGHMEM); +	if (mem == VM_PAGE_NULL) +		return VM_PAGE_NULL; + +	vm_page_lock_queues(); +	vm_page_insert(mem, object, offset); +	vm_page_unlock_queues(); + +	return mem; +} + +/* + *	vm_page_free: + * + *	Returns the given page to the free list, + *	disassociating it with any VM object. + * + *	Object and page queues must be locked prior to entry. 
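+ *
+ *	Callers that do not already hold the page queues lock normally go
+ *	through the VM_PAGE_FREE() macro from vm_page.h, which simply does:
+ *
+ *		vm_page_lock_queues();
+ *		vm_page_free(p);
+ *		vm_page_unlock_queues();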
+ */ +void vm_page_free( +	vm_page_t	mem) +{ +	if (mem->free) +		panic("vm_page_free"); + +	if (mem->tabled) { +		vm_page_remove(mem); +	} + +	assert(!mem->active && !mem->inactive); + +	if (mem->wire_count != 0) { +		if (!mem->private && !mem->fictitious) +			vm_page_wire_count--; +		mem->wire_count = 0; +	} + +	PAGE_WAKEUP_DONE(mem); + +	if (mem->absent) +		vm_object_absent_release(mem->object); + +	/* +	 *	XXX The calls to vm_page_init here are +	 *	really overkill. +	 */ + +	if (mem->private || mem->fictitious) { +		vm_page_init(mem); +		mem->phys_addr = vm_page_fictitious_addr; +		mem->fictitious = TRUE; +		vm_page_release_fictitious(mem); +	} else { +		boolean_t laundry = mem->laundry; +		boolean_t external_laundry = mem->external_laundry; +		vm_page_init(mem); +		vm_page_release(mem, laundry, external_laundry); +	} +} + +/* + *	vm_page_zero_fill: + * + *	Zero-fill the specified page. + */ +void vm_page_zero_fill( +	vm_page_t	m) +{ +	VM_PAGE_CHECK(m); + +	pmap_zero_page(m->phys_addr); +} + +/* + *	vm_page_copy: + * + *	Copy one page to another + */ + +void vm_page_copy( +	vm_page_t	src_m, +	vm_page_t	dest_m) +{ +	VM_PAGE_CHECK(src_m); +	VM_PAGE_CHECK(dest_m); + +	pmap_copy_page(src_m->phys_addr, dest_m->phys_addr); +} + +#if	MACH_VM_DEBUG +/* + *	Routine:	vm_page_info + *	Purpose: + *		Return information about the global VP table. + *		Fills the buffer with as much information as possible + *		and returns the desired size of the buffer. + *	Conditions: + *		Nothing locked.  The caller should provide + *		possibly-pageable memory. + */ + +unsigned int +vm_page_info( +	hash_info_bucket_t *info, +	unsigned int	count) +{ +	int i; + +	if (vm_page_bucket_count < count) +		count = vm_page_bucket_count; + +	for (i = 0; i < count; i++) { +		vm_page_bucket_t *bucket = &vm_page_buckets[i]; +		unsigned int bucket_count = 0; +		vm_page_t m; + +		simple_lock(&bucket->lock); +		for (m = bucket->pages; m != VM_PAGE_NULL; m = m->next) +			bucket_count++; +		simple_unlock(&bucket->lock); + +		/* don't touch pageable memory while holding locks */ +		info[i].hib_count = bucket_count; +	} + +	return vm_page_bucket_count; +} +#endif	/* MACH_VM_DEBUG */ + + +#if	MACH_KDB +#define	printf	kdbprintf + +/* + *	Routine:	vm_page_print [exported] + */ +void		vm_page_print(const vm_page_t	p) +{ +	iprintf("Page 0x%X: object 0x%X,", (vm_offset_t) p, (vm_offset_t) p->object); +	 printf(" offset 0x%X", p->offset); +	 printf("wire_count %d,", p->wire_count); +	 printf(" %s", +		(p->active ? "active" : (p->inactive ? "inactive" : "loose"))); +	 printf("%s", +		(p->free ? " free" : "")); +	 printf("%s ", +		(p->laundry ? " laundry" : "")); +	 printf("%s", +		(p->dirty ? "dirty" : "clean")); +	 printf("%s", +	 	(p->busy ? " busy" : "")); +	 printf("%s", +	 	(p->absent ? " absent" : "")); +	 printf("%s", +	 	(p->error ? " error" : "")); +	 printf("%s", +		(p->fictitious ? " fictitious" : "")); +	 printf("%s", +		(p->private ? " private" : "")); +	 printf("%s", +		(p->wanted ? " wanted" : "")); +	 printf("%s,", +		(p->tabled ? "" : "not_tabled")); +	 printf("phys_addr = 0x%X, lock = 0x%X, unlock_request = 0x%X\n", +		p->phys_addr, +		(vm_offset_t) p->page_lock, +		(vm_offset_t) p->unlock_request); +} +#endif	/* MACH_KDB */ diff --git a/vm/vm_resident.h b/vm/vm_resident.h new file mode 100644 index 0000000..e8bf681 --- /dev/null +++ b/vm/vm_resident.h @@ -0,0 +1,45 @@ +/* + * Resident memory management module functions. + * Copyright (C) 2008 Free Software Foundation, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + *  Author: Barry deFreese. + */ +/* + *     Resident memory management module functions. + * + */ + +#ifndef _VM_RESIDENT_H_ +#define _VM_RESIDENT_H_ + +#include <mach/std_types.h> + +/* + *  vm_page_replace: + * + *  Exactly like vm_page_insert, except that we first + *  remove any existing page at the given offset in object + *  and we don't do deactivate-behind. + * + *  The object and page must be locked. + */ +extern void vm_page_replace ( +    vm_page_t mem, +    vm_object_t object, +    vm_offset_t offset); + +#endif /* _VM_RESIDENT_H_ */ diff --git a/vm/vm_types.h b/vm/vm_types.h new file mode 100644 index 0000000..f64ebee --- /dev/null +++ b/vm/vm_types.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2007 Free Software Foundation, Inc. + *  + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any later + * version. + *  + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License + * for more details. + *  + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + *  + * Written by Thomas Schwinge. + */ + +#ifndef VM_VM_TYPES_H +#define VM_VM_TYPES_H + +/* + *	Types defined: + * + *	vm_map_t		the high-level address map data structure. + *	vm_object_t		Virtual memory object. + *	vm_page_t		See `vm/vm_page.h'. + */ + +typedef struct vm_map *vm_map_t; +#define VM_MAP_NULL ((vm_map_t) 0) + +typedef struct vm_object *vm_object_t; +#define VM_OBJECT_NULL ((vm_object_t) 0) + +typedef struct vm_page *vm_page_t; +#define VM_PAGE_NULL ((vm_page_t) 0) + + +#endif /* VM_VM_TYPES_H */ diff --git a/vm/vm_user.c b/vm/vm_user.c new file mode 100644 index 0000000..868230a --- /dev/null +++ b/vm/vm_user.c @@ -0,0 +1,803 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	File:	vm/vm_user.c + *	Author:	Avadis Tevanian, Jr., Michael Wayne Young + * + *	User-exported virtual memory functions. + */ + +#include <mach/boolean.h> +#include <mach/kern_return.h> +#include <mach/mach_types.h>	/* to get vm_address_t */ +#include <mach/memory_object.h> +#include <mach/std_types.h>	/* to get pointer_t */ +#include <mach/vm_attributes.h> +#include <mach/vm_param.h> +#include <mach/vm_statistics.h> +#include <mach/vm_cache_statistics.h> +#include <mach/vm_sync.h> +#include <kern/gnumach.server.h> +#include <kern/host.h> +#include <kern/mach.server.h> +#include <kern/mach_host.server.h> +#include <kern/task.h> +#include <vm/vm_fault.h> +#include <vm/vm_kern.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/memory_object_proxy.h> +#include <vm/vm_page.h> + + + +vm_statistics_data_t	vm_stat; + +/* + *	vm_allocate allocates "zero fill" memory in the specfied + *	map. + */ +kern_return_t vm_allocate( +	vm_map_t	map, +	vm_offset_t	*addr, +	vm_size_t	size, +	boolean_t	anywhere) +{ +	kern_return_t	result; + +	if (map == VM_MAP_NULL) +		return(KERN_INVALID_ARGUMENT); +	if (size == 0) { +		*addr = 0; +		return(KERN_SUCCESS); +	} + +	if (anywhere) +		*addr = vm_map_min(map); +	else +		*addr = trunc_page(*addr); +	size = round_page(size); + +	result = vm_map_enter( +			map, +			addr, +			size, +			(vm_offset_t)0, +			anywhere, +			VM_OBJECT_NULL, +			(vm_offset_t)0, +			FALSE, +			VM_PROT_DEFAULT, +			VM_PROT_ALL, +			VM_INHERIT_DEFAULT); + +	return(result); +} + +/* + *	vm_deallocate deallocates the specified range of addresses in the + *	specified address map. + */ +kern_return_t vm_deallocate( +	vm_map_t		map, +	vm_offset_t		start, +	vm_size_t		size) +{ +	if (map == VM_MAP_NULL) +		return(KERN_INVALID_ARGUMENT); + +	if (size == (vm_offset_t) 0) +		return(KERN_SUCCESS); + +	return(vm_map_remove(map, trunc_page(start), round_page(start+size))); +} + +/* + *	vm_inherit sets the inheritance of the specified range in the + *	specified map. + */ +kern_return_t vm_inherit( +	vm_map_t		map, +	vm_offset_t		start, +	vm_size_t		size, +	vm_inherit_t		new_inheritance) +{ +	if (map == VM_MAP_NULL) +		return(KERN_INVALID_ARGUMENT); + +        switch (new_inheritance) { +        case VM_INHERIT_NONE: +        case VM_INHERIT_COPY: +        case VM_INHERIT_SHARE: +                break; +        default: +                return(KERN_INVALID_ARGUMENT); +        } + +	/*Check if range includes projected buffer; +	  user is not allowed direct manipulation in that case*/ +	if (projected_buffer_in_range(map, start, start+size)) +		return(KERN_INVALID_ARGUMENT); + +	return(vm_map_inherit(map, +			      trunc_page(start), +			      round_page(start+size), +			      new_inheritance)); +} + +/* + *	vm_protect sets the protection of the specified range in the + *	specified map. 
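+ *
+ *	A minimal usage sketch (hypothetical addresses; error handling
+ *	omitted), e.g. making a freshly allocated region read-only:
+ *
+ *		vm_offset_t buf;
+ *		if (vm_allocate(map, &buf, PAGE_SIZE, TRUE) == KERN_SUCCESS)
+ *			(void) vm_protect(map, buf, PAGE_SIZE, FALSE,
+ *					  VM_PROT_READ);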
+ */ + +kern_return_t vm_protect( +	vm_map_t		map, +	vm_offset_t		start, +	vm_size_t		size, +	boolean_t		set_maximum, +	vm_prot_t		new_protection) +{ +	if ((map == VM_MAP_NULL) || +		(new_protection & ~(VM_PROT_ALL|VM_PROT_NOTIFY))) +		return(KERN_INVALID_ARGUMENT); + +	/*Check if range includes projected buffer; +	  user is not allowed direct manipulation in that case*/ +	if (projected_buffer_in_range(map, start, start+size)) +		return(KERN_INVALID_ARGUMENT); + +	return(vm_map_protect(map, +			      trunc_page(start), +			      round_page(start+size), +			      new_protection, +			      set_maximum)); +} + +kern_return_t vm_statistics( +	vm_map_t		map, +	vm_statistics_data_t	*stat) +{ +	if (map == VM_MAP_NULL) +		return(KERN_INVALID_ARGUMENT); + +	*stat = vm_stat; + +	stat->pagesize = PAGE_SIZE; +	stat->free_count = vm_page_mem_free(); +	stat->active_count = vm_page_active_count; +	stat->inactive_count = vm_page_inactive_count; +	stat->wire_count = vm_page_wire_count; + +	return(KERN_SUCCESS); +} + +kern_return_t vm_cache_statistics( +	vm_map_t			map, +	vm_cache_statistics_data_t	*stats) +{ +	if (map == VM_MAP_NULL) +		return KERN_INVALID_ARGUMENT; + +	stats->cache_object_count = vm_object_external_count; +	stats->cache_count = vm_object_external_pages; + +	/* XXX Not implemented yet */ +	stats->active_tmp_count = 0; +	stats->inactive_tmp_count = 0; +	stats->active_perm_count = 0; +	stats->inactive_perm_count = 0; +	stats->dirty_count = 0; +	stats->laundry_count = 0; +	stats->writeback_count = 0; +	stats->slab_count = 0; +	stats->slab_reclaim_count = 0; +	return KERN_SUCCESS; +} + +/* + * Handle machine-specific attributes for a mapping, such + * as cachability, migrability, etc. + */ +kern_return_t vm_machine_attribute( +	vm_map_t	map, +	vm_address_t	address, +	vm_size_t	size, +	vm_machine_attribute_t	attribute, +	vm_machine_attribute_val_t* value)		/* IN/OUT */ +{ +	if (map == VM_MAP_NULL) +		return(KERN_INVALID_ARGUMENT); + +	/*Check if range includes projected buffer; +	  user is not allowed direct manipulation in that case*/ +	if (projected_buffer_in_range(map, address, address+size)) +		return(KERN_INVALID_ARGUMENT); + +	return vm_map_machine_attribute(map, address, size, attribute, value); +} + +kern_return_t vm_read( +	vm_map_t	map, +	vm_address_t	address, +	vm_size_t	size, +	pointer_t	*data, +	mach_msg_type_number_t	*data_size) +{ +	kern_return_t	error; +	vm_map_copy_t	ipc_address; + +	if (map == VM_MAP_NULL) +		return(KERN_INVALID_ARGUMENT); + +	if ((error = vm_map_copyin(map, +				address, +				size, +				FALSE,	/* src_destroy */ +				&ipc_address)) == KERN_SUCCESS) { +		*data = (pointer_t) ipc_address; +		*data_size = size; +	} +	return(error); +} + +kern_return_t vm_write( +	vm_map_t	map, +	vm_address_t	address, +	pointer_t	data, +	mach_msg_type_number_t	size) +{ +	if (map == VM_MAP_NULL) +		return KERN_INVALID_ARGUMENT; + +	return vm_map_copy_overwrite(map, address, (vm_map_copy_t) data, +				     FALSE /* interruptible XXX */); +} + +kern_return_t vm_copy( +	vm_map_t	map, +	vm_address_t	source_address, +	vm_size_t	size, +	vm_address_t	dest_address) +{ +	vm_map_copy_t copy; +	kern_return_t kr; + +	if (map == VM_MAP_NULL) +		return KERN_INVALID_ARGUMENT; + +	kr = vm_map_copyin(map, source_address, size, +			   FALSE, ©); +	if (kr != KERN_SUCCESS) +		return kr; + +	kr = vm_map_copy_overwrite(map, dest_address, copy, +				   FALSE /* interruptible XXX */); +	if (kr != KERN_SUCCESS) { +		vm_map_copy_discard(copy); +		return kr; +	} + +	return KERN_SUCCESS; +} + + +/* + *	
Routine:	vm_map + */ +kern_return_t vm_map( +	vm_map_t	target_map, +	vm_offset_t	*address, +	vm_size_t	size, +	vm_offset_t	mask, +	boolean_t	anywhere, +	ipc_port_t	memory_object, +	vm_offset_t	offset, +	boolean_t	copy, +	vm_prot_t	cur_protection, +	vm_prot_t	max_protection, +	vm_inherit_t	inheritance) +{ +	vm_object_t	object; +	kern_return_t	result; + +	if ((target_map == VM_MAP_NULL) || +	    (cur_protection & ~VM_PROT_ALL) || +	    (max_protection & ~VM_PROT_ALL)) +		return(KERN_INVALID_ARGUMENT); + +        switch (inheritance) { +        case VM_INHERIT_NONE: +        case VM_INHERIT_COPY: +        case VM_INHERIT_SHARE: +                break; +        default: +                return(KERN_INVALID_ARGUMENT); +        } + +	if (size == 0) +		return KERN_INVALID_ARGUMENT; + +#ifdef USER32 +        if (mask & 0x80000000) +            mask |= 0xffffffff00000000; +#endif + +	*address = trunc_page(*address); +	size = round_page(size); + +	if (!IP_VALID(memory_object)) { +		object = VM_OBJECT_NULL; +		offset = 0; +		copy = FALSE; +	} else if ((object = vm_object_enter(memory_object, size, FALSE)) +			== VM_OBJECT_NULL) +	  { +	    ipc_port_t real_memobj; +	    vm_prot_t prot; +	    vm_offset_t start; +	    vm_offset_t len; + +	    result = memory_object_proxy_lookup (memory_object, &real_memobj, +						 &prot, &start, &len); +	    if (result != KERN_SUCCESS) +	      return result; + +           if (!copy) +             { +		/* Reduce the allowed access to the memory object.  */ +		max_protection &= prot; +		cur_protection &= prot; +             } +           else +             { +               /* Disallow making a copy unless the proxy allows reading.  */ +               if (!(prot & VM_PROT_READ)) +                 return KERN_PROTECTION_FAILURE; +             } + +	    /* Reduce the allowed range */ +	    if ((start + offset + size) > (start + len)) +	      return KERN_INVALID_ARGUMENT; + +	    offset += start; + +	    if ((object = vm_object_enter(real_memobj, size, FALSE)) +		== VM_OBJECT_NULL) +	      return KERN_INVALID_ARGUMENT; +	  } + +	/* +	 *	Perform the copy if requested +	 */ + +	if (copy) { +		vm_object_t	new_object; +		vm_offset_t	new_offset; + +		result = vm_object_copy_strategically(object, offset, size, +				&new_object, &new_offset, +				©); + +		/* +		 *	Throw away the reference to the +		 *	original object, as it won't be mapped. +		 */ + +		vm_object_deallocate(object); + +		if (result != KERN_SUCCESS) +			return (result); + +		object = new_object; +		offset = new_offset; +	} + +	if ((result = vm_map_enter(target_map, +				address, size, mask, anywhere, +				object, offset, +				copy, +				cur_protection, max_protection, inheritance +				)) != KERN_SUCCESS) +		vm_object_deallocate(object); +	return(result); +} + +/* + *	Specify that the range of the virtual address space + *	of the target task must not cause page faults for + *	the indicated accesses. + * + *	[ To unwire the pages, specify VM_PROT_NONE. 
] + */ +kern_return_t vm_wire(const ipc_port_t port, +		vm_map_t map, +		vm_offset_t start, +		vm_size_t size, +		vm_prot_t access) +{ +	boolean_t priv; + +	if (!IP_VALID(port)) +		return KERN_INVALID_HOST; + +	ip_lock(port); +	if (!ip_active(port) || +		  (ip_kotype(port) != IKOT_HOST_PRIV +		&& ip_kotype(port) != IKOT_HOST)) +	{ +		ip_unlock(port); +		return KERN_INVALID_HOST; +	} + +	priv = ip_kotype(port) == IKOT_HOST_PRIV; +	ip_unlock(port); + +	if (map == VM_MAP_NULL) +		return KERN_INVALID_TASK; + +	if (access & ~VM_PROT_ALL) +		return KERN_INVALID_ARGUMENT; + +	/*Check if range includes projected buffer; +	  user is not allowed direct manipulation in that case*/ +	if (projected_buffer_in_range(map, start, start+size)) +		return(KERN_INVALID_ARGUMENT); + +	/* TODO: make it tunable */ +	if (!priv && access != VM_PROT_NONE && map->size_wired + size > (8<<20)) +		return KERN_NO_ACCESS; + +	return vm_map_pageable(map, trunc_page(start), round_page(start+size), +			       access, TRUE, TRUE); +} + +kern_return_t vm_wire_all(const ipc_port_t port, vm_map_t map, vm_wire_t flags) +{ +	if (!IP_VALID(port)) +		return KERN_INVALID_HOST; + +	ip_lock(port); + +	if (!ip_active(port) +	    || (ip_kotype(port) != IKOT_HOST_PRIV)) { +		ip_unlock(port); +		return KERN_INVALID_HOST; +	} + +	ip_unlock(port); + +	if (map == VM_MAP_NULL) { +		return KERN_INVALID_TASK; +	} + +	if (flags & ~VM_WIRE_ALL) { +		return KERN_INVALID_ARGUMENT; +	} + +	/*Check if range includes projected buffer; +	  user is not allowed direct manipulation in that case*/ +	if (projected_buffer_in_range(map, map->min_offset, map->max_offset)) { +		return KERN_INVALID_ARGUMENT; +	} + +	return vm_map_pageable_all(map, flags); +} + +/* + *	vm_object_sync synchronizes out pages from the memory object to its + *	memory manager, if any. + */ +kern_return_t vm_object_sync( +	vm_object_t		object, +	vm_offset_t		offset, +	vm_size_t		size, +	boolean_t		should_flush, +	boolean_t		should_return, +	boolean_t		should_iosync) +{ +	if (object == VM_OBJECT_NULL) +		return KERN_INVALID_ARGUMENT; + +	/* FIXME: we should rather introduce an internal function, e.g. +	   vm_object_update, rather than calling memory_object_lock_request.  */ +	vm_object_reference(object); + +	/* This is already always synchronous for now.  */ +	(void) should_iosync; + +	size = round_page(offset + size) - trunc_page(offset); +	offset = trunc_page(offset); + +	return  memory_object_lock_request(object, offset, size, +					   should_return ? +						MEMORY_OBJECT_RETURN_ALL : +						MEMORY_OBJECT_RETURN_NONE, +					   should_flush, +					   VM_PROT_NO_CHANGE, +					   NULL, 0); +} + +/* + *	vm_msync synchronizes out pages from the map to their memory manager, + *	if any. + */ +kern_return_t vm_msync( +	vm_map_t		map, +	vm_address_t		address, +	vm_size_t		size, +	vm_sync_t		sync_flags) +{ +	if (map == VM_MAP_NULL) +		return KERN_INVALID_ARGUMENT; + +	return vm_map_msync(map, (vm_offset_t) address, size, sync_flags); +} + +/* + *	vm_allocate_contiguous allocates "zero fill" physical memory and maps + *	it into in the specfied map. 
+ */ +/* TODO: respect physical alignment (palign) + *       and minimum physical address (pmin) + */ +kern_return_t vm_allocate_contiguous( +	host_t			host_priv, +	vm_map_t		map, +	vm_address_t		*result_vaddr, +	rpc_phys_addr_t		*result_paddr, +	vm_size_t		size, +	rpc_phys_addr_t		pmin, +	rpc_phys_addr_t		pmax, +	rpc_phys_addr_t		palign) +{ +	vm_size_t		alloc_size; +	unsigned int		npages; +	unsigned int		i; +	unsigned int		order; +	unsigned int		selector; +	vm_page_t		pages; +	vm_object_t		object; +	kern_return_t		kr; +	vm_address_t		vaddr; + +	if (host_priv == HOST_NULL) +		return KERN_INVALID_HOST; + +	if (map == VM_MAP_NULL) +		return KERN_INVALID_TASK; + +	/* FIXME */ +	if (pmin != 0) +		return KERN_INVALID_ARGUMENT; + +	if (palign == 0) +		palign = PAGE_SIZE; + +	/* FIXME: Allows some small alignments less than page size */ +	if ((palign < PAGE_SIZE) && (PAGE_SIZE % palign == 0)) +		palign = PAGE_SIZE; + +	/* FIXME */ +	if (palign != PAGE_SIZE) +		return KERN_INVALID_ARGUMENT; + +	selector = VM_PAGE_SEL_DMA; +	if (pmax > VM_PAGE_DMA_LIMIT) +#ifdef VM_PAGE_DMA32_LIMIT +#if VM_PAGE_DMA32_LIMIT < VM_PAGE_DIRECTMAP_LIMIT +		if (pmax <= VM_PAGE_DMA32_LIMIT) +			selector = VM_PAGE_SEL_DMA32; +	if (pmax > VM_PAGE_DMA32_LIMIT) +#endif +#endif +		if (pmax <= VM_PAGE_DIRECTMAP_LIMIT) +			selector = VM_PAGE_SEL_DIRECTMAP; +	if (pmax > VM_PAGE_DIRECTMAP_LIMIT) +#ifdef VM_PAGE_DMA32_LIMIT +#if VM_PAGE_DMA32_LIMIT > VM_PAGE_DIRECTMAP_LIMIT +		if (pmax <= VM_PAGE_DMA32_LIMIT) +			selector = VM_PAGE_SEL_DMA32; +	if (pmax > VM_PAGE_DMA32_LIMIT) +#endif +#endif +		if (pmax <= VM_PAGE_HIGHMEM_LIMIT) +			selector = VM_PAGE_SEL_HIGHMEM; + +	size = vm_page_round(size); + +	if (size == 0) +		return KERN_INVALID_ARGUMENT; + +	object = vm_object_allocate(size); + +	if (object == NULL) +		return KERN_RESOURCE_SHORTAGE; + +	/* +	 * XXX The page allocator returns blocks with a power-of-two size. +	 * The requested size may not be a power-of-two, requiring some +	 * work to release back the pages that aren't needed. +	 */ +	order = vm_page_order(size); +	alloc_size = (1 << (order + PAGE_SHIFT)); +	npages = vm_page_atop(alloc_size); + +	pages = vm_page_grab_contig(alloc_size, selector); + +	if (pages == NULL) { +		vm_object_deallocate(object); +		return KERN_RESOURCE_SHORTAGE; +	} + +	vm_object_lock(object); +	vm_page_lock_queues(); + +	for (i = 0; i < vm_page_atop(size); i++) { +		/* +		 * XXX We can safely handle contiguous pages as an array, +		 * but this relies on knowing the implementation of the +		 * page allocator. 
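+		 *
+		 * E.g. (assumed sizes for illustration): a request for
+		 * 3 pages yields order 2, so alloc_size covers 4 pages;
+		 * the loop following this one releases the unused 4th
+		 * page back to the allocator.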
+		 */ +		pages[i].busy = FALSE; +		vm_page_insert(&pages[i], object, vm_page_ptoa(i)); +		vm_page_wire(&pages[i]); +	} + +	vm_page_unlock_queues(); +	vm_object_unlock(object); + +	for (i = vm_page_atop(size); i < npages; i++) { +		vm_page_release(&pages[i], FALSE, FALSE); +	} + +	vaddr = 0; +	kr = vm_map_enter(map, &vaddr, size, 0, TRUE, object, 0, FALSE, +			  VM_PROT_READ | VM_PROT_WRITE, +			  VM_PROT_READ | VM_PROT_WRITE, VM_INHERIT_DEFAULT); + +	if (kr != KERN_SUCCESS) { +		vm_object_deallocate(object); +		return kr; +	} + +	kr = vm_map_pageable(map, vaddr, vaddr + size, +			     VM_PROT_READ | VM_PROT_WRITE, +			     TRUE, TRUE); + +	if (kr != KERN_SUCCESS) { +		vm_map_remove(map, vaddr, vaddr + size); +		return kr; +	} + +	*result_vaddr = vaddr; +	*result_paddr = pages->phys_addr; + +	assert(*result_paddr >= pmin); +	assert(*result_paddr + size <= pmax); + +	return KERN_SUCCESS; +} + +/* + *	vm_pages_phys returns information about a region of memory + */ +kern_return_t vm_pages_phys( +	host_t				host, +	vm_map_t			map, +	vm_address_t			address, +	vm_size_t			size, +	rpc_phys_addr_array_t		*pagespp, +	mach_msg_type_number_t		*countp) +{ +	if (host == HOST_NULL) +		return KERN_INVALID_HOST; +	if (map == VM_MAP_NULL) +		return KERN_INVALID_TASK; + +	if (!page_aligned(address)) +		return KERN_INVALID_ARGUMENT; +	if (!page_aligned(size)) +		return KERN_INVALID_ARGUMENT; + +	mach_msg_type_number_t count = atop(size), cur; +	rpc_phys_addr_array_t pagesp = *pagespp; +	kern_return_t kr; + +	if (*countp < count) { +		vm_offset_t allocated; +		/* Avoid faults while we keep vm locks */ +		kr = kmem_alloc(ipc_kernel_map, &allocated, +				count * sizeof(pagesp[0])); +		if (kr != KERN_SUCCESS) +			return KERN_RESOURCE_SHORTAGE; +		pagesp = (rpc_phys_addr_array_t) allocated; +	} + +	for (cur = 0; cur < count; cur++) { +		vm_map_t cmap;		/* current map in traversal */ +		rpc_phys_addr_t paddr; +		vm_map_entry_t entry;	/* entry in current map */ + +		/* find the entry containing (or following) the address */ +		vm_map_lock_read(map); +		for (cmap = map;;) { +			/* cmap is read-locked */ + +			if (!vm_map_lookup_entry(cmap, address, &entry)) { +				entry = VM_MAP_ENTRY_NULL; +				break; +			} + +			if (entry->is_sub_map) { +				/* move down to the sub map */ + +				vm_map_t nmap = entry->object.sub_map; +				vm_map_lock_read(nmap); +				vm_map_unlock_read(cmap); +				cmap = nmap; +				continue; +			} else { +				/* Found it */ +				break; +			} +			/*NOTREACHED*/ +		} + +		paddr = 0; +		if (entry) { +			vm_offset_t offset = address - entry->vme_start + entry->offset; +			vm_object_t object = entry->object.vm_object; + +			if (object) { +				vm_object_lock(object); +				vm_page_t page = vm_page_lookup(object, offset); +				if (page) { +					if (page->phys_addr != (typeof(pagesp[cur])) page->phys_addr) +						printf("warning: physical address overflow in vm_pages_phys!!\n"); +					else +						paddr = page->phys_addr; +				} +				vm_object_unlock(object); +			} +		} +		vm_map_unlock_read(cmap); +		pagesp[cur] = paddr; + +		address += PAGE_SIZE; +	} + +	if (pagesp != *pagespp) { +		vm_map_copy_t copy; +		kr = vm_map_copyin(ipc_kernel_map, (vm_offset_t) pagesp, +				   count * sizeof(pagesp[0]), TRUE, ©); +		assert(kr == KERN_SUCCESS); +		*pagespp = (rpc_phys_addr_array_t) copy; +	} + +	*countp = count; + +	return KERN_SUCCESS; +} diff --git a/vm/vm_user.h b/vm/vm_user.h new file mode 100644 index 0000000..c6f20a8 --- /dev/null +++ b/vm/vm_user.h @@ -0,0 +1,60 @@ +/* + * Mach Operating System + * 
Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU + *  School of Computer Science + *  Carnegie Mellon University + *  Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + *	File:	vm/vm_user.h + *	Author:	Avadis Tevanian, Jr., Michael Wayne Young + *	Date:	1986 + * + *	Declarations of user-visible virtual address space + *	management functionality. + */ + +#ifndef	_VM_VM_USER_H_ +#define _VM_VM_USER_H_ + +#include <mach/kern_return.h> +#include <mach/std_types.h> +#include <mach/mach_types.h> + +extern kern_return_t	vm_allocate(vm_map_t, vm_offset_t *, vm_size_t, +				    boolean_t); +extern kern_return_t	vm_deallocate(vm_map_t, vm_offset_t, vm_size_t); +extern kern_return_t	vm_inherit(vm_map_t, vm_offset_t, vm_size_t, +				   vm_inherit_t); +extern kern_return_t	vm_protect(vm_map_t, vm_offset_t, vm_size_t, boolean_t, +				   vm_prot_t); +extern kern_return_t	vm_statistics(vm_map_t, vm_statistics_data_t *); +extern kern_return_t	vm_cache_statistics(vm_map_t, vm_cache_statistics_data_t *); +extern kern_return_t	vm_read(vm_map_t, vm_address_t, vm_size_t, pointer_t *, +				vm_size_t *); +extern kern_return_t	vm_write(vm_map_t, vm_address_t, pointer_t, vm_size_t); +extern kern_return_t	vm_copy(vm_map_t, vm_address_t, vm_size_t, +				vm_address_t); +extern kern_return_t	vm_map(vm_map_t, vm_offset_t *, vm_size_t, vm_offset_t, +			       boolean_t, ipc_port_t, vm_offset_t, boolean_t, +			       vm_prot_t, vm_prot_t, vm_inherit_t); + +#endif	/* _VM_VM_USER_H_ */  | 
