author | Pasha <pasha@member.fsf.org> | 2024-02-20 18:49:50 +0000
---|---|---
committer | Pasha <pasha@member.fsf.org> | 2024-02-20 18:49:50 +0000
commit | 5e0b8d508ed51004bd836384293be00950ee62c9 (patch) |
tree | e3f16b1aa8b7177032ce3ec429fbad2b1d92a876 /vm |
init gnumach copy
Diffstat (limited to 'vm')
-rw-r--r-- | vm/memory_object.c | 1090
-rw-r--r-- | vm/memory_object.h | 39
-rw-r--r-- | vm/memory_object_default.cli | 28
-rw-r--r-- | vm/memory_object_proxy.c | 228
-rw-r--r-- | vm/memory_object_proxy.h | 39
-rw-r--r-- | vm/memory_object_user.cli | 28
-rw-r--r-- | vm/pmap.h | 241
-rw-r--r-- | vm/vm_debug.c | 548
-rw-r--r-- | vm/vm_external.c | 151
-rw-r--r-- | vm/vm_external.h | 95
-rw-r--r-- | vm/vm_fault.c | 2136
-rw-r--r-- | vm/vm_fault.h | 81
-rw-r--r-- | vm/vm_init.c | 88
-rw-r--r-- | vm/vm_init.h | 25
-rw-r--r-- | vm/vm_kern.c | 1099
-rw-r--r-- | vm/vm_kern.h | 100
-rw-r--r-- | vm/vm_map.c | 5237
-rw-r--r-- | vm/vm_map.h | 585
-rw-r--r-- | vm/vm_object.c | 2994
-rw-r--r-- | vm/vm_object.h | 415
-rw-r--r-- | vm/vm_page.c | 2164
-rw-r--r-- | vm/vm_page.h | 567
-rw-r--r-- | vm/vm_pageout.c | 515
-rw-r--r-- | vm/vm_pageout.h | 53
-rw-r--r-- | vm/vm_print.h | 41
-rw-r--r-- | vm/vm_resident.c | 1116
-rw-r--r-- | vm/vm_resident.h | 45
-rw-r--r-- | vm/vm_types.h | 42
-rw-r--r-- | vm/vm_user.c | 803
-rw-r--r-- | vm/vm_user.h | 60
30 files changed, 20653 insertions, 0 deletions
diff --git a/vm/memory_object.c b/vm/memory_object.c new file mode 100644 index 0000000..1ea5956 --- /dev/null +++ b/vm/memory_object.c @@ -0,0 +1,1090 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/memory_object.c + * Author: Michael Wayne Young + * + * External memory management interface control functions. + */ + +/* + * Interface dependencies: + */ + +#include <mach/std_types.h> /* For pointer_t */ +#include <mach/mach_types.h> + +#include <mach/kern_return.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <mach/memory_object.h> +#include <mach/boolean.h> +#include <mach/vm_prot.h> +#include <mach/message.h> + +#include <vm/memory_object_user.user.h> +#include <vm/memory_object_default.user.h> + +/* + * Implementation dependencies: + */ +#include <vm/memory_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> +#include <vm/pmap.h> /* For copy_to_phys, pmap_clear_modify */ +#include <kern/debug.h> /* For panic() */ +#include <kern/thread.h> /* For current_thread() */ +#include <kern/host.h> +#include <kern/mach.server.h> /* For rpc prototypes */ +#include <vm/vm_kern.h> /* For kernel_map, vm_move */ +#include <vm/vm_map.h> /* For vm_map_pageable */ +#include <ipc/ipc_port.h> + +#if MACH_PAGEMAP +#include <vm/vm_external.h> +#endif /* MACH_PAGEMAP */ + +typedef int memory_object_lock_result_t; /* moved from below */ + + +ipc_port_t memory_manager_default = IP_NULL; +def_simple_lock_data(static,memory_manager_default_lock) + +/* + * Important note: + * All of these routines gain a reference to the + * object (first argument) as part of the automatic + * argument conversion. Explicit deallocation is necessary. 
+ */ + +kern_return_t memory_object_data_supply( + vm_object_t object, + vm_offset_t offset, + vm_offset_t vm_data_copy, + unsigned int data_cnt, + vm_prot_t lock_value, + boolean_t precious, + ipc_port_t reply_to, + mach_msg_type_name_t reply_to_type) +{ + kern_return_t result = KERN_SUCCESS; + vm_offset_t error_offset = 0; + vm_page_t m; + vm_page_t data_m; + vm_size_t original_length; + vm_offset_t original_offset; + vm_page_t *page_list; + boolean_t was_absent; + vm_map_copy_t data_copy = (vm_map_copy_t)vm_data_copy; + vm_map_copy_t orig_copy = data_copy; + + /* + * Look for bogus arguments + */ + + if (object == VM_OBJECT_NULL) { + return(KERN_INVALID_ARGUMENT); + } + + if (lock_value & ~VM_PROT_ALL) { + vm_object_deallocate(object); + return(KERN_INVALID_ARGUMENT); + } + + if ((data_cnt % PAGE_SIZE) != 0) { + vm_object_deallocate(object); + return(KERN_INVALID_ARGUMENT); + } + + /* + * Adjust the offset from the memory object to the offset + * within the vm_object. + */ + + original_length = data_cnt; + original_offset = offset; + + assert(data_copy->type == VM_MAP_COPY_PAGE_LIST); + page_list = &data_copy->cpy_page_list[0]; + + vm_object_lock(object); + vm_object_paging_begin(object); + offset -= object->paging_offset; + + /* + * Loop over copy stealing pages for pagein. + */ + + for (; data_cnt > 0 ; data_cnt -= PAGE_SIZE, offset += PAGE_SIZE) { + + assert(data_copy->cpy_npages > 0); + data_m = *page_list; + + if (data_m == VM_PAGE_NULL || data_m->tabled || + data_m->error || data_m->absent || data_m->fictitious) { + + panic("Data_supply: bad page"); + } + + /* + * Look up target page and check its state. + */ + +retry_lookup: + m = vm_page_lookup(object,offset); + if (m == VM_PAGE_NULL) { + was_absent = FALSE; + } + else { + if (m->absent && m->busy) { + + /* + * Page was requested. Free the busy + * page waiting for it. Insertion + * of new page happens below. + */ + + VM_PAGE_FREE(m); + was_absent = TRUE; + } + else { + + /* + * Have to wait for page that is busy and + * not absent. This is probably going to + * be an error, but go back and check. + */ + if (m->busy) { + PAGE_ASSERT_WAIT(m, FALSE); + vm_object_unlock(object); + thread_block((void (*)()) 0); + vm_object_lock(object); + goto retry_lookup; + } + + /* + * Page already present; error. + * This is an error if data is precious. + */ + result = KERN_MEMORY_PRESENT; + error_offset = offset + object->paging_offset; + + break; + } + } + + /* + * Ok to pagein page. Target object now has no page + * at offset. Set the page parameters, then drop + * in new page and set up pageout state. Object is + * still locked here. + * + * Must clear busy bit in page before inserting it. + * Ok to skip wakeup logic because nobody else + * can possibly know about this page. + */ + + data_m->busy = FALSE; + data_m->dirty = FALSE; + pmap_clear_modify(data_m->phys_addr); + + data_m->page_lock = lock_value; + data_m->unlock_request = VM_PROT_NONE; + data_m->precious = precious; + + vm_page_lock_queues(); + vm_page_insert(data_m, object, offset); + + if (was_absent) + vm_page_activate(data_m); + else + vm_page_deactivate(data_m); + + vm_page_unlock_queues(); + + /* + * Null out this page list entry, and advance to next + * page. 
+ */ + + *page_list++ = VM_PAGE_NULL; + + if (--(data_copy->cpy_npages) == 0 && + vm_map_copy_has_cont(data_copy)) { + vm_map_copy_t new_copy; + + vm_object_unlock(object); + + vm_map_copy_invoke_cont(data_copy, &new_copy, &result); + + if (result == KERN_SUCCESS) { + + /* + * Consume on success requires that + * we keep the original vm_map_copy + * around in case something fails. + * Free the old copy if it's not the original + */ + if (data_copy != orig_copy) { + vm_map_copy_discard(data_copy); + } + + if ((data_copy = new_copy) != VM_MAP_COPY_NULL) + page_list = &data_copy->cpy_page_list[0]; + + vm_object_lock(object); + } + else { + vm_object_lock(object); + error_offset = offset + object->paging_offset + + PAGE_SIZE; + break; + } + } + } + + /* + * Send reply if one was requested. + */ + vm_object_paging_end(object); + vm_object_unlock(object); + + if (vm_map_copy_has_cont(data_copy)) + vm_map_copy_abort_cont(data_copy); + + if (IP_VALID(reply_to)) { + memory_object_supply_completed( + reply_to, reply_to_type, + object->pager_request, + original_offset, + original_length, + result, + error_offset); + } + + vm_object_deallocate(object); + + /* + * Consume on success: The final data copy must be + * be discarded if it is not the original. The original + * gets discarded only if this routine succeeds. + */ + if (data_copy != orig_copy) + vm_map_copy_discard(data_copy); + if (result == KERN_SUCCESS) + vm_map_copy_discard(orig_copy); + + + return(result); +} + +kern_return_t memory_object_data_error( + vm_object_t object, + vm_offset_t offset, + vm_size_t size, + kern_return_t error_value) +{ + if (object == VM_OBJECT_NULL) + return(KERN_INVALID_ARGUMENT); + + if (size != round_page(size)) + return(KERN_INVALID_ARGUMENT); + + vm_object_lock(object); + offset -= object->paging_offset; + + while (size != 0) { + vm_page_t m; + + m = vm_page_lookup(object, offset); + if ((m != VM_PAGE_NULL) && m->busy && m->absent) { + m->error = TRUE; + m->absent = FALSE; + vm_object_absent_release(object); + + PAGE_WAKEUP_DONE(m); + + vm_page_lock_queues(); + vm_page_activate(m); + vm_page_unlock_queues(); + } + + size -= PAGE_SIZE; + offset += PAGE_SIZE; + } + vm_object_unlock(object); + + vm_object_deallocate(object); + return(KERN_SUCCESS); +} + +kern_return_t memory_object_data_unavailable( + vm_object_t object, + vm_offset_t offset, + vm_size_t size) +{ +#if MACH_PAGEMAP + vm_external_t existence_info = VM_EXTERNAL_NULL; +#endif /* MACH_PAGEMAP */ + + if (object == VM_OBJECT_NULL) + return(KERN_INVALID_ARGUMENT); + + if (size != round_page(size)) + return(KERN_INVALID_ARGUMENT); + +#if MACH_PAGEMAP + if ((offset == 0) && (size > VM_EXTERNAL_LARGE_SIZE) && + (object->existence_info == VM_EXTERNAL_NULL)) { + existence_info = vm_external_create(VM_EXTERNAL_SMALL_SIZE); + } +#endif /* MACH_PAGEMAP */ + + vm_object_lock(object); +#if MACH_PAGEMAP + if (existence_info != VM_EXTERNAL_NULL) { + object->existence_info = existence_info; + } + if ((offset == 0) && (size > VM_EXTERNAL_LARGE_SIZE)) { + vm_object_unlock(object); + vm_object_deallocate(object); + return(KERN_SUCCESS); + } +#endif /* MACH_PAGEMAP */ + offset -= object->paging_offset; + + while (size != 0) { + vm_page_t m; + + /* + * We're looking for pages that are both busy and + * absent (waiting to be filled), converting them + * to just absent. + * + * Pages that are just busy can be ignored entirely. 
+ */ + + m = vm_page_lookup(object, offset); + if ((m != VM_PAGE_NULL) && m->busy && m->absent) { + PAGE_WAKEUP_DONE(m); + + vm_page_lock_queues(); + vm_page_activate(m); + vm_page_unlock_queues(); + } + size -= PAGE_SIZE; + offset += PAGE_SIZE; + } + + vm_object_unlock(object); + + vm_object_deallocate(object); + return(KERN_SUCCESS); +} + +/* + * Routine: memory_object_lock_page + * + * Description: + * Perform the appropriate lock operations on the + * given page. See the description of + * "memory_object_lock_request" for the meanings + * of the arguments. + * + * Returns an indication that the operation + * completed, blocked, or that the page must + * be cleaned. + */ + +#define MEMORY_OBJECT_LOCK_RESULT_DONE 0 +#define MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK 1 +#define MEMORY_OBJECT_LOCK_RESULT_MUST_CLEAN 2 +#define MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN 3 + +static memory_object_lock_result_t memory_object_lock_page( + vm_page_t m, + memory_object_return_t should_return, + boolean_t should_flush, + vm_prot_t prot) +{ + /* + * Don't worry about pages for which the kernel + * does not have any data. + */ + + if (m->absent) + return(MEMORY_OBJECT_LOCK_RESULT_DONE); + + /* + * If we cannot change access to the page, + * either because a mapping is in progress + * (busy page) or because a mapping has been + * wired, then give up. + */ + + if (m->busy) + return(MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK); + + assert(!m->fictitious); + + if (m->wire_count != 0) { + /* + * If no change would take place + * anyway, return successfully. + * + * No change means: + * Not flushing AND + * No change to page lock [2 checks] AND + * Don't need to send page to manager + * + * Don't need to send page to manager means: + * No clean or return request OR ( + * Page is not dirty [2 checks] AND ( + * Page is not precious OR + * No request to return precious pages )) + * + * Now isn't that straightforward and obvious ?? ;-) + * + * XXX This doesn't handle sending a copy of a wired + * XXX page to the pager, but that will require some + * XXX significant surgery. + */ + + if (!should_flush && + ((m->page_lock == prot) || (prot == VM_PROT_NO_CHANGE)) && + ((should_return == MEMORY_OBJECT_RETURN_NONE) || + (!m->dirty && !pmap_is_modified(m->phys_addr) && + (!m->precious || + should_return != MEMORY_OBJECT_RETURN_ALL)))) { + /* + * Restart page unlock requests, + * even though no change took place. + * [Memory managers may be expecting + * to see new requests.] + */ + m->unlock_request = VM_PROT_NONE; + PAGE_WAKEUP(m); + + return(MEMORY_OBJECT_LOCK_RESULT_DONE); + } + + return(MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK); + } + + /* + * If the page is to be flushed, allow + * that to be done as part of the protection. + */ + + if (should_flush) + prot = VM_PROT_ALL; + + /* + * Set the page lock. + * + * If we are decreasing permission, do it now; + * let the fault handler take care of increases + * (pmap_page_protect may not increase protection). + */ + + if (prot != VM_PROT_NO_CHANGE) { + if ((m->page_lock ^ prot) & prot) { + pmap_page_protect(m->phys_addr, VM_PROT_ALL & ~prot); + } + m->page_lock = prot; + + /* + * Restart any past unlock requests, even if no + * change resulted. If the manager explicitly + * requested no protection change, then it is assumed + * to be remembering past requests. + */ + + m->unlock_request = VM_PROT_NONE; + PAGE_WAKEUP(m); + } + + /* + * Handle cleaning. + */ + + if (should_return != MEMORY_OBJECT_RETURN_NONE) { + /* + * Check whether the page is dirty. 
If + * write permission has not been removed, + * this may have unpredictable results. + */ + + if (!m->dirty) + m->dirty = pmap_is_modified(m->phys_addr); + + if (m->dirty || (m->precious && + should_return == MEMORY_OBJECT_RETURN_ALL)) { + /* + * If we weren't planning + * to flush the page anyway, + * we may need to remove the + * page from the pageout + * system and from physical + * maps now. + */ + + vm_page_lock_queues(); + VM_PAGE_QUEUES_REMOVE(m); + vm_page_unlock_queues(); + + if (!should_flush) + pmap_page_protect(m->phys_addr, + VM_PROT_NONE); + + /* + * Cleaning a page will cause + * it to be flushed. + */ + + if (m->dirty) + return(MEMORY_OBJECT_LOCK_RESULT_MUST_CLEAN); + else + return(MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN); + } + } + + /* + * Handle flushing + */ + + if (should_flush) { + VM_PAGE_FREE(m); + } else { + extern boolean_t vm_page_deactivate_hint; + + /* + * XXX Make clean but not flush a paging hint, + * and deactivate the pages. This is a hack + * because it overloads flush/clean with + * implementation-dependent meaning. This only + * happens to pages that are already clean. + */ + + if (vm_page_deactivate_hint && + (should_return != MEMORY_OBJECT_RETURN_NONE)) { + vm_page_lock_queues(); + vm_page_deactivate(m); + vm_page_unlock_queues(); + } + } + + return(MEMORY_OBJECT_LOCK_RESULT_DONE); +} + +/* + * Routine: memory_object_lock_request [user interface] + * + * Description: + * Control use of the data associated with the given + * memory object. For each page in the given range, + * perform the following operations, in order: + * 1) restrict access to the page (disallow + * forms specified by "prot"); + * 2) return data to the manager (if "should_return" + * is RETURN_DIRTY and the page is dirty, or + * "should_return" is RETURN_ALL and the page + * is either dirty or precious); and, + * 3) flush the cached copy (if "should_flush" + * is asserted). + * The set of pages is defined by a starting offset + * ("offset") and size ("size"). Only pages with the + * same page alignment as the starting offset are + * considered. + * + * A single acknowledgement is sent (to the "reply_to" + * port) when these actions are complete. If successful, + * the naked send right for reply_to is consumed. + */ + +kern_return_t +memory_object_lock_request( + vm_object_t object, + vm_offset_t offset, + vm_size_t size, + memory_object_return_t should_return, + boolean_t should_flush, + vm_prot_t prot, + ipc_port_t reply_to, + mach_msg_type_name_t reply_to_type) +{ + vm_page_t m; + vm_offset_t original_offset = offset; + vm_size_t original_size = size; + vm_offset_t paging_offset = 0; + vm_object_t new_object = VM_OBJECT_NULL; + vm_offset_t new_offset = 0; + vm_offset_t last_offset = offset; + int page_lock_result; + int pageout_action = 0; /* '=0' to quiet lint */ + +#define DATA_WRITE_MAX 32 + vm_page_t holding_pages[DATA_WRITE_MAX]; + + /* + * Check for bogus arguments. + */ + if (object == VM_OBJECT_NULL || + ((prot & ~VM_PROT_ALL) != 0 && prot != VM_PROT_NO_CHANGE)) + return (KERN_INVALID_ARGUMENT); + + size = round_page(size); + + /* + * Lock the object, and acquire a paging reference to + * prevent the memory_object and control ports from + * being destroyed. + */ + + vm_object_lock(object); + vm_object_paging_begin(object); + offset -= object->paging_offset; + + /* + * To avoid blocking while scanning for pages, save + * dirty pages to be cleaned all at once. 
+ * + * XXXO A similar strategy could be used to limit the + * number of times that a scan must be restarted for + * other reasons. Those pages that would require blocking + * could be temporarily collected in another list, or + * their offsets could be recorded in a small array. + */ + + /* + * XXX NOTE: May want to consider converting this to a page list + * XXX vm_map_copy interface. Need to understand object + * XXX coalescing implications before doing so. + */ + +#define PAGEOUT_PAGES \ +MACRO_BEGIN \ + vm_map_copy_t copy; \ + unsigned i; \ + vm_page_t hp; \ + \ + vm_object_unlock(object); \ + \ + (void) vm_map_copyin_object(new_object, 0, new_offset, ©); \ + \ + (void) memory_object_data_return( \ + object->pager, \ + object->pager_request, \ + paging_offset, \ + (pointer_t) copy, \ + new_offset, \ + (pageout_action == MEMORY_OBJECT_LOCK_RESULT_MUST_CLEAN), \ + !should_flush); \ + \ + vm_object_lock(object); \ + \ + for (i = 0; i < atop(new_offset); i++) { \ + hp = holding_pages[i]; \ + if (hp != VM_PAGE_NULL) \ + VM_PAGE_FREE(hp); \ + } \ + \ + new_object = VM_OBJECT_NULL; \ +MACRO_END + + for (; + size != 0; + size -= PAGE_SIZE, offset += PAGE_SIZE) + { + /* + * Limit the number of pages to be cleaned at once. + */ + if (new_object != VM_OBJECT_NULL && + new_offset >= PAGE_SIZE * DATA_WRITE_MAX) + { + PAGEOUT_PAGES; + } + + while ((m = vm_page_lookup(object, offset)) != VM_PAGE_NULL) { + switch ((page_lock_result = memory_object_lock_page(m, + should_return, + should_flush, + prot))) + { + case MEMORY_OBJECT_LOCK_RESULT_DONE: + /* + * End of a cluster of dirty pages. + */ + if (new_object != VM_OBJECT_NULL) { + PAGEOUT_PAGES; + continue; + } + break; + + case MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK: + /* + * Since it is necessary to block, + * clean any dirty pages now. + */ + if (new_object != VM_OBJECT_NULL) { + PAGEOUT_PAGES; + continue; + } + + PAGE_ASSERT_WAIT(m, FALSE); + vm_object_unlock(object); + thread_block((void (*)()) 0); + vm_object_lock(object); + continue; + + case MEMORY_OBJECT_LOCK_RESULT_MUST_CLEAN: + case MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN: + /* + * The clean and return cases are similar. + * + * Mark the page busy since we unlock the + * object below. + */ + m->busy = TRUE; + + /* + * if this would form a discontiguous block, + * clean the old pages and start anew. + * + * NOTE: The first time through here, new_object + * is null, hiding the fact that pageout_action + * is not initialized. + */ + if (new_object != VM_OBJECT_NULL && + (last_offset != offset || + pageout_action != page_lock_result)) { + PAGEOUT_PAGES; + } + + vm_object_unlock(object); + + /* + * If we have not already allocated an object + * for a range of pages to be written, do so + * now. + */ + if (new_object == VM_OBJECT_NULL) { + new_object = vm_object_allocate(original_size); + new_offset = 0; + paging_offset = m->offset + + object->paging_offset; + pageout_action = page_lock_result; + } + + /* + * Move or copy the dirty page into the + * new object. + */ + m = vm_pageout_setup(m, + m->offset + object->paging_offset, + new_object, + new_offset, + should_flush); + + /* + * Save the holding page if there is one. + */ + holding_pages[atop(new_offset)] = m; + new_offset += PAGE_SIZE; + last_offset = offset + PAGE_SIZE; + + vm_object_lock(object); + break; + } + break; + } + } + + /* + * We have completed the scan for applicable pages. + * Clean any pages that have been saved. 
+ */ + if (new_object != VM_OBJECT_NULL) { + PAGEOUT_PAGES; + } + + if (IP_VALID(reply_to)) { + vm_object_unlock(object); + + /* consumes our naked send-once/send right for reply_to */ + (void) memory_object_lock_completed(reply_to, reply_to_type, + object->pager_request, original_offset, original_size); + + vm_object_lock(object); + } + + vm_object_paging_end(object); + vm_object_unlock(object); + vm_object_deallocate(object); + + return (KERN_SUCCESS); +} + +static kern_return_t +memory_object_set_attributes_common( + vm_object_t object, + boolean_t may_cache, + memory_object_copy_strategy_t copy_strategy) +{ + if (object == VM_OBJECT_NULL) + return(KERN_INVALID_ARGUMENT); + + /* + * Verify the attributes of importance + */ + + switch(copy_strategy) { + case MEMORY_OBJECT_COPY_NONE: + case MEMORY_OBJECT_COPY_CALL: + case MEMORY_OBJECT_COPY_DELAY: + case MEMORY_OBJECT_COPY_TEMPORARY: + break; + default: + vm_object_deallocate(object); + return(KERN_INVALID_ARGUMENT); + } + + if (may_cache) + may_cache = TRUE; + + vm_object_lock(object); + + /* + * Wake up anyone waiting for the ready attribute + * to become asserted. + */ + + if (!object->pager_ready) { + vm_object_wakeup(object, VM_OBJECT_EVENT_PAGER_READY); + } + + /* + * Copy the attributes + */ + + object->can_persist = may_cache; + object->pager_ready = TRUE; + if (copy_strategy == MEMORY_OBJECT_COPY_TEMPORARY) { + object->temporary = TRUE; + } else { + object->copy_strategy = copy_strategy; + } + + vm_object_unlock(object); + + vm_object_deallocate(object); + + return(KERN_SUCCESS); +} + +/* + * XXX rpd claims that reply_to could be obviated in favor of a client + * XXX stub that made change_attributes an RPC. Need investigation. + */ + +kern_return_t memory_object_change_attributes( + vm_object_t object, + boolean_t may_cache, + memory_object_copy_strategy_t copy_strategy, + ipc_port_t reply_to, + mach_msg_type_name_t reply_to_type) +{ + kern_return_t result; + + /* + * Do the work and throw away our object reference. It + * is important that the object reference be deallocated + * BEFORE sending the reply. The whole point of the reply + * is that it shows up after the terminate message that + * may be generated by setting the object uncacheable. + * + * XXX may_cache may become a tri-valued variable to handle + * XXX uncache if not in use. + */ + result = memory_object_set_attributes_common(object, may_cache, + copy_strategy); + + if (IP_VALID(reply_to)) { + + /* consumes our naked send-once/send right for reply_to */ + (void) memory_object_change_completed(reply_to, reply_to_type, + may_cache, copy_strategy); + + } + + return(result); +} + +kern_return_t memory_object_ready( + vm_object_t object, + boolean_t may_cache, + memory_object_copy_strategy_t copy_strategy) +{ + return memory_object_set_attributes_common(object, may_cache, + copy_strategy); +} + +kern_return_t memory_object_get_attributes( + vm_object_t object, + boolean_t *object_ready, + boolean_t *may_cache, + memory_object_copy_strategy_t *copy_strategy) +{ + if (object == VM_OBJECT_NULL) + return(KERN_INVALID_ARGUMENT); + + vm_object_lock(object); + *may_cache = object->can_persist; + *object_ready = object->pager_ready; + *copy_strategy = object->copy_strategy; + vm_object_unlock(object); + + vm_object_deallocate(object); + + return(KERN_SUCCESS); +} + +/* + * If successful, consumes the supplied naked send right. 
+ */ +kern_return_t vm_set_default_memory_manager( + const host_t host, + ipc_port_t *default_manager) +{ + ipc_port_t current_manager; + ipc_port_t new_manager; + ipc_port_t returned_manager; + + if (host == HOST_NULL) + return(KERN_INVALID_HOST); + + new_manager = *default_manager; + simple_lock(&memory_manager_default_lock); + current_manager = memory_manager_default; + + if (new_manager == IP_NULL) { + /* + * Retrieve the current value. + */ + + returned_manager = ipc_port_copy_send(current_manager); + } else { + /* + * Retrieve the current value, + * and replace it with the supplied value. + * We consume the supplied naked send right. + */ + + returned_manager = current_manager; + memory_manager_default = new_manager; + + /* + * In case anyone's been waiting for a memory + * manager to be established, wake them up. + */ + + thread_wakeup((event_t) &memory_manager_default); + } + + simple_unlock(&memory_manager_default_lock); + + *default_manager = returned_manager; + return(KERN_SUCCESS); +} + +/* + * Routine: memory_manager_default_reference + * Purpose: + * Returns a naked send right for the default + * memory manager. The returned right is always + * valid (not IP_NULL or IP_DEAD). + */ + +ipc_port_t memory_manager_default_reference(void) +{ + ipc_port_t current_manager; + + simple_lock(&memory_manager_default_lock); + + while (current_manager = ipc_port_copy_send(memory_manager_default), + !IP_VALID(current_manager)) { + thread_sleep((event_t) &memory_manager_default, + simple_lock_addr(memory_manager_default_lock), + FALSE); + simple_lock(&memory_manager_default_lock); + } + + simple_unlock(&memory_manager_default_lock); + + return current_manager; +} + +/* + * Routine: memory_manager_default_port + * Purpose: + * Returns true if the receiver for the port + * is the default memory manager. + * + * This is a hack to let ds_read_done + * know when it should keep memory wired. + */ + +boolean_t memory_manager_default_port(const ipc_port_t port) +{ + ipc_port_t current; + boolean_t result; + + simple_lock(&memory_manager_default_lock); + current = memory_manager_default; + if (IP_VALID(current)) { + /* + * There is no point in bothering to lock + * both ports, which would be painful to do. + * If the receive rights are moving around, + * we might be inaccurate. + */ + + result = port->ip_receiver == current->ip_receiver; + } else + result = FALSE; + simple_unlock(&memory_manager_default_lock); + + return result; +} + +void memory_manager_default_init(void) +{ + memory_manager_default = IP_NULL; + simple_lock_init(&memory_manager_default_lock); +} diff --git a/vm/memory_object.h b/vm/memory_object.h new file mode 100644 index 0000000..ee0c963 --- /dev/null +++ b/vm/memory_object.h @@ -0,0 +1,39 @@ +/* + * Mach Operating System + * Copyright (c) 1991 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +#ifndef _VM_MEMORY_OBJECT_H_ +#define _VM_MEMORY_OBJECT_H_ + +#include <mach/boolean.h> +#include <ipc/ipc_types.h> + +extern ipc_port_t memory_manager_default_reference(void); +extern boolean_t memory_manager_default_port(ipc_port_t); +extern void memory_manager_default_init(void); + +extern ipc_port_t memory_manager_default; + +#endif /* _VM_MEMORY_OBJECT_H_ */ diff --git a/vm/memory_object_default.cli b/vm/memory_object_default.cli new file mode 100644 index 0000000..998a986 --- /dev/null +++ b/vm/memory_object_default.cli @@ -0,0 +1,28 @@ +/* + * Copyright (c) 1994 The University of Utah and + * the Computer Systems Laboratory at the University of Utah (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software is hereby + * granted provided that (1) source code retains these copyright, permission, + * and disclaimer notices, and (2) redistributions including binaries + * reproduce the notices in supporting documentation, and (3) all advertising + * materials mentioning features or use of this software display the following + * acknowledgement: ``This product includes software developed by the + * Computer Systems Laboratory at the University of Utah.'' + * + * THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS + * IS" CONDITION. THE UNIVERSITY OF UTAH AND CSL DISCLAIM ANY LIABILITY OF + * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * CSL requests users of this software to return to csl-dist@cs.utah.edu any + * improvements that they make and grant CSL redistribution rights. + * + * Author: Bryan Ford, University of Utah CSL + */ +/* This is a client presentation file. */ + +#define KERNEL_USER 1 +#define SEQNOS 1 + +#include <mach/memory_object_default.defs> diff --git a/vm/memory_object_proxy.c b/vm/memory_object_proxy.c new file mode 100644 index 0000000..5724349 --- /dev/null +++ b/vm/memory_object_proxy.c @@ -0,0 +1,228 @@ +/* memory_object_proxy.c - Proxy memory objects for Mach. + Copyright (C) 2005 Free Software Foundation, Inc. + Written by Marcus Brinkmann. + + This file is part of GNU Mach. + + GNU Mach is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + GNU Mach is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. */ + +/* A proxy memory object is a kernel port that can be used like a real + memory object in a vm_map call, except that the current and maximum + protection are restricted to the proxy object's maximum protection + at the time the mapping is established. The kernel port will hold + a reference to the real memory object for the life time of the + proxy object. 
+ + Note that we don't need to do any reference counting on the proxy + object. Our caller will hold a reference to the proxy object when + looking it up, and is expected to acquire its own reference to the + real memory object if needed before releasing the reference to the + proxy object. + + The user provided real memory object and the maximum protection are + not checked for validity. The maximum protection is only used as a + mask, and the memory object is validated at the time the mapping is + established. */ + +#include <mach/port.h> +#include <mach/kern_return.h> +#include <mach/notify.h> +#include <mach/vm_prot.h> +#include <kern/printf.h> +#include <kern/slab.h> +#include <kern/mach4.server.h> +#include <ipc/ipc_port.h> +#include <ipc/ipc_space.h> + +#include <vm/memory_object_proxy.h> + +/* The cache which holds our proxy memory objects. */ +static struct kmem_cache memory_object_proxy_cache; + +struct memory_object_proxy +{ + struct ipc_port *port; + + ipc_port_t object; + ipc_port_t notify; + vm_prot_t max_protection; + vm_offset_t start; + vm_offset_t len; +}; +typedef struct memory_object_proxy *memory_object_proxy_t; + + +void +memory_object_proxy_init (void) +{ + kmem_cache_init (&memory_object_proxy_cache, "memory_object_proxy", + sizeof (struct memory_object_proxy), 0, NULL, 0); +} + +/* Lookup a proxy memory object by its port. */ +static memory_object_proxy_t +memory_object_proxy_port_lookup (ipc_port_t port) +{ + memory_object_proxy_t proxy; + + if (!IP_VALID(port)) + return 0; + + ip_lock (port); + if (ip_active (port) && (ip_kotype (port) == IKOT_PAGER_PROXY)) + proxy = (memory_object_proxy_t) port->ip_kobject; + else + proxy = 0; + ip_unlock (port); + return proxy; +} + + +/* Process a no-sender notification for the proxy memory object + port. */ +boolean_t +memory_object_proxy_notify (mach_msg_header_t *msg) +{ + if (msg->msgh_id == MACH_NOTIFY_NO_SENDERS) + { + memory_object_proxy_t proxy; + mach_no_senders_notification_t *ns; + + ns = (mach_no_senders_notification_t *) msg; + + proxy = (memory_object_proxy_t) + ((ipc_port_t) ns->not_header.msgh_remote_port)->ip_kobject; + if (!proxy) + return FALSE; + if ((ipc_port_t) ns->not_header.msgh_remote_port != proxy->notify) + return FALSE; + + ipc_port_release_send (proxy->object); + + ipc_kobject_set (proxy->port, IKO_NULL, IKOT_NONE); + ipc_port_dealloc_kernel (proxy->port); + ipc_kobject_set (proxy->notify, IKO_NULL, IKOT_NONE); + ipc_port_dealloc_kernel (proxy->notify); + + kmem_cache_free (&memory_object_proxy_cache, (vm_offset_t) proxy); + + return TRUE; + } + + printf ("memory_object_proxy_notify: strange notification %d\n", + msg->msgh_id); + return FALSE; +} + + +/* Create a new proxy memory object from [START;START+LEN) in the + given OBJECT at OFFSET in the new object with the maximum + protection MAX_PROTECTION and return it in *PORT. */ +kern_return_t +memory_object_create_proxy (ipc_space_t space, vm_prot_t max_protection, + ipc_port_t *object, natural_t object_count, + rpc_vm_offset_t *offset, natural_t offset_count, + rpc_vm_offset_t *start, natural_t start_count, + rpc_vm_size_t *len, natural_t len_count, + ipc_port_t *port) +{ + memory_object_proxy_t proxy; + ipc_port_t notify; + + if (space == IS_NULL) + return KERN_INVALID_TASK; + + if (offset_count != object_count || start_count != object_count + || len_count != object_count) + return KERN_INVALID_ARGUMENT; + + /* FIXME: Support more than one memory object. 
*/ + if (object_count != 1) + return KERN_INVALID_ARGUMENT; + + if (!IP_VALID(object[0])) + return KERN_INVALID_NAME; + + /* FIXME: Support a different offset from 0. */ + if (offset[0] != 0) + return KERN_INVALID_ARGUMENT; + + if (start[0] + len[0] < start[0]) + return KERN_INVALID_ARGUMENT; + + proxy = (memory_object_proxy_t) kmem_cache_alloc (&memory_object_proxy_cache); + + /* Allocate port, keeping a reference for it. */ + proxy->port = ipc_port_alloc_kernel (); + if (proxy->port == IP_NULL) + { + kmem_cache_free (&memory_object_proxy_cache, (vm_offset_t) proxy); + return KERN_RESOURCE_SHORTAGE; + } + /* Associate the port with the proxy memory object. */ + ipc_kobject_set (proxy->port, (ipc_kobject_t) proxy, IKOT_PAGER_PROXY); + + /* Request no-senders notifications on the port. */ + proxy->notify = ipc_port_alloc_kernel (); + ipc_kobject_set (proxy->notify, (ipc_kobject_t) proxy, IKOT_PAGER_PROXY); + notify = ipc_port_make_sonce (proxy->notify); + ip_lock (proxy->port); + ipc_port_nsrequest (proxy->port, 1, notify, ¬ify); + assert (notify == IP_NULL); + + /* Consumes the port right */ + proxy->object = object[0]; + proxy->max_protection = max_protection; + proxy->start = start[0]; + proxy->len = len[0]; + + *port = ipc_port_make_send (proxy->port); + return KERN_SUCCESS; +} + +/* Lookup the real memory object and maximum protection for the proxy + memory object port PORT, for which the caller holds a reference. + *OBJECT is only guaranteed to be valid as long as the caller holds + the reference to PORT (unless the caller acquires its own reference + to it). If PORT is not a proxy memory object, return + KERN_INVALID_ARGUMENT. */ +kern_return_t +memory_object_proxy_lookup (ipc_port_t port, ipc_port_t *object, + vm_prot_t *max_protection, vm_offset_t *start, + vm_offset_t *len) +{ + memory_object_proxy_t proxy; + + proxy = memory_object_proxy_port_lookup (port); + if (!proxy) + return KERN_INVALID_ARGUMENT; + + *max_protection = proxy->max_protection; + *start = 0; + *len = (vm_offset_t) ~0; + + do + { + *object = proxy->object; + if (proxy->len <= *start) + *len = 0; + else + *len = MIN(*len, proxy->len - *start); + *start += proxy->start; + } + while ((proxy = memory_object_proxy_port_lookup (proxy->object))); + + return KERN_SUCCESS; +} diff --git a/vm/memory_object_proxy.h b/vm/memory_object_proxy.h new file mode 100644 index 0000000..8b3f202 --- /dev/null +++ b/vm/memory_object_proxy.h @@ -0,0 +1,39 @@ +/* memory_object_proxy.h - Proxy memory objects for Mach. + Copyright (C) 2005, 2011 Free Software Foundation, Inc. + Written by Marcus Brinkmann. + + This file is part of GNU Mach. + + GNU Mach is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + GNU Mach is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
*/ + +#ifndef _VM_MEMORY_OBJECT_PROXY_H_ +#define _VM_MEMORY_OBJECT_PROXY_H_ + +#include <ipc/ipc_types.h> +#include <mach/boolean.h> +#include <mach/machine/kern_return.h> +#include <mach/machine/vm_types.h> +#include <mach/message.h> +#include <mach/vm_prot.h> + +extern void memory_object_proxy_init (void); +extern boolean_t memory_object_proxy_notify (mach_msg_header_t *msg); +extern kern_return_t memory_object_proxy_lookup (ipc_port_t port, + ipc_port_t *object, + vm_prot_t *max_protection, + vm_offset_t *start, + vm_offset_t *len); + +#endif /* _VM_MEMORY_OBJECT_PROXY_H_ */ diff --git a/vm/memory_object_user.cli b/vm/memory_object_user.cli new file mode 100644 index 0000000..2bba41f --- /dev/null +++ b/vm/memory_object_user.cli @@ -0,0 +1,28 @@ +/* + * Copyright (c) 1994 The University of Utah and + * the Computer Systems Laboratory at the University of Utah (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software is hereby + * granted provided that (1) source code retains these copyright, permission, + * and disclaimer notices, and (2) redistributions including binaries + * reproduce the notices in supporting documentation, and (3) all advertising + * materials mentioning features or use of this software display the following + * acknowledgement: ``This product includes software developed by the + * Computer Systems Laboratory at the University of Utah.'' + * + * THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS + * IS" CONDITION. THE UNIVERSITY OF UTAH AND CSL DISCLAIM ANY LIABILITY OF + * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * CSL requests users of this software to return to csl-dist@cs.utah.edu any + * improvements that they make and grant CSL redistribution rights. + * + * Author: Bryan Ford, University of Utah CSL + */ +/* This is a client presentation file. */ + +#define KERNEL_USER 1 +#define SEQNOS 1 + +#include <mach/memory_object.defs> diff --git a/vm/pmap.h b/vm/pmap.h new file mode 100644 index 0000000..aca9ada --- /dev/null +++ b/vm/pmap.h @@ -0,0 +1,241 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/pmap.h + * Author: Avadis Tevanian, Jr. + * Date: 1985 + * + * Machine address mapping definitions -- machine-independent + * section. [For machine-dependent section, see "machine/pmap.h".] 
+ */ + +#ifndef _VM_PMAP_H_ +#define _VM_PMAP_H_ + +#include <machine/pmap.h> +#include <mach/machine/vm_types.h> +#include <mach/vm_prot.h> +#include <mach/boolean.h> +#include <kern/thread.h> + +/* + * The following is a description of the interface to the + * machine-dependent "physical map" data structure. The module + * must provide a "pmap_t" data type that represents the + * set of valid virtual-to-physical addresses for one user + * address space. [The kernel address space is represented + * by a distinguished "pmap_t".] The routines described manage + * this type, install and update virtual-to-physical mappings, + * and perform operations on physical addresses common to + * many address spaces. + */ + +/* + * Routines used for initialization. + * There is traditionally also a pmap_bootstrap, + * used very early by machine-dependent code, + * but it is not part of the interface. + */ + +/* During VM initialization, steal a chunk of memory. */ +extern vm_offset_t pmap_steal_memory(vm_size_t); +/* Initialization, after kernel runs in virtual memory. */ +extern void pmap_init(void); + +#ifndef MACHINE_PAGES +/* + * If machine/pmap.h defines MACHINE_PAGES, it must implement + * the above functions. The pmap module has complete control. + * Otherwise, it must implement + * pmap_virtual_space + * pmap_init + * and vm/vm_resident.c implements pmap_steal_memory using + * pmap_virtual_space and pmap_enter. + */ + +/* During VM initialization, report virtual space available for the kernel. */ +extern void pmap_virtual_space(vm_offset_t *, vm_offset_t *); +#endif /* MACHINE_PAGES */ + +/* + * Routines to manage the physical map data structure. + */ + +/* Create a pmap_t. */ +pmap_t pmap_create(vm_size_t size); + +/* Return the kernel's pmap_t. */ +#ifndef pmap_kernel +extern pmap_t pmap_kernel(void); +#endif /* pmap_kernel */ + +/* Gain and release a reference. */ +extern void pmap_reference(pmap_t pmap); +extern void pmap_destroy(pmap_t pmap); + +/* Enter a mapping */ +extern void pmap_enter(pmap_t pmap, vm_offset_t va, phys_addr_t pa, + vm_prot_t prot, boolean_t wired); + + +/* + * Routines that operate on ranges of virtual addresses. + */ + +/* Remove mappings. */ +void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); + +/* Change protections. */ +void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot); + +/* + * Routines to set up hardware state for physical maps to be used. + */ +/* Prepare pmap_t to run on a given processor. */ +extern void pmap_activate(pmap_t, thread_t, int); +/* Release pmap_t from use on processor. */ +extern void pmap_deactivate(pmap_t, thread_t, int); + + +/* + * Routines that operate on physical addresses. + */ + +/* Restrict access to page. */ +void pmap_page_protect(phys_addr_t pa, vm_prot_t prot); + +/* + * Routines to manage reference/modify bits based on + * physical addresses, simulating them if not provided + * by the hardware. + */ + +/* Clear reference bit */ +void pmap_clear_reference(phys_addr_t pa); + +/* Return reference bit */ +#ifndef pmap_is_referenced +boolean_t pmap_is_referenced(phys_addr_t pa); +#endif /* pmap_is_referenced */ + +/* Clear modify bit */ +void pmap_clear_modify(phys_addr_t pa); + +/* Return modify bit */ +boolean_t pmap_is_modified(phys_addr_t pa); + +/* + * Sundry required routines + */ +/* Return a virtual-to-physical mapping, if possible. */ +extern phys_addr_t pmap_extract(pmap_t, vm_offset_t); +/* Perform garbage collection, if any. 
*/ +extern void pmap_collect(pmap_t); + +/* Lookup an address. */ +int pmap_whatis(pmap_t, vm_offset_t); + +/* Specify pageability. */ +extern void pmap_change_wiring(pmap_t, vm_offset_t, boolean_t); + +/* + * Optional routines + */ +#ifndef pmap_copy +/* Copy range of mappings, if desired. */ +extern void pmap_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, + vm_offset_t); +#endif /* pmap_copy */ +#ifndef pmap_attribute +/* Get/Set special memory attributes. */ +extern kern_return_t pmap_attribute(void); +#endif /* pmap_attribute */ + +/* + * Grab a physical page: + * the standard memory allocation mechanism + * during system initialization. + */ +extern vm_offset_t pmap_grab_page (void); + +/* + * Make the specified pages (by pmap, offset) + * pageable (or not) as requested. + */ +extern void pmap_pageable( + pmap_t pmap, + vm_offset_t start, + vm_offset_t end, + boolean_t pageable); + +/* + * Back-door routine for mapping kernel VM at initialization. + * Useful for mapping memory outside the range of direct mapped + * physical memory (i.e., devices). + */ +extern vm_offset_t pmap_map_bd( + vm_offset_t virt, + phys_addr_t start, + phys_addr_t end, + vm_prot_t prot); + +/* + * Routines defined as macros. + */ +#ifndef PMAP_ACTIVATE_USER +#define PMAP_ACTIVATE_USER(pmap, thread, cpu) { \ + if ((pmap) != kernel_pmap) \ + PMAP_ACTIVATE(pmap, thread, cpu); \ +} +#endif /* PMAP_ACTIVATE_USER */ + +#ifndef PMAP_DEACTIVATE_USER +#define PMAP_DEACTIVATE_USER(pmap, thread, cpu) { \ + if ((pmap) != kernel_pmap) \ + PMAP_DEACTIVATE(pmap, thread, cpu); \ +} +#endif /* PMAP_DEACTIVATE_USER */ + +#ifndef PMAP_ACTIVATE_KERNEL +#define PMAP_ACTIVATE_KERNEL(cpu) \ + PMAP_ACTIVATE(kernel_pmap, THREAD_NULL, cpu) +#endif /* PMAP_ACTIVATE_KERNEL */ + +#ifndef PMAP_DEACTIVATE_KERNEL +#define PMAP_DEACTIVATE_KERNEL(cpu) \ + PMAP_DEACTIVATE(kernel_pmap, THREAD_NULL, cpu) +#endif /* PMAP_DEACTIVATE_KERNEL */ + +/* + * Exported data structures + */ + +extern pmap_t kernel_pmap; /* The kernel's map */ + +#endif /* _VM_PMAP_H_ */ diff --git a/vm/vm_debug.c b/vm/vm_debug.c new file mode 100644 index 0000000..b0dace8 --- /dev/null +++ b/vm/vm_debug.c @@ -0,0 +1,548 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_debug.c. + * Author: Rich Draves + * Date: March, 1990 + * + * Exported kernel calls. See mach_debug/mach_debug.defs. 
+ */ + +#include <string.h> + +#include <kern/debug.h> +#include <kern/thread.h> +#include <mach/kern_return.h> +#include <mach/machine/vm_types.h> +#include <mach/memory_object.h> +#include <mach/vm_prot.h> +#include <mach/vm_inherit.h> +#include <mach/vm_param.h> +#include <mach_debug/vm_info.h> +#include <mach_debug/hash_info.h> +#include <vm/vm_map.h> +#include <vm/vm_kern.h> +#include <vm/vm_object.h> +#include <kern/mach_debug.server.h> +#include <kern/task.h> +#include <kern/host.h> +#include <kern/printf.h> +#include <ipc/ipc_port.h> + + +#if MACH_VM_DEBUG + +/* + * Routine: vm_object_real_name + * Purpose: + * Convert a VM object to a name port. + * Conditions: + * Takes object and port locks. + * Returns: + * A naked send right for the object's name port, + * or IP_NULL if the object or its name port is null. + */ + +static ipc_port_t +vm_object_real_name(vm_object_t object) +{ + ipc_port_t port = IP_NULL; + + if (object != VM_OBJECT_NULL) { + vm_object_lock(object); + if (object->pager_name != IP_NULL) + port = ipc_port_make_send(object->pager_name); + vm_object_unlock(object); + } + + return port; +} + +/* + * Routine: mach_vm_region_info [kernel call] + * Purpose: + * Retrieve information about a VM region, + * including info about the object chain. + * Conditions: + * Nothing locked. + * Returns: + * KERN_SUCCESS Retrieve region/object info. + * KERN_INVALID_TASK The map is null. + * KERN_NO_SPACE There is no entry at/after the address. + */ + +kern_return_t +mach_vm_region_info( + vm_map_t map, + vm_offset_t address, + vm_region_info_t *regionp, + ipc_port_t *portp) +{ + vm_map_t cmap; /* current map in traversal */ + vm_map_t nmap; /* next map to look at */ + vm_map_entry_t entry; /* entry in current map */ + vm_object_t object; + + if (map == VM_MAP_NULL) + return KERN_INVALID_TASK; + + /* find the entry containing (or following) the address */ + + vm_map_lock_read(map); + for (cmap = map;;) { + /* cmap is read-locked */ + + if (!vm_map_lookup_entry(cmap, address, &entry)) { + entry = entry->vme_next; + if (entry == vm_map_to_entry(cmap)) { + if (map == cmap) { + vm_map_unlock_read(cmap); + return KERN_NO_SPACE; + } + + /* back out to top-level & skip this submap */ + + address = vm_map_max(cmap); + vm_map_unlock_read(cmap); + vm_map_lock_read(map); + cmap = map; + continue; + } + } + + if (entry->is_sub_map) { + /* move down to the sub map */ + + nmap = entry->object.sub_map; + vm_map_lock_read(nmap); + vm_map_unlock_read(cmap); + cmap = nmap; + continue; + } else { + break; + } + /*NOTREACHED*/ + } + + + assert(entry->vme_start < entry->vme_end); + + regionp->vri_start = entry->vme_start; + regionp->vri_end = entry->vme_end; + + /* attributes from the real entry */ + + regionp->vri_protection = entry->protection; + regionp->vri_max_protection = entry->max_protection; + regionp->vri_inheritance = entry->inheritance; + regionp->vri_wired_count = !!entry->wired_count; /* Doesn't stack */ + regionp->vri_user_wired_count = regionp->vri_wired_count; /* Obsolete */ + + object = entry->object.vm_object; + *portp = vm_object_real_name(object); + regionp->vri_object = (vm_offset_t) object; + regionp->vri_offset = entry->offset; + regionp->vri_needs_copy = entry->needs_copy; + + regionp->vri_sharing = entry->is_shared; + + vm_map_unlock_read(cmap); + return KERN_SUCCESS; +} + +/* + * Routine: mach_vm_object_info [kernel call] + * Purpose: + * Retrieve information about a VM object. + * Conditions: + * Nothing locked. + * Returns: + * KERN_SUCCESS Retrieved object info. 
+ * KERN_INVALID_ARGUMENT The object is null. + */ + +kern_return_t +mach_vm_object_info( + vm_object_t object, + vm_object_info_t *infop, + ipc_port_t *shadowp, + ipc_port_t *copyp) +{ + vm_object_info_t info; + vm_object_info_state_t state; + ipc_port_t shadow, copy; + + if (object == VM_OBJECT_NULL) + return KERN_INVALID_ARGUMENT; + + /* + * Because of lock-ordering/deadlock considerations, + * we can't use vm_object_real_name for the copy object. + */ + + retry: + vm_object_lock(object); + copy = IP_NULL; + if (object->copy != VM_OBJECT_NULL) { + if (!vm_object_lock_try(object->copy)) { + vm_object_unlock(object); + simple_lock_pause(); /* wait a bit */ + goto retry; + } + + if (object->copy->pager_name != IP_NULL) + copy = ipc_port_make_send(object->copy->pager_name); + vm_object_unlock(object->copy); + } + shadow = vm_object_real_name(object->shadow); + + info.voi_object = (vm_offset_t) object; + info.voi_pagesize = PAGE_SIZE; + info.voi_size = object->size; + info.voi_ref_count = object->ref_count; + info.voi_resident_page_count = object->resident_page_count; + info.voi_absent_count = object->absent_count; + info.voi_copy = (vm_offset_t) object->copy; + info.voi_shadow = (vm_offset_t) object->shadow; + info.voi_shadow_offset = object->shadow_offset; + info.voi_paging_offset = object->paging_offset; + info.voi_copy_strategy = object->copy_strategy; + info.voi_last_alloc = object->last_alloc; + info.voi_paging_in_progress = object->paging_in_progress; + + state = 0; + if (object->pager_created) + state |= VOI_STATE_PAGER_CREATED; + if (object->pager_initialized) + state |= VOI_STATE_PAGER_INITIALIZED; + if (object->pager_ready) + state |= VOI_STATE_PAGER_READY; + if (object->can_persist) + state |= VOI_STATE_CAN_PERSIST; + if (object->internal) + state |= VOI_STATE_INTERNAL; + if (object->temporary) + state |= VOI_STATE_TEMPORARY; + if (object->alive) + state |= VOI_STATE_ALIVE; + if (object->lock_in_progress) + state |= VOI_STATE_LOCK_IN_PROGRESS; + if (object->lock_restart) + state |= VOI_STATE_LOCK_RESTART; + info.voi_state = state; + vm_object_unlock(object); + + *infop = info; + *shadowp = shadow; + *copyp = copy; + return KERN_SUCCESS; +} + +#define VPI_STATE_NODATA (VPI_STATE_BUSY|VPI_STATE_FICTITIOUS| \ + VPI_STATE_PRIVATE|VPI_STATE_ABSENT) + +/* + * Routine: mach_vm_object_pages/mach_vm_object_pages_phys/ [kernel call] + * Purpose: + * Retrieve information about the pages in a VM object. + * Conditions: + * Nothing locked. Obeys CountInOut protocol. + * Returns: + * KERN_SUCCESS Retrieved object info. + * KERN_INVALID_ARGUMENT The object is null. + * KERN_RESOURCE_SHORTAGE Couldn't allocate memory. 
+ */ + +static kern_return_t +_mach_vm_object_pages( + vm_object_t object, + void* *pagesp, + natural_t *countp, + int phys) +{ + vm_size_t size; + vm_offset_t addr; + void *pages; + unsigned int potential, actual, count; + vm_page_t p; + kern_return_t kr; + + if (object == VM_OBJECT_NULL) + return KERN_INVALID_ARGUMENT; + + /* start with in-line memory */ + + pages = *pagesp; + potential = *countp; + + for (size = 0;;) { + vm_object_lock(object); + actual = object->resident_page_count; + if (actual <= potential) + break; + vm_object_unlock(object); + + if (pages != *pagesp) + kmem_free(ipc_kernel_map, addr, size); + + if (phys) + size = round_page(actual * sizeof(vm_page_phys_info_t)); + else + size = round_page(actual * sizeof(vm_page_info_t)); + kr = kmem_alloc(ipc_kernel_map, &addr, size); + if (kr != KERN_SUCCESS) + return kr; + + pages = (void *) addr; + if (phys) + potential = size / sizeof(vm_page_phys_info_t); + else + potential = size / sizeof(vm_page_info_t); + } + /* object is locked, we have enough wired memory */ + + count = 0; + queue_iterate(&object->memq, p, vm_page_t, listq) { + vm_page_info_t *info = NULL; + vm_page_phys_info_t *info_phys = NULL; + + if (phys) + info_phys = pages + count * sizeof(*info_phys); + else + info = pages + count * sizeof(*info); + count++; + + vm_page_info_state_t state = 0; + + if (phys) { + info_phys->vpi_offset = p->offset; + if (p->phys_addr != (typeof(info_phys->vpi_phys_addr)) p->phys_addr) + printf("warning: physical address overflow in mach_vm_object_pages!!\n"); + info_phys->vpi_phys_addr = p->phys_addr; + info_phys->vpi_wire_count = p->wire_count; + info_phys->vpi_page_lock = p->page_lock; + info_phys->vpi_unlock_request = p->unlock_request; + } else { + info->vpi_offset = p->offset; + if (p->phys_addr != (typeof(info->vpi_phys_addr)) p->phys_addr) + printf("warning: physical address overflow in mach_vm_object_pages!!\n"); + info->vpi_phys_addr = p->phys_addr; + info->vpi_wire_count = p->wire_count; + info->vpi_page_lock = p->page_lock; + info->vpi_unlock_request = p->unlock_request; + } + + if (p->busy) + state |= VPI_STATE_BUSY; + if (p->wanted) + state |= VPI_STATE_WANTED; + if (p->tabled) + state |= VPI_STATE_TABLED; + if (p->fictitious) + state |= VPI_STATE_FICTITIOUS; + if (p->private) + state |= VPI_STATE_PRIVATE; + if (p->absent) + state |= VPI_STATE_ABSENT; + if (p->error) + state |= VPI_STATE_ERROR; + if (p->dirty) + state |= VPI_STATE_DIRTY; + if (p->precious) + state |= VPI_STATE_PRECIOUS; + if (p->overwriting) + state |= VPI_STATE_OVERWRITING; + + if (((state & (VPI_STATE_NODATA|VPI_STATE_DIRTY)) == 0) && + pmap_is_modified(p->phys_addr)) { + state |= VPI_STATE_DIRTY; + p->dirty = TRUE; + } + + vm_page_lock_queues(); + if (p->inactive) + state |= VPI_STATE_INACTIVE; + if (p->active) + state |= VPI_STATE_ACTIVE; + if (p->laundry) + state |= VPI_STATE_LAUNDRY; + if (p->free) + state |= VPI_STATE_FREE; + if (p->reference) + state |= VPI_STATE_REFERENCE; + + if (((state & (VPI_STATE_NODATA|VPI_STATE_REFERENCE)) == 0) && + pmap_is_referenced(p->phys_addr)) { + state |= VPI_STATE_REFERENCE; + p->reference = TRUE; + } + vm_page_unlock_queues(); + + if (phys) + info_phys->vpi_state = state; + else + info->vpi_state = state; + } + + if (object->resident_page_count != count) + panic("mach_vm_object_pages"); + vm_object_unlock(object); + + if (pages == *pagesp) { + /* data fit in-line; nothing to deallocate */ + + *countp = actual; + } else if (actual == 0) { + kmem_free(ipc_kernel_map, addr, size); + + *countp = 0; + } else { + 
vm_size_t size_used, rsize_used; + vm_map_copy_t copy; + + /* kmem_alloc doesn't zero memory */ + + if (phys) + size_used = actual * sizeof(vm_page_phys_info_t); + else + size_used = actual * sizeof(vm_page_info_t); + rsize_used = round_page(size_used); + + if (rsize_used != size) + kmem_free(ipc_kernel_map, + addr + rsize_used, size - rsize_used); + + if (size_used != rsize_used) + memset((void *) (addr + size_used), 0, + rsize_used - size_used); + + kr = vm_map_copyin(ipc_kernel_map, addr, rsize_used, + TRUE, ©); + assert(kr == KERN_SUCCESS); + + *pagesp = (void *) copy; + *countp = actual; + } + + return KERN_SUCCESS; +} + +kern_return_t +mach_vm_object_pages( + vm_object_t object, + vm_page_info_array_t *pagesp, + natural_t *countp) +{ + return _mach_vm_object_pages(object, (void**) pagesp, countp, 0); +} + +kern_return_t +mach_vm_object_pages_phys( + vm_object_t object, + vm_page_phys_info_array_t *pagesp, + natural_t *countp) +{ + return _mach_vm_object_pages(object, (void**) pagesp, countp, 1); +} + +#endif /* MACH_VM_DEBUG */ + +/* + * Routine: host_virtual_physical_table_info + * Purpose: + * Return information about the VP table. + * Conditions: + * Nothing locked. Obeys CountInOut protocol. + * Returns: + * KERN_SUCCESS Returned information. + * KERN_INVALID_HOST The host is null. + * KERN_RESOURCE_SHORTAGE Couldn't allocate memory. + */ + +kern_return_t +host_virtual_physical_table_info(const host_t host, + hash_info_bucket_array_t *infop, natural_t *countp) +{ + vm_offset_t addr; + vm_size_t size = 0;/* '=0' to quiet gcc warnings */ + hash_info_bucket_t *info; + unsigned int potential, actual; + kern_return_t kr; + + if (host == HOST_NULL) + return KERN_INVALID_HOST; + + /* start with in-line data */ + + info = *infop; + potential = *countp; + + for (;;) { + actual = vm_page_info(info, potential); + if (actual <= potential) + break; + + /* allocate more memory */ + + if (info != *infop) + kmem_free(ipc_kernel_map, addr, size); + + size = round_page(actual * sizeof *info); + kr = kmem_alloc_pageable(ipc_kernel_map, &addr, size); + if (kr != KERN_SUCCESS) + return KERN_RESOURCE_SHORTAGE; + + info = (hash_info_bucket_t *) addr; + potential = size/sizeof *info; + } + + if (info == *infop) { + /* data fit in-line; nothing to deallocate */ + + *countp = actual; + } else if (actual == 0) { + kmem_free(ipc_kernel_map, addr, size); + + *countp = 0; + } else { + vm_map_copy_t copy; + vm_size_t used; + + used = round_page(actual * sizeof *info); + + if (used != size) + kmem_free(ipc_kernel_map, addr + used, size - used); + + kr = vm_map_copyin(ipc_kernel_map, addr, used, + TRUE, ©); + assert(kr == KERN_SUCCESS); + + *infop = (hash_info_bucket_t *) copy; + *countp = actual; + } + + return KERN_SUCCESS; +} diff --git a/vm/vm_external.c b/vm/vm_external.c new file mode 100644 index 0000000..99f4b9c --- /dev/null +++ b/vm/vm_external.c @@ -0,0 +1,151 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. 
CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * This module maintains information about the presence of + * pages not in memory. Since an external memory object + * must maintain a complete knowledge of its contents, this + * information takes the form of hints. + */ + +#include <mach/boolean.h> +#include <kern/slab.h> +#include <vm/vm_external.h> +#include <mach/vm_param.h> +#include <kern/assert.h> +#include <string.h> + + + +boolean_t vm_external_unsafe = FALSE; + +struct kmem_cache vm_external_cache; + +/* + * The implementation uses bit arrays to record whether + * a page has been written to external storage. For + * convenience, these bit arrays come in two sizes + * (measured in bytes). + */ + +#define SMALL_SIZE (VM_EXTERNAL_SMALL_SIZE/8) +#define LARGE_SIZE (VM_EXTERNAL_LARGE_SIZE/8) + +struct kmem_cache vm_object_small_existence_map_cache; +struct kmem_cache vm_object_large_existence_map_cache; + + +vm_external_t vm_external_create(vm_offset_t size) +{ + vm_external_t result; + vm_size_t bytes; + + result = (vm_external_t) kmem_cache_alloc(&vm_external_cache); + result->existence_map = (char *) 0; + + bytes = (atop(size) + 07) >> 3; + if (bytes <= SMALL_SIZE) { + result->existence_map = + (char *) kmem_cache_alloc(&vm_object_small_existence_map_cache); + result->existence_size = SMALL_SIZE; + } else { + result->existence_map = + (char *) kmem_cache_alloc(&vm_object_large_existence_map_cache); + result->existence_size = LARGE_SIZE; + } + memset (result->existence_map, 0, result->existence_size); + return(result); +} + +void vm_external_destroy(vm_external_t e) +{ + if (e == VM_EXTERNAL_NULL) + return; + + if (e->existence_map != (char *) 0) { + if (e->existence_size <= SMALL_SIZE) { + kmem_cache_free(&vm_object_small_existence_map_cache, + (vm_offset_t) e->existence_map); + } else { + kmem_cache_free(&vm_object_large_existence_map_cache, + (vm_offset_t) e->existence_map); + } + } + kmem_cache_free(&vm_external_cache, (vm_offset_t) e); +} + +vm_external_state_t _vm_external_state_get(const vm_external_t e, + vm_offset_t offset) +{ + unsigned + int bit, byte; + + if (vm_external_unsafe || + (e == VM_EXTERNAL_NULL) || + (e->existence_map == (char *) 0)) + return(VM_EXTERNAL_STATE_UNKNOWN); + + bit = atop(offset); + byte = bit >> 3; + if (byte >= e->existence_size) return (VM_EXTERNAL_STATE_UNKNOWN); + return( (e->existence_map[byte] & (1 << (bit & 07))) ? 
+ VM_EXTERNAL_STATE_EXISTS : VM_EXTERNAL_STATE_ABSENT ); +} + +void vm_external_state_set( + vm_external_t e, + vm_offset_t offset, + vm_external_state_t state) +{ + unsigned + int bit, byte; + + if ((e == VM_EXTERNAL_NULL) || (e->existence_map == (char *) 0)) + return; + + if (state != VM_EXTERNAL_STATE_EXISTS) + return; + + bit = atop(offset); + byte = bit >> 3; + if (byte >= e->existence_size) return; + e->existence_map[byte] |= (1 << (bit & 07)); +} + +void vm_external_module_initialize(void) +{ + vm_size_t size = (vm_size_t) sizeof(struct vm_external); + + kmem_cache_init(&vm_external_cache, "vm_external", size, 0, + NULL, 0); + + kmem_cache_init(&vm_object_small_existence_map_cache, + "small_existence_map", SMALL_SIZE, 0, + NULL, 0); + + kmem_cache_init(&vm_object_large_existence_map_cache, + "large_existence_map", LARGE_SIZE, 0, + NULL, 0); +} diff --git a/vm/vm_external.h b/vm/vm_external.h new file mode 100644 index 0000000..4e44ddf --- /dev/null +++ b/vm/vm_external.h @@ -0,0 +1,95 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ + +#ifndef _VM_VM_EXTERNAL_H_ +#define _VM_VM_EXTERNAL_H_ + +/* + * External page management hint technology + * + * The data structure exported by this module maintains + * a (potentially incomplete) map of the pages written + * to external storage for a range of virtual memory. + */ + +/* + * The data structure representing the state of pages + * on external storage. + */ + +typedef struct vm_external { + int existence_size; /* Size of the following bitmap */ + char *existence_map; /* A bitmap of pages that have + * been written to backing + * storage. + */ +#if 0 + /* XXX: Currently, existence_count is not used. I guess it + could be useful to get rid of the map if the count drops to + zero. */ + int existence_count;/* Number of bits turned on in + * existence_map. + */ +#endif +} *vm_external_t; + +#define VM_EXTERNAL_NULL ((vm_external_t) 0) + +#define VM_EXTERNAL_SMALL_SIZE 128 +#define VM_EXTERNAL_LARGE_SIZE 8192 + +/* + * The states that may be recorded for a page of external storage. + */ + +typedef int vm_external_state_t; +#define VM_EXTERNAL_STATE_EXISTS 1 +#define VM_EXTERNAL_STATE_UNKNOWN 2 +#define VM_EXTERNAL_STATE_ABSENT 3 + + +/* + * Routines exported by this module. + */ + +/* Initialize the module */ +extern void vm_external_module_initialize(void); +/* Create a vm_external_t */ +extern vm_external_t vm_external_create(vm_offset_t); +/* Destroy one */ +extern void vm_external_destroy(vm_external_t); + +/* Set state of a page. 
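   Only VM_EXTERNAL_STATE_EXISTS is actually recorded; other states are
   ignored.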
*/ +extern void vm_external_state_set(vm_external_t, vm_offset_t, + vm_external_state_t); +/* Retrieve the state for a given page, if known. */ +#define vm_external_state_get(e,offset) (((e) != VM_EXTERNAL_NULL) ? \ + _vm_external_state_get(e, offset) : \ + VM_EXTERNAL_STATE_UNKNOWN) +/* HIDDEN routine */ +extern vm_external_state_t _vm_external_state_get(vm_external_t, vm_offset_t); + +#endif /* _VM_VM_EXTERNAL_H_ */ diff --git a/vm/vm_fault.c b/vm/vm_fault.c new file mode 100644 index 0000000..c6e2800 --- /dev/null +++ b/vm/vm_fault.c @@ -0,0 +1,2136 @@ +/* + * Mach Operating System + * Copyright (c) 1994,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm_fault.c + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * + * Page fault handling module. + */ + +#include <kern/printf.h> +#include <vm/vm_fault.h> +#include <mach/kern_return.h> +#include <mach/message.h> /* for error codes */ +#include <kern/counters.h> +#include <kern/debug.h> +#include <kern/thread.h> +#include <kern/sched_prim.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/pmap.h> +#include <mach/vm_statistics.h> +#include <vm/vm_pageout.h> +#include <mach/vm_param.h> +#include <mach/memory_object.h> +#include <vm/memory_object_user.user.h> + /* For memory_object_data_{request,unlock} */ +#include <kern/macros.h> +#include <kern/slab.h> + +#if MACH_PCSAMPLE +#include <kern/pc_sample.h> +#endif + + + +/* + * State needed by vm_fault_continue. + * This is a little hefty to drop directly + * into the thread structure. + */ +typedef struct vm_fault_state { + struct vm_map *vmf_map; + vm_offset_t vmf_vaddr; + vm_prot_t vmf_fault_type; + boolean_t vmf_change_wiring; + vm_fault_continuation_t vmf_continuation; + vm_map_version_t vmf_version; + boolean_t vmf_wired; + struct vm_object *vmf_object; + vm_offset_t vmf_offset; + vm_prot_t vmf_prot; + + boolean_t vmfp_backoff; + struct vm_object *vmfp_object; + vm_offset_t vmfp_offset; + struct vm_page *vmfp_first_m; + vm_prot_t vmfp_access; +} vm_fault_state_t; + +struct kmem_cache vm_fault_state_cache; + +int vm_object_absent_max = 50; + +boolean_t vm_fault_dirty_handling = FALSE; +boolean_t vm_fault_interruptible = TRUE; + +boolean_t software_reference_bits = TRUE; + +#if MACH_KDB +extern struct db_watchpoint *db_watchpoint_list; +#endif /* MACH_KDB */ + +/* + * Routine: vm_fault_init + * Purpose: + * Initialize our private data structures. 
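+ *	(This sets up the cache of vm_fault_state structures used when a
+ *	fault must save its state across a continuation.)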
+ */ +void vm_fault_init(void) +{ + kmem_cache_init(&vm_fault_state_cache, "vm_fault_state", + sizeof(vm_fault_state_t), 0, NULL, 0); +} + +/* + * Routine: vm_fault_cleanup + * Purpose: + * Clean up the result of vm_fault_page. + * Results: + * The paging reference for "object" is released. + * "object" is unlocked. + * If "top_page" is not null, "top_page" is + * freed and the paging reference for the object + * containing it is released. + * + * In/out conditions: + * "object" must be locked. + */ +void +vm_fault_cleanup( + vm_object_t object, + vm_page_t top_page) +{ + vm_object_paging_end(object); + vm_object_unlock(object); + + if (top_page != VM_PAGE_NULL) { + object = top_page->object; + vm_object_lock(object); + VM_PAGE_FREE(top_page); + vm_object_paging_end(object); + vm_object_unlock(object); + } +} + + +#if MACH_PCSAMPLE +/* + * Do PC sampling on current thread, assuming + * that it is the thread taking this page fault. + * + * Must check for THREAD_NULL, since faults + * can occur before threads are running. + */ + +#define vm_stat_sample(flavor) \ + MACRO_BEGIN \ + thread_t _thread_ = current_thread(); \ + \ + if (_thread_ != THREAD_NULL) \ + take_pc_sample_macro(_thread_, (flavor), 1, 0); \ + MACRO_END + +#else +#define vm_stat_sample(x) +#endif /* MACH_PCSAMPLE */ + + + +/* + * Routine: vm_fault_page + * Purpose: + * Find the resident page for the virtual memory + * specified by the given virtual memory object + * and offset. + * Additional arguments: + * The required permissions for the page is given + * in "fault_type". Desired permissions are included + * in "protection". + * + * If the desired page is known to be resident (for + * example, because it was previously wired down), asserting + * the "unwiring" parameter will speed the search. + * + * If the operation can be interrupted (by thread_abort + * or thread_terminate), then the "interruptible" + * parameter should be asserted. + * + * Results: + * The page containing the proper data is returned + * in "result_page". + * + * In/out conditions: + * The source object must be locked and referenced, + * and must donate one paging reference. The reference + * is not affected. The paging reference and lock are + * consumed. + * + * If the call succeeds, the object in which "result_page" + * resides is left locked and holding a paging reference. + * If this is not the original object, a busy page in the + * original object is returned in "top_page", to prevent other + * callers from pursuing this same data, along with a paging + * reference for the original object. The "top_page" should + * be destroyed when this guarantee is no longer required. + * The "result_page" is also left busy. It is not removed + * from the pageout queues. + */ +vm_fault_return_t vm_fault_page( + /* Arguments: */ + vm_object_t first_object, /* Object to begin search */ + vm_offset_t first_offset, /* Offset into object */ + vm_prot_t fault_type, /* What access is requested */ + boolean_t must_be_resident,/* Must page be resident? */ + boolean_t interruptible, /* May fault be interrupted? */ + /* Modifies in place: */ + vm_prot_t *protection, /* Protection for mapping */ + /* Returns: */ + vm_page_t *result_page, /* Page found, if successful */ + vm_page_t *top_page, /* Page in top object, if + * not result_page. + */ + /* More arguments: */ + boolean_t resume, /* We are restarting. */ + continuation_t continuation) /* Continuation for blocking. 
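					   Use thread_no_continuation when
					   the caller can block in place.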
*/ +{ + vm_page_t m; + vm_object_t object; + vm_offset_t offset; + vm_page_t first_m; + vm_object_t next_object; + vm_object_t copy_object; + boolean_t look_for_page; + vm_prot_t access_required; + + if (resume) { + vm_fault_state_t *state = + (vm_fault_state_t *) current_thread()->ith_other; + + if (state->vmfp_backoff) + goto after_block_and_backoff; + + object = state->vmfp_object; + offset = state->vmfp_offset; + first_m = state->vmfp_first_m; + access_required = state->vmfp_access; + goto after_thread_block; + } + + vm_stat_sample(SAMPLED_PC_VM_FAULTS_ANY); + vm_stat.faults++; /* needs lock XXX */ + current_task()->faults++; + +/* + * Recovery actions + */ +#define RELEASE_PAGE(m) \ + MACRO_BEGIN \ + PAGE_WAKEUP_DONE(m); \ + vm_page_lock_queues(); \ + if (!m->active && !m->inactive) \ + vm_page_activate(m); \ + vm_page_unlock_queues(); \ + MACRO_END + + if (vm_fault_dirty_handling +#if MACH_KDB + /* + * If there are watchpoints set, then + * we don't want to give away write permission + * on a read fault. Make the task write fault, + * so that the watchpoint code notices the access. + */ + || db_watchpoint_list +#endif /* MACH_KDB */ + ) { + /* + * If we aren't asking for write permission, + * then don't give it away. We're using write + * faults to set the dirty bit. + */ + if (!(fault_type & VM_PROT_WRITE)) + *protection &= ~VM_PROT_WRITE; + } + + if (!vm_fault_interruptible) + interruptible = FALSE; + + /* + * INVARIANTS (through entire routine): + * + * 1) At all times, we must either have the object + * lock or a busy page in some object to prevent + * some other thread from trying to bring in + * the same page. + * + * Note that we cannot hold any locks during the + * pager access or when waiting for memory, so + * we use a busy page then. + * + * Note also that we aren't as concerned about more than + * one thread attempting to memory_object_data_unlock + * the same page at once, so we don't hold the page + * as busy then, but do record the highest unlock + * value so far. [Unlock requests may also be delivered + * out of order.] + * + * 2) To prevent another thread from racing us down the + * shadow chain and entering a new page in the top + * object before we do, we must keep a busy page in + * the top object while following the shadow chain. + * + * 3) We must increment paging_in_progress on any object + * for which we have a busy page, to prevent + * vm_object_collapse from removing the busy page + * without our noticing. + * + * 4) We leave busy pages on the pageout queues. + * If the pageout daemon comes across a busy page, + * it will remove the page from the pageout queues. + */ + + /* + * Search for the page at object/offset. + */ + + object = first_object; + offset = first_offset; + first_m = VM_PAGE_NULL; + access_required = fault_type; + + /* + * See whether this page is resident + */ + + while (TRUE) { + m = vm_page_lookup(object, offset); + if (m != VM_PAGE_NULL) { + /* + * If the page is being brought in, + * wait for it and then retry. + * + * A possible optimization: if the page + * is known to be resident, we can ignore + * pages that are absent (regardless of + * whether they're busy). + */ + + if (m->busy) { + kern_return_t wait_result; + + PAGE_ASSERT_WAIT(m, interruptible); + vm_object_unlock(object); + if (continuation != thread_no_continuation) { + vm_fault_state_t *state = + (vm_fault_state_t *) current_thread()->ith_other; + + /* + * Save variables in case + * thread_block discards + * our kernel stack. 
+ */ + + state->vmfp_backoff = FALSE; + state->vmfp_object = object; + state->vmfp_offset = offset; + state->vmfp_first_m = first_m; + state->vmfp_access = + access_required; + state->vmf_prot = *protection; + + counter(c_vm_fault_page_block_busy_user++); + thread_block(continuation); + } else + { + counter(c_vm_fault_page_block_busy_kernel++); + thread_block((void (*)()) 0); + } + after_thread_block: + wait_result = current_thread()->wait_result; + vm_object_lock(object); + if (wait_result != THREAD_AWAKENED) { + vm_fault_cleanup(object, first_m); + if (wait_result == THREAD_RESTART) + return(VM_FAULT_RETRY); + else + return(VM_FAULT_INTERRUPTED); + } + continue; + } + + /* + * If the page is in error, give up now. + */ + + if (m->error) { + VM_PAGE_FREE(m); + vm_fault_cleanup(object, first_m); + return(VM_FAULT_MEMORY_ERROR); + } + + /* + * If the page isn't busy, but is absent, + * then it was deemed "unavailable". + */ + + if (m->absent) { + /* + * Remove the non-existent page (unless it's + * in the top object) and move on down to the + * next object (if there is one). + */ + + offset += object->shadow_offset; + access_required = VM_PROT_READ; + next_object = object->shadow; + if (next_object == VM_OBJECT_NULL) { + vm_page_t real_m; + + assert(!must_be_resident); + + /* + * Absent page at bottom of shadow + * chain; zero fill the page we left + * busy in the first object, and flush + * the absent page. But first we + * need to allocate a real page. + */ + + real_m = vm_page_grab(VM_PAGE_HIGHMEM); + if (real_m == VM_PAGE_NULL) { + vm_fault_cleanup(object, first_m); + return(VM_FAULT_MEMORY_SHORTAGE); + } + + if (object != first_object) { + VM_PAGE_FREE(m); + vm_object_paging_end(object); + vm_object_unlock(object); + object = first_object; + offset = first_offset; + m = first_m; + first_m = VM_PAGE_NULL; + vm_object_lock(object); + } + + VM_PAGE_FREE(m); + assert(real_m->busy); + vm_page_lock_queues(); + vm_page_insert(real_m, object, offset); + vm_page_unlock_queues(); + m = real_m; + + /* + * Drop the lock while zero filling + * page. Then break because this + * is the page we wanted. Checking + * the page lock is a waste of time; + * this page was either absent or + * newly allocated -- in both cases + * it can't be page locked by a pager. + */ + vm_object_unlock(object); + + vm_page_zero_fill(m); + + vm_stat_sample(SAMPLED_PC_VM_ZFILL_FAULTS); + + vm_stat.zero_fill_count++; + current_task()->zero_fills++; + vm_object_lock(object); + pmap_clear_modify(m->phys_addr); + break; + } else { + if (must_be_resident) { + vm_object_paging_end(object); + } else if (object != first_object) { + vm_object_paging_end(object); + VM_PAGE_FREE(m); + } else { + first_m = m; + m->absent = FALSE; + vm_object_absent_release(object); + m->busy = TRUE; + + vm_page_lock_queues(); + VM_PAGE_QUEUES_REMOVE(m); + vm_page_unlock_queues(); + } + vm_object_lock(next_object); + vm_object_unlock(object); + object = next_object; + vm_object_paging_begin(object); + continue; + } + } + + /* + * If the desired access to this page has + * been locked out, request that it be unlocked. 
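+	 *	The request is made with memory_object_data_unlock; pending
+	 *	requests accumulate in m->unlock_request, since unlock
+	 *	replies may be delivered out of order.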
+ */ + + if (access_required & m->page_lock) { + if ((access_required & m->unlock_request) != access_required) { + vm_prot_t new_unlock_request; + kern_return_t rc; + + if (!object->pager_ready) { + vm_object_assert_wait(object, + VM_OBJECT_EVENT_PAGER_READY, + interruptible); + goto block_and_backoff; + } + + new_unlock_request = m->unlock_request = + (access_required | m->unlock_request); + vm_object_unlock(object); + if ((rc = memory_object_data_unlock( + object->pager, + object->pager_request, + offset + object->paging_offset, + PAGE_SIZE, + new_unlock_request)) + != KERN_SUCCESS) { + printf("vm_fault: memory_object_data_unlock failed\n"); + vm_object_lock(object); + vm_fault_cleanup(object, first_m); + return((rc == MACH_SEND_INTERRUPTED) ? + VM_FAULT_INTERRUPTED : + VM_FAULT_MEMORY_ERROR); + } + vm_object_lock(object); + continue; + } + + PAGE_ASSERT_WAIT(m, interruptible); + goto block_and_backoff; + } + + /* + * We mark the page busy and leave it on + * the pageout queues. If the pageout + * deamon comes across it, then it will + * remove the page. + */ + + if (!software_reference_bits) { + vm_page_lock_queues(); + if (m->inactive) { + vm_stat_sample(SAMPLED_PC_VM_REACTIVATION_FAULTS); + vm_stat.reactivations++; + current_task()->reactivations++; + } + + VM_PAGE_QUEUES_REMOVE(m); + vm_page_unlock_queues(); + } + + assert(!m->busy); + m->busy = TRUE; + assert(!m->absent); + break; + } + + look_for_page = + (object->pager_created) +#if MACH_PAGEMAP + && (vm_external_state_get(object->existence_info, offset + object->paging_offset) != + VM_EXTERNAL_STATE_ABSENT) +#endif /* MACH_PAGEMAP */ + ; + + if ((look_for_page || (object == first_object)) + && !must_be_resident) { + /* + * Allocate a new page for this object/offset + * pair. + */ + + m = vm_page_grab_fictitious(); + if (m == VM_PAGE_NULL) { + vm_fault_cleanup(object, first_m); + return(VM_FAULT_FICTITIOUS_SHORTAGE); + } + + vm_page_lock_queues(); + vm_page_insert(m, object, offset); + vm_page_unlock_queues(); + } + + if (look_for_page && !must_be_resident) { + kern_return_t rc; + + /* + * If the memory manager is not ready, we + * cannot make requests. + */ + if (!object->pager_ready) { + vm_object_assert_wait(object, + VM_OBJECT_EVENT_PAGER_READY, + interruptible); + VM_PAGE_FREE(m); + goto block_and_backoff; + } + + if (object->internal) { + /* + * Requests to the default pager + * must reserve a real page in advance, + * because the pager's data-provided + * won't block for pages. + */ + + if (m->fictitious && !vm_page_convert(&m)) { + VM_PAGE_FREE(m); + vm_fault_cleanup(object, first_m); + return(VM_FAULT_MEMORY_SHORTAGE); + } + } else if (object->absent_count > + vm_object_absent_max) { + /* + * If there are too many outstanding page + * requests pending on this object, we + * wait for them to be resolved now. + */ + + vm_object_absent_assert_wait(object, interruptible); + VM_PAGE_FREE(m); + goto block_and_backoff; + } + + /* + * Indicate that the page is waiting for data + * from the memory manager. + */ + + m->absent = TRUE; + object->absent_count++; + + /* + * We have a busy page, so we can + * release the object lock. + */ + vm_object_unlock(object); + + /* + * Call the memory manager to retrieve the data. 
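+	 *	The number of outstanding requests per object is bounded by
+	 *	vm_object_absent_max, checked above before this page was
+	 *	marked absent.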
+ */ + + vm_stat.pageins++; + vm_stat_sample(SAMPLED_PC_VM_PAGEIN_FAULTS); + current_task()->pageins++; + + if ((rc = memory_object_data_request(object->pager, + object->pager_request, + m->offset + object->paging_offset, + PAGE_SIZE, access_required)) != KERN_SUCCESS) { + if (object->pager && rc != MACH_SEND_INTERRUPTED) + printf("%s(0x%p, 0x%p, 0x%zx, 0x%x, 0x%x) failed, %x\n", + "memory_object_data_request", + object->pager, + object->pager_request, + m->offset + object->paging_offset, + PAGE_SIZE, access_required, rc); + /* + * Don't want to leave a busy page around, + * but the data request may have blocked, + * so check if it's still there and busy. + */ + vm_object_lock(object); + if (m == vm_page_lookup(object,offset) && + m->absent && m->busy) + VM_PAGE_FREE(m); + vm_fault_cleanup(object, first_m); + return((rc == MACH_SEND_INTERRUPTED) ? + VM_FAULT_INTERRUPTED : + VM_FAULT_MEMORY_ERROR); + } + + /* + * Retry with same object/offset, since new data may + * be in a different page (i.e., m is meaningless at + * this point). + */ + vm_object_lock(object); + continue; + } + + /* + * For the XP system, the only case in which we get here is if + * object has no pager (or unwiring). If the pager doesn't + * have the page this is handled in the m->absent case above + * (and if you change things here you should look above). + */ + if (object == first_object) + first_m = m; + else + { + assert(m == VM_PAGE_NULL); + } + + /* + * Move on to the next object. Lock the next + * object before unlocking the current one. + */ + access_required = VM_PROT_READ; + + offset += object->shadow_offset; + next_object = object->shadow; + if (next_object == VM_OBJECT_NULL) { + assert(!must_be_resident); + + /* + * If there's no object left, fill the page + * in the top object with zeros. But first we + * need to allocate a real page. + */ + + if (object != first_object) { + vm_object_paging_end(object); + vm_object_unlock(object); + + object = first_object; + offset = first_offset; + vm_object_lock(object); + } + + m = first_m; + assert(m->object == object); + first_m = VM_PAGE_NULL; + + if (m->fictitious && !vm_page_convert(&m)) { + VM_PAGE_FREE(m); + vm_fault_cleanup(object, VM_PAGE_NULL); + return(VM_FAULT_MEMORY_SHORTAGE); + } + + vm_object_unlock(object); + vm_page_zero_fill(m); + vm_stat_sample(SAMPLED_PC_VM_ZFILL_FAULTS); + vm_stat.zero_fill_count++; + current_task()->zero_fills++; + vm_object_lock(object); + pmap_clear_modify(m->phys_addr); + break; + } + else { + vm_object_lock(next_object); + if ((object != first_object) || must_be_resident) + vm_object_paging_end(object); + vm_object_unlock(object); + object = next_object; + vm_object_paging_begin(object); + } + } + + /* + * PAGE HAS BEEN FOUND. + * + * This page (m) is: + * busy, so that we can play with it; + * not absent, so that nobody else will fill it; + * possibly eligible for pageout; + * + * The top-level page (first_m) is: + * VM_PAGE_NULL if the page was found in the + * top-level object; + * busy, not absent, and ineligible for pageout. + * + * The current object (object) is locked. A paging + * reference is held for the current and top-level + * objects. + */ + + assert(m->busy && !m->absent); + assert((first_m == VM_PAGE_NULL) || + (first_m->busy && !first_m->absent && + !first_m->active && !first_m->inactive)); + + /* + * If the page is being written, but isn't + * already owned by the top-level object, + * we have to copy it into a new page owned + * by the top-level object. 
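+	 *	This is the copy-on-write case: the data is copied into a
+	 *	new page, the original is flushed from all pmaps, and the
+	 *	shadow chain is then collapsed if possible.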
+ */ + + if (object != first_object) { + /* + * We only really need to copy if we + * want to write it. + */ + + if (fault_type & VM_PROT_WRITE) { + vm_page_t copy_m; + + assert(!must_be_resident); + + /* + * If we try to collapse first_object at this + * point, we may deadlock when we try to get + * the lock on an intermediate object (since we + * have the bottom object locked). We can't + * unlock the bottom object, because the page + * we found may move (by collapse) if we do. + * + * Instead, we first copy the page. Then, when + * we have no more use for the bottom object, + * we unlock it and try to collapse. + * + * Note that we copy the page even if we didn't + * need to... that's the breaks. + */ + + /* + * Allocate a page for the copy + */ + copy_m = vm_page_grab(VM_PAGE_HIGHMEM); + if (copy_m == VM_PAGE_NULL) { + RELEASE_PAGE(m); + vm_fault_cleanup(object, first_m); + return(VM_FAULT_MEMORY_SHORTAGE); + } + + vm_object_unlock(object); + vm_page_copy(m, copy_m); + vm_object_lock(object); + + /* + * If another map is truly sharing this + * page with us, we have to flush all + * uses of the original page, since we + * can't distinguish those which want the + * original from those which need the + * new copy. + * + * XXXO If we know that only one map has + * access to this page, then we could + * avoid the pmap_page_protect() call. + */ + + vm_page_lock_queues(); + vm_page_deactivate(m); + pmap_page_protect(m->phys_addr, VM_PROT_NONE); + vm_page_unlock_queues(); + + /* + * We no longer need the old page or object. + */ + + PAGE_WAKEUP_DONE(m); + vm_object_paging_end(object); + vm_object_unlock(object); + + vm_stat.cow_faults++; + vm_stat_sample(SAMPLED_PC_VM_COW_FAULTS); + current_task()->cow_faults++; + object = first_object; + offset = first_offset; + + vm_object_lock(object); + VM_PAGE_FREE(first_m); + first_m = VM_PAGE_NULL; + assert(copy_m->busy); + vm_page_lock_queues(); + vm_page_insert(copy_m, object, offset); + vm_page_unlock_queues(); + m = copy_m; + + /* + * Now that we've gotten the copy out of the + * way, let's try to collapse the top object. + * But we have to play ugly games with + * paging_in_progress to do that... + */ + + vm_object_paging_end(object); + vm_object_collapse(object); + vm_object_paging_begin(object); + } + else { + *protection &= (~VM_PROT_WRITE); + } + } + + /* + * Now check whether the page needs to be pushed into the + * copy object. The use of asymmetric copy on write for + * shared temporary objects means that we may do two copies to + * satisfy the fault; one above to get the page from a + * shadowed object, and one here to push it into the copy. + */ + + while ((copy_object = first_object->copy) != VM_OBJECT_NULL) { + vm_offset_t copy_offset; + vm_page_t copy_m; + + /* + * If the page is being written, but hasn't been + * copied to the copy-object, we have to copy it there. + */ + + if ((fault_type & VM_PROT_WRITE) == 0) { + *protection &= ~VM_PROT_WRITE; + break; + } + + /* + * If the page was guaranteed to be resident, + * we must have already performed the copy. + */ + + if (must_be_resident) + break; + + /* + * Try to get the lock on the copy_object. + */ + if (!vm_object_lock_try(copy_object)) { + vm_object_unlock(object); + + simple_lock_pause(); /* wait a bit */ + + vm_object_lock(object); + continue; + } + + /* + * Make another reference to the copy-object, + * to keep it from disappearing during the + * copy. + */ + assert(copy_object->ref_count > 0); + copy_object->ref_count++; + + /* + * Does the page exist in the copy? 
+ */ + copy_offset = first_offset - copy_object->shadow_offset; + copy_m = vm_page_lookup(copy_object, copy_offset); + if (copy_m != VM_PAGE_NULL) { + if (copy_m->busy) { + /* + * If the page is being brought + * in, wait for it and then retry. + */ + PAGE_ASSERT_WAIT(copy_m, interruptible); + RELEASE_PAGE(m); + copy_object->ref_count--; + assert(copy_object->ref_count > 0); + vm_object_unlock(copy_object); + goto block_and_backoff; + } + } + else { + /* + * Allocate a page for the copy + */ + copy_m = vm_page_alloc(copy_object, copy_offset); + if (copy_m == VM_PAGE_NULL) { + RELEASE_PAGE(m); + copy_object->ref_count--; + assert(copy_object->ref_count > 0); + vm_object_unlock(copy_object); + vm_fault_cleanup(object, first_m); + return(VM_FAULT_MEMORY_SHORTAGE); + } + + /* + * Must copy page into copy-object. + */ + + vm_page_copy(m, copy_m); + + /* + * If the old page was in use by any users + * of the copy-object, it must be removed + * from all pmaps. (We can't know which + * pmaps use it.) + */ + + vm_page_lock_queues(); + pmap_page_protect(m->phys_addr, VM_PROT_NONE); + copy_m->dirty = TRUE; + vm_page_unlock_queues(); + + /* + * If there's a pager, then immediately + * page out this page, using the "initialize" + * option. Else, we use the copy. + */ + + if (!copy_object->pager_created) { + vm_page_lock_queues(); + vm_page_activate(copy_m); + vm_page_unlock_queues(); + PAGE_WAKEUP_DONE(copy_m); + } else { + /* + * The page is already ready for pageout: + * not on pageout queues and busy. + * Unlock everything except the + * copy_object itself. + */ + + vm_object_unlock(object); + + /* + * Write the page to the copy-object, + * flushing it from the kernel. + */ + + vm_pageout_page(copy_m, TRUE, TRUE); + + /* + * Since the pageout may have + * temporarily dropped the + * copy_object's lock, we + * check whether we'll have + * to deallocate the hard way. + */ + + if ((copy_object->shadow != object) || + (copy_object->ref_count == 1)) { + vm_object_unlock(copy_object); + vm_object_deallocate(copy_object); + vm_object_lock(object); + continue; + } + + /* + * Pick back up the old object's + * lock. [It is safe to do so, + * since it must be deeper in the + * object tree.] + */ + + vm_object_lock(object); + } + + /* + * Because we're pushing a page upward + * in the object tree, we must restart + * any faults that are waiting here. + * [Note that this is an expansion of + * PAGE_WAKEUP that uses the THREAD_RESTART + * wait result]. Can't turn off the page's + * busy bit because we're not done with it. + */ + + if (m->wanted) { + m->wanted = FALSE; + thread_wakeup_with_result((event_t) m, + THREAD_RESTART); + } + } + + /* + * The reference count on copy_object must be + * at least 2: one for our extra reference, + * and at least one from the outside world + * (we checked that when we last locked + * copy_object). + */ + copy_object->ref_count--; + assert(copy_object->ref_count > 0); + vm_object_unlock(copy_object); + + break; + } + + *result_page = m; + *top_page = first_m; + + /* + * If the page can be written, assume that it will be. + * [Earlier, we restrict the permission to allow write + * access only if the fault so required, so we don't + * mark read-only data as dirty.] 
+ */ + + if (vm_fault_dirty_handling && (*protection & VM_PROT_WRITE)) + m->dirty = TRUE; + + return(VM_FAULT_SUCCESS); + + block_and_backoff: + vm_fault_cleanup(object, first_m); + + if (continuation != thread_no_continuation) { + vm_fault_state_t *state = + (vm_fault_state_t *) current_thread()->ith_other; + + /* + * Save variables in case we must restart. + */ + + state->vmfp_backoff = TRUE; + state->vmf_prot = *protection; + + counter(c_vm_fault_page_block_backoff_user++); + thread_block(continuation); + } else + { + counter(c_vm_fault_page_block_backoff_kernel++); + thread_block((void (*)()) 0); + } + after_block_and_backoff: + if (current_thread()->wait_result == THREAD_AWAKENED) + return VM_FAULT_RETRY; + else + return VM_FAULT_INTERRUPTED; + +#undef RELEASE_PAGE +} + +/* + * Routine: vm_fault + * Purpose: + * Handle page faults, including pseudo-faults + * used to change the wiring status of pages. + * Returns: + * If an explicit (expression) continuation is supplied, + * then we call the continuation instead of returning. + * Implementation: + * Explicit continuations make this a little icky, + * because it hasn't been rewritten to embrace CPS. + * Instead, we have resume arguments for vm_fault and + * vm_fault_page, to let continue the fault computation. + * + * vm_fault and vm_fault_page save mucho state + * in the moral equivalent of a closure. The state + * structure is allocated when first entering vm_fault + * and deallocated when leaving vm_fault. + */ + +static void +vm_fault_continue(void) +{ + vm_fault_state_t *state = + (vm_fault_state_t *) current_thread()->ith_other; + + (void) vm_fault(state->vmf_map, + state->vmf_vaddr, + state->vmf_fault_type, + state->vmf_change_wiring, + TRUE, state->vmf_continuation); + /*NOTREACHED*/ +} + +kern_return_t vm_fault( + vm_map_t map, + vm_offset_t vaddr, + vm_prot_t fault_type, + boolean_t change_wiring, + boolean_t resume, + vm_fault_continuation_t continuation) +{ + vm_map_version_t version; /* Map version for verificiation */ + boolean_t wired; /* Should mapping be wired down? */ + vm_object_t object; /* Top-level object */ + vm_offset_t offset; /* Top-level offset */ + vm_prot_t prot; /* Protection for mapping */ + vm_object_t old_copy_object; /* Saved copy object */ + vm_page_t result_page; /* Result of vm_fault_page */ + vm_page_t top_page; /* Placeholder page */ + kern_return_t kr; + + vm_page_t m; /* Fast access to result_page */ + + if (resume) { + vm_fault_state_t *state = + (vm_fault_state_t *) current_thread()->ith_other; + + /* + * Retrieve cached variables and + * continue vm_fault_page. + */ + + object = state->vmf_object; + if (object == VM_OBJECT_NULL) + goto RetryFault; + version = state->vmf_version; + wired = state->vmf_wired; + offset = state->vmf_offset; + prot = state->vmf_prot; + + kr = vm_fault_page(object, offset, fault_type, + (change_wiring && !wired), !change_wiring, + &prot, &result_page, &top_page, + TRUE, vm_fault_continue); + goto after_vm_fault_page; + } + + if (continuation != vm_fault_no_continuation) { + /* + * We will probably need to save state. + */ + + char * state; + + /* + * if this assignment stmt is written as + * 'active_threads[cpu_number()] = kmem_cache_alloc()', + * cpu_number may be evaluated before kmem_cache_alloc; + * if kmem_cache_alloc blocks, cpu_number will be wrong + */ + + state = (char *) kmem_cache_alloc(&vm_fault_state_cache); + current_thread()->ith_other = state; + + } + + RetryFault: ; + + /* + * Find the backing store object and offset into + * it to begin the search. 
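+	 *	vm_map_lookup also returns the map version that is checked
+	 *	later with vm_map_verify, once the fault has been resolved.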
+ */ + + if ((kr = vm_map_lookup(&map, vaddr, fault_type, &version, + &object, &offset, + &prot, &wired)) != KERN_SUCCESS) { + goto done; + } + + /* + * If the page is wired, we must fault for the current protection + * value, to avoid further faults. + */ + + if (wired) + fault_type = prot; + + /* + * Make a reference to this object to + * prevent its disposal while we are messing with + * it. Once we have the reference, the map is free + * to be diddled. Since objects reference their + * shadows (and copies), they will stay around as well. + */ + + assert(object->ref_count > 0); + object->ref_count++; + vm_object_paging_begin(object); + + if (continuation != vm_fault_no_continuation) { + vm_fault_state_t *state = + (vm_fault_state_t *) current_thread()->ith_other; + + /* + * Save variables, in case vm_fault_page discards + * our kernel stack and we have to restart. + */ + + state->vmf_map = map; + state->vmf_vaddr = vaddr; + state->vmf_fault_type = fault_type; + state->vmf_change_wiring = change_wiring; + state->vmf_continuation = continuation; + + state->vmf_version = version; + state->vmf_wired = wired; + state->vmf_object = object; + state->vmf_offset = offset; + state->vmf_prot = prot; + + kr = vm_fault_page(object, offset, fault_type, + (change_wiring && !wired), !change_wiring, + &prot, &result_page, &top_page, + FALSE, vm_fault_continue); + } else + { + kr = vm_fault_page(object, offset, fault_type, + (change_wiring && !wired), !change_wiring, + &prot, &result_page, &top_page, + FALSE, (void (*)()) 0); + } + after_vm_fault_page: + + /* + * If we didn't succeed, lose the object reference immediately. + */ + + if (kr != VM_FAULT_SUCCESS) + vm_object_deallocate(object); + + /* + * See why we failed, and take corrective action. + */ + + switch (kr) { + case VM_FAULT_SUCCESS: + break; + case VM_FAULT_RETRY: + goto RetryFault; + case VM_FAULT_INTERRUPTED: + kr = KERN_SUCCESS; + goto done; + case VM_FAULT_MEMORY_SHORTAGE: + if (continuation != vm_fault_no_continuation) { + vm_fault_state_t *state = + (vm_fault_state_t *) current_thread()->ith_other; + + /* + * Save variables in case VM_PAGE_WAIT + * discards our kernel stack. + */ + + state->vmf_map = map; + state->vmf_vaddr = vaddr; + state->vmf_fault_type = fault_type; + state->vmf_change_wiring = change_wiring; + state->vmf_continuation = continuation; + state->vmf_object = VM_OBJECT_NULL; + + VM_PAGE_WAIT(vm_fault_continue); + } else + VM_PAGE_WAIT((void (*)()) 0); + goto RetryFault; + case VM_FAULT_FICTITIOUS_SHORTAGE: + vm_page_more_fictitious(); + goto RetryFault; + case VM_FAULT_MEMORY_ERROR: + kr = KERN_MEMORY_ERROR; + goto done; + } + + m = result_page; + + assert((change_wiring && !wired) ? + (top_page == VM_PAGE_NULL) : + ((top_page == VM_PAGE_NULL) == (m->object == object))); + + /* + * How to clean up the result of vm_fault_page. This + * happens whether the mapping is entered or not. + */ + +#define UNLOCK_AND_DEALLOCATE \ + MACRO_BEGIN \ + vm_fault_cleanup(m->object, top_page); \ + vm_object_deallocate(object); \ + MACRO_END + + /* + * What to do with the resulting page from vm_fault_page + * if it doesn't get entered into the physical map: + */ + +#define RELEASE_PAGE(m) \ + MACRO_BEGIN \ + PAGE_WAKEUP_DONE(m); \ + vm_page_lock_queues(); \ + if (!m->active && !m->inactive) \ + vm_page_activate(m); \ + vm_page_unlock_queues(); \ + MACRO_END + + /* + * We must verify that the maps have not changed + * since our last lookup. 
+ */ + + old_copy_object = m->object->copy; + + vm_object_unlock(m->object); + while (!vm_map_verify(map, &version)) { + vm_object_t retry_object; + vm_offset_t retry_offset; + vm_prot_t retry_prot; + + /* + * To avoid trying to write_lock the map while another + * thread has it read_locked (in vm_map_pageable), we + * do not try for write permission. If the page is + * still writable, we will get write permission. If it + * is not, or has been marked needs_copy, we enter the + * mapping without write permission, and will merely + * take another fault. + */ + kr = vm_map_lookup(&map, vaddr, + fault_type & ~VM_PROT_WRITE, &version, + &retry_object, &retry_offset, &retry_prot, + &wired); + + if (kr != KERN_SUCCESS) { + vm_object_lock(m->object); + RELEASE_PAGE(m); + UNLOCK_AND_DEALLOCATE; + goto done; + } + + vm_object_unlock(retry_object); + vm_object_lock(m->object); + + if ((retry_object != object) || + (retry_offset != offset)) { + RELEASE_PAGE(m); + UNLOCK_AND_DEALLOCATE; + goto RetryFault; + } + + /* + * Check whether the protection has changed or the object + * has been copied while we left the map unlocked. + */ + prot &= retry_prot; + vm_object_unlock(m->object); + } + vm_object_lock(m->object); + + /* + * If the copy object changed while the top-level object + * was unlocked, then we must take away write permission. + */ + + if (m->object->copy != old_copy_object) + prot &= ~VM_PROT_WRITE; + + /* + * If we want to wire down this page, but no longer have + * adequate permissions, we must start all over. + */ + + if (wired && (prot != fault_type)) { + vm_map_verify_done(map, &version); + RELEASE_PAGE(m); + UNLOCK_AND_DEALLOCATE; + goto RetryFault; + } + + /* + * It's critically important that a wired-down page be faulted + * only once in each map for which it is wired. + */ + + vm_object_unlock(m->object); + + /* + * Put this page into the physical map. + * We had to do the unlock above because pmap_enter + * may cause other faults. The page may be on + * the pageout queues. If the pageout daemon comes + * across the page, it will remove it from the queues. + */ + + PMAP_ENTER(map->pmap, vaddr, m, prot, wired); + + /* + * If the page is not wired down and isn't already + * on a pageout queue, then put it where the + * pageout daemon can find it. + */ + vm_object_lock(m->object); + vm_page_lock_queues(); + if (change_wiring) { + if (wired) + vm_page_wire(m); + else + vm_page_unwire(m); + } else if (software_reference_bits) { + if (!m->active && !m->inactive) + vm_page_activate(m); + m->reference = TRUE; + } else { + vm_page_activate(m); + } + vm_page_unlock_queues(); + + /* + * Unlock everything, and return + */ + + vm_map_verify_done(map, &version); + PAGE_WAKEUP_DONE(m); + kr = KERN_SUCCESS; + UNLOCK_AND_DEALLOCATE; + +#undef UNLOCK_AND_DEALLOCATE +#undef RELEASE_PAGE + + done: + if (continuation != vm_fault_no_continuation) { + vm_fault_state_t *state = + (vm_fault_state_t *) current_thread()->ith_other; + + kmem_cache_free(&vm_fault_state_cache, (vm_offset_t) state); + (*continuation)(kr); + /*NOTREACHED*/ + } + + return(kr); +} + +/* + * vm_fault_wire: + * + * Wire down a range of virtual addresses in a map. + */ +void vm_fault_wire( + vm_map_t map, + vm_map_entry_t entry) +{ + + vm_offset_t va; + pmap_t pmap; + vm_offset_t end_addr = entry->vme_end; + + pmap = vm_map_pmap(map); + + /* + * Inform the physical mapping system that the + * range of addresses may not fault, so that + * page tables and such can be locked down as well. 
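+	 *	(The loop below tries vm_fault_wire_fast for each page and
+	 *	falls back to the full vm_fault path when the fast case
+	 *	does not apply.)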
+ */ + + pmap_pageable(pmap, entry->vme_start, end_addr, FALSE); + + /* + * We simulate a fault to get the page and enter it + * in the physical map. + */ + + for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) { + if (vm_fault_wire_fast(map, va, entry) != KERN_SUCCESS) + (void) vm_fault(map, va, VM_PROT_NONE, TRUE, + FALSE, (void (*)()) 0); + } +} + +/* + * vm_fault_unwire: + * + * Unwire a range of virtual addresses in a map. + */ +void vm_fault_unwire( + vm_map_t map, + vm_map_entry_t entry) +{ + vm_offset_t va; + pmap_t pmap; + vm_offset_t end_addr = entry->vme_end; + vm_object_t object; + + pmap = vm_map_pmap(map); + + object = (entry->is_sub_map) + ? VM_OBJECT_NULL : entry->object.vm_object; + + /* + * Since the pages are wired down, we must be able to + * get their mappings from the physical map system. + */ + + for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) { + pmap_change_wiring(pmap, va, FALSE); + + if (object == VM_OBJECT_NULL) { + vm_map_lock_set_recursive(map); + (void) vm_fault(map, va, VM_PROT_NONE, TRUE, + FALSE, (void (*)()) 0); + vm_map_lock_clear_recursive(map); + } else { + vm_prot_t prot; + vm_page_t result_page; + vm_page_t top_page; + vm_fault_return_t result; + + do { + prot = VM_PROT_NONE; + + vm_object_lock(object); + vm_object_paging_begin(object); + result = vm_fault_page(object, + entry->offset + + (va - entry->vme_start), + VM_PROT_NONE, TRUE, + FALSE, &prot, + &result_page, + &top_page, + FALSE, (void (*)()) 0); + } while (result == VM_FAULT_RETRY); + + if (result != VM_FAULT_SUCCESS) + panic("vm_fault_unwire: failure"); + + vm_page_lock_queues(); + vm_page_unwire(result_page); + vm_page_unlock_queues(); + PAGE_WAKEUP_DONE(result_page); + + vm_fault_cleanup(result_page->object, top_page); + } + } + + /* + * Inform the physical mapping system that the range + * of addresses may fault, so that page tables and + * such may be unwired themselves. + */ + + pmap_pageable(pmap, entry->vme_start, end_addr, TRUE); +} + +/* + * vm_fault_wire_fast: + * + * Handle common case of a wire down page fault at the given address. + * If successful, the page is inserted into the associated physical map. + * The map entry is passed in to avoid the overhead of a map lookup. + * + * NOTE: the given address should be truncated to the + * proper page address. + * + * KERN_SUCCESS is returned if the page fault is handled; otherwise, + * a standard error specifying why the fault is fatal is returned. + * + * The map in question must be referenced, and remains so. + * Caller has a read lock on the map. + * + * This is a stripped version of vm_fault() for wiring pages. Anything + * other than the common case will return KERN_FAILURE, and the caller + * is expected to call vm_fault(). + */ +kern_return_t vm_fault_wire_fast( + vm_map_t map, + vm_offset_t va, + vm_map_entry_t entry) +{ + vm_object_t object; + vm_offset_t offset; + vm_page_t m; + vm_prot_t prot; + + vm_stat.faults++; /* needs lock XXX */ + current_task()->faults++; +/* + * Recovery actions + */ + +#undef RELEASE_PAGE +#define RELEASE_PAGE(m) { \ + PAGE_WAKEUP_DONE(m); \ + vm_page_lock_queues(); \ + vm_page_unwire(m); \ + vm_page_unlock_queues(); \ +} + + +#undef UNLOCK_THINGS +#define UNLOCK_THINGS { \ + object->paging_in_progress--; \ + vm_object_unlock(object); \ +} + +#undef UNLOCK_AND_DEALLOCATE +#define UNLOCK_AND_DEALLOCATE { \ + UNLOCK_THINGS; \ + vm_object_deallocate(object); \ +} +/* + * Give up and have caller do things the hard way. 
+ */ + +#define GIVE_UP { \ + UNLOCK_AND_DEALLOCATE; \ + return(KERN_FAILURE); \ +} + + + /* + * If this entry is not directly to a vm_object, bail out. + */ + if (entry->is_sub_map) + return(KERN_FAILURE); + + /* + * Find the backing store object and offset into it. + */ + + object = entry->object.vm_object; + offset = (va - entry->vme_start) + entry->offset; + prot = entry->protection; + + /* + * Make a reference to this object to prevent its + * disposal while we are messing with it. + */ + + vm_object_lock(object); + assert(object->ref_count > 0); + object->ref_count++; + object->paging_in_progress++; + + /* + * INVARIANTS (through entire routine): + * + * 1) At all times, we must either have the object + * lock or a busy page in some object to prevent + * some other thread from trying to bring in + * the same page. + * + * 2) Once we have a busy page, we must remove it from + * the pageout queues, so that the pageout daemon + * will not grab it away. + * + */ + + /* + * Look for page in top-level object. If it's not there or + * there's something going on, give up. + */ + m = vm_page_lookup(object, offset); + if ((m == VM_PAGE_NULL) || (m->error) || + (m->busy) || (m->absent) || (prot & m->page_lock)) { + GIVE_UP; + } + + /* + * Wire the page down now. All bail outs beyond this + * point must unwire the page. + */ + + vm_page_lock_queues(); + vm_page_wire(m); + vm_page_unlock_queues(); + + /* + * Mark page busy for other threads. + */ + assert(!m->busy); + m->busy = TRUE; + assert(!m->absent); + + /* + * Give up if the page is being written and there's a copy object + */ + if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) { + RELEASE_PAGE(m); + GIVE_UP; + } + + /* + * Put this page into the physical map. + * We have to unlock the object because pmap_enter + * may cause other faults. + */ + vm_object_unlock(object); + + PMAP_ENTER(map->pmap, va, m, prot, TRUE); + + /* + * Must relock object so that paging_in_progress can be cleared. + */ + vm_object_lock(object); + + /* + * Unlock everything, and return + */ + + PAGE_WAKEUP_DONE(m); + UNLOCK_AND_DEALLOCATE; + + return(KERN_SUCCESS); + +} + +/* + * Routine: vm_fault_copy_cleanup + * Purpose: + * Release a page used by vm_fault_copy. + */ + +static void vm_fault_copy_cleanup( + vm_page_t page, + vm_page_t top_page) +{ + vm_object_t object = page->object; + + vm_object_lock(object); + PAGE_WAKEUP_DONE(page); + vm_page_lock_queues(); + if (!page->active && !page->inactive) + vm_page_activate(page); + vm_page_unlock_queues(); + vm_fault_cleanup(object, top_page); +} + +/* + * Routine: vm_fault_copy + * + * Purpose: + * Copy pages from one virtual memory object to another -- + * neither the source nor destination pages need be resident. + * + * Before actually copying a page, the version associated with + * the destination address map wil be verified. + * + * In/out conditions: + * The caller must hold a reference, but not a lock, to + * each of the source and destination objects and to the + * destination map. + * + * Results: + * Returns KERN_SUCCESS if no errors were encountered in + * reading or writing the data. Returns KERN_INTERRUPTED if + * the operation was interrupted (only possible if the + * "interruptible" argument is asserted). Other return values + * indicate a permanent error in copying the data. + * + * The actual amount of data copied will be returned in the + * "copy_size" argument. In the event that the destination map + * verification failed, this amount may be less than the amount + * requested. 
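+ *	Implementation:
+ *		The copy proceeds one page at a time; a null source object
+ *		causes the destination page to be zero-filled instead of
+ *		copied.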
+ */ +kern_return_t vm_fault_copy( + vm_object_t src_object, + vm_offset_t src_offset, + vm_size_t *src_size, /* INOUT */ + vm_object_t dst_object, + vm_offset_t dst_offset, + vm_map_t dst_map, + vm_map_version_t *dst_version, + boolean_t interruptible) +{ + vm_page_t result_page; + vm_prot_t prot; + + vm_page_t src_page; + vm_page_t src_top_page; + + vm_page_t dst_page; + vm_page_t dst_top_page; + + vm_size_t amount_done; + vm_object_t old_copy_object; + +#define RETURN(x) \ + MACRO_BEGIN \ + *src_size = amount_done; \ + MACRO_RETURN(x); \ + MACRO_END + + amount_done = 0; + do { /* while (amount_done != *src_size) */ + + RetrySourceFault: ; + + if (src_object == VM_OBJECT_NULL) { + /* + * No source object. We will just + * zero-fill the page in dst_object. + */ + + src_page = VM_PAGE_NULL; + } else { + prot = VM_PROT_READ; + + vm_object_lock(src_object); + vm_object_paging_begin(src_object); + + switch (vm_fault_page(src_object, src_offset, + VM_PROT_READ, FALSE, interruptible, + &prot, &result_page, &src_top_page, + FALSE, (void (*)()) 0)) { + + case VM_FAULT_SUCCESS: + break; + case VM_FAULT_RETRY: + goto RetrySourceFault; + case VM_FAULT_INTERRUPTED: + RETURN(MACH_SEND_INTERRUPTED); + case VM_FAULT_MEMORY_SHORTAGE: + VM_PAGE_WAIT((void (*)()) 0); + goto RetrySourceFault; + case VM_FAULT_FICTITIOUS_SHORTAGE: + vm_page_more_fictitious(); + goto RetrySourceFault; + case VM_FAULT_MEMORY_ERROR: + return(KERN_MEMORY_ERROR); + } + + src_page = result_page; + + assert((src_top_page == VM_PAGE_NULL) == + (src_page->object == src_object)); + + assert ((prot & VM_PROT_READ) != VM_PROT_NONE); + + vm_object_unlock(src_page->object); + } + + RetryDestinationFault: ; + + prot = VM_PROT_WRITE; + + vm_object_lock(dst_object); + vm_object_paging_begin(dst_object); + + switch (vm_fault_page(dst_object, dst_offset, VM_PROT_WRITE, + FALSE, FALSE /* interruptible */, + &prot, &result_page, &dst_top_page, + FALSE, (void (*)()) 0)) { + + case VM_FAULT_SUCCESS: + break; + case VM_FAULT_RETRY: + goto RetryDestinationFault; + case VM_FAULT_INTERRUPTED: + if (src_page != VM_PAGE_NULL) + vm_fault_copy_cleanup(src_page, + src_top_page); + RETURN(MACH_SEND_INTERRUPTED); + case VM_FAULT_MEMORY_SHORTAGE: + VM_PAGE_WAIT((void (*)()) 0); + goto RetryDestinationFault; + case VM_FAULT_FICTITIOUS_SHORTAGE: + vm_page_more_fictitious(); + goto RetryDestinationFault; + case VM_FAULT_MEMORY_ERROR: + if (src_page != VM_PAGE_NULL) + vm_fault_copy_cleanup(src_page, + src_top_page); + return(KERN_MEMORY_ERROR); + } + assert ((prot & VM_PROT_WRITE) != VM_PROT_NONE); + + dst_page = result_page; + + old_copy_object = dst_page->object->copy; + + vm_object_unlock(dst_page->object); + + if (!vm_map_verify(dst_map, dst_version)) { + + BailOut: ; + + if (src_page != VM_PAGE_NULL) + vm_fault_copy_cleanup(src_page, src_top_page); + vm_fault_copy_cleanup(dst_page, dst_top_page); + break; + } + + + vm_object_lock(dst_page->object); + if (dst_page->object->copy != old_copy_object) { + vm_object_unlock(dst_page->object); + vm_map_verify_done(dst_map, dst_version); + goto BailOut; + } + vm_object_unlock(dst_page->object); + + /* + * Copy the page, and note that it is dirty + * immediately. 
+ */ + + if (src_page == VM_PAGE_NULL) + vm_page_zero_fill(dst_page); + else + vm_page_copy(src_page, dst_page); + dst_page->dirty = TRUE; + + /* + * Unlock everything, and return + */ + + vm_map_verify_done(dst_map, dst_version); + + if (src_page != VM_PAGE_NULL) + vm_fault_copy_cleanup(src_page, src_top_page); + vm_fault_copy_cleanup(dst_page, dst_top_page); + + amount_done += PAGE_SIZE; + src_offset += PAGE_SIZE; + dst_offset += PAGE_SIZE; + + } while (amount_done != *src_size); + + RETURN(KERN_SUCCESS); +#undef RETURN + + /*NOTREACHED*/ +} + + + + + +#ifdef notdef + +/* + * Routine: vm_fault_page_overwrite + * + * Description: + * A form of vm_fault_page that assumes that the + * resulting page will be overwritten in its entirety, + * making it unnecessary to obtain the correct *contents* + * of the page. + * + * Implementation: + * XXX Untested. Also unused. Eventually, this technology + * could be used in vm_fault_copy() to advantage. + */ +vm_fault_return_t vm_fault_page_overwrite( + vm_object_t dst_object, + vm_offset_t dst_offset, + vm_page_t *result_page) /* OUT */ +{ + vm_page_t dst_page; + +#define interruptible FALSE /* XXX */ + + while (TRUE) { + /* + * Look for a page at this offset + */ + + while ((dst_page = vm_page_lookup(dst_object, dst_offset)) + == VM_PAGE_NULL) { + /* + * No page, no problem... just allocate one. + */ + + dst_page = vm_page_alloc(dst_object, dst_offset); + if (dst_page == VM_PAGE_NULL) { + vm_object_unlock(dst_object); + VM_PAGE_WAIT((void (*)()) 0); + vm_object_lock(dst_object); + continue; + } + + /* + * Pretend that the memory manager + * write-protected the page. + * + * Note that we will be asking for write + * permission without asking for the data + * first. + */ + + dst_page->overwriting = TRUE; + dst_page->page_lock = VM_PROT_WRITE; + dst_page->absent = TRUE; + dst_object->absent_count++; + + break; + + /* + * When we bail out, we might have to throw + * away the page created here. + */ + +#define DISCARD_PAGE \ + MACRO_BEGIN \ + vm_object_lock(dst_object); \ + dst_page = vm_page_lookup(dst_object, dst_offset); \ + if ((dst_page != VM_PAGE_NULL) && dst_page->overwriting) \ + VM_PAGE_FREE(dst_page); \ + vm_object_unlock(dst_object); \ + MACRO_END + } + + /* + * If the page is write-protected... + */ + + if (dst_page->page_lock & VM_PROT_WRITE) { + /* + * ... and an unlock request hasn't been sent + */ + + if ( ! (dst_page->unlock_request & VM_PROT_WRITE)) { + vm_prot_t u; + kern_return_t rc; + + /* + * ... then send one now. + */ + + if (!dst_object->pager_ready) { + vm_object_assert_wait(dst_object, + VM_OBJECT_EVENT_PAGER_READY, + interruptible); + vm_object_unlock(dst_object); + thread_block((void (*)()) 0); + if (current_thread()->wait_result != + THREAD_AWAKENED) { + DISCARD_PAGE; + return(VM_FAULT_INTERRUPTED); + } + continue; + } + + u = dst_page->unlock_request |= VM_PROT_WRITE; + vm_object_unlock(dst_object); + + if ((rc = memory_object_data_unlock( + dst_object->pager, + dst_object->pager_request, + dst_offset + dst_object->paging_offset, + PAGE_SIZE, + u)) != KERN_SUCCESS) { + printf("vm_object_overwrite: memory_object_data_unlock failed\n"); + DISCARD_PAGE; + return((rc == MACH_SEND_INTERRUPTED) ? + VM_FAULT_INTERRUPTED : + VM_FAULT_MEMORY_ERROR); + } + vm_object_lock(dst_object); + continue; + } + + /* ... fall through to wait below */ + } else { + /* + * If the page isn't being used for other + * purposes, then we're done. + */ + if ( ! 
(dst_page->busy || dst_page->absent || dst_page->error) ) + break; + } + + PAGE_ASSERT_WAIT(dst_page, interruptible); + vm_object_unlock(dst_object); + thread_block((void (*)()) 0); + if (current_thread()->wait_result != THREAD_AWAKENED) { + DISCARD_PAGE; + return(VM_FAULT_INTERRUPTED); + } + } + + *result_page = dst_page; + return(VM_FAULT_SUCCESS); + +#undef interruptible +#undef DISCARD_PAGE +} + +#endif /* notdef */ diff --git a/vm/vm_fault.h b/vm/vm_fault.h new file mode 100644 index 0000000..ae692b1 --- /dev/null +++ b/vm/vm_fault.h @@ -0,0 +1,81 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_fault.h + * + * Page fault handling module declarations. + */ + +#ifndef _VM_VM_FAULT_H_ +#define _VM_VM_FAULT_H_ + +#include <mach/kern_return.h> +#include <mach/vm_prot.h> +#include <vm/vm_map.h> +#include <vm/vm_types.h> + +/* + * Page fault handling based on vm_object only. + */ + +typedef kern_return_t vm_fault_return_t; +#define VM_FAULT_SUCCESS 0 +#define VM_FAULT_RETRY 1 +#define VM_FAULT_INTERRUPTED 2 +#define VM_FAULT_MEMORY_SHORTAGE 3 +#define VM_FAULT_FICTITIOUS_SHORTAGE 4 +#define VM_FAULT_MEMORY_ERROR 5 + +typedef void (*vm_fault_continuation_t)(kern_return_t); +#define vm_fault_no_continuation ((vm_fault_continuation_t)0) + +extern void vm_fault_init(void); +extern vm_fault_return_t vm_fault_page(vm_object_t, vm_offset_t, vm_prot_t, + boolean_t, boolean_t, vm_prot_t *, + vm_page_t *, vm_page_t *, boolean_t, + continuation_t); + +extern void vm_fault_cleanup(vm_object_t, vm_page_t); +/* + * Page fault handling based on vm_map (or entries therein) + */ + +extern kern_return_t vm_fault(vm_map_t, vm_offset_t, vm_prot_t, boolean_t, + boolean_t, vm_fault_continuation_t); +extern void vm_fault_wire(vm_map_t, vm_map_entry_t); +extern void vm_fault_unwire(vm_map_t, vm_map_entry_t); + +/* Copy pages from one object to another. */ +extern kern_return_t vm_fault_copy(vm_object_t, vm_offset_t, vm_size_t *, + vm_object_t, vm_offset_t, vm_map_t, + vm_map_version_t *, boolean_t); + +kern_return_t vm_fault_wire_fast( + vm_map_t map, + vm_offset_t va, + vm_map_entry_t entry); + +#endif /* _VM_VM_FAULT_H_ */ diff --git a/vm/vm_init.c b/vm/vm_init.c new file mode 100644 index 0000000..593af11 --- /dev/null +++ b/vm/vm_init.c @@ -0,0 +1,88 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. 
+ * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_init.c + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * Date: 1985 + * + * Initialize the Virtual Memory subsystem. + */ + +#include <mach/machine/vm_types.h> +#include <kern/slab.h> +#include <kern/kalloc.h> +#include <vm/vm_fault.h> +#include <vm/vm_init.h> +#include <vm/vm_object.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> +#include <vm/memory_object.h> +#include <vm/memory_object_proxy.h> + + +/* + * vm_mem_bootstrap initializes the virtual memory system. + * This is done only by the first cpu up. + */ + +void vm_mem_bootstrap(void) +{ + vm_offset_t start, end; + + /* + * Initializes resident memory structures. + * From here on, all physical memory is accounted for, + * and we use only virtual addresses. + */ + + vm_page_bootstrap(&start, &end); + + /* + * Initialize other VM packages + */ + + slab_bootstrap(); + vm_object_bootstrap(); + vm_map_init(); + kmem_init(start, end); + pmap_init(); + slab_init(); + kalloc_init(); + vm_fault_init(); + vm_page_module_init(); + memory_manager_default_init(); +} + +void vm_mem_init(void) +{ + vm_object_init(); + memory_object_proxy_init(); + vm_page_info_all(); +} diff --git a/vm/vm_init.h b/vm/vm_init.h new file mode 100644 index 0000000..42ef48b --- /dev/null +++ b/vm/vm_init.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2013 Free Software Foundation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _VM_VM_INIT_H_ +#define _VM_VM_INIT_H_ + +extern void vm_mem_init(void); +extern void vm_mem_bootstrap(void); + +#endif /* _VM_VM_INIT_H_ */ diff --git a/vm/vm_kern.c b/vm/vm_kern.c new file mode 100644 index 0000000..51223d9 --- /dev/null +++ b/vm/vm_kern.c @@ -0,0 +1,1099 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. 
+ * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_kern.c + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * Date: 1985 + * + * Kernel memory management. + */ + +#include <string.h> + +#include <mach/kern_return.h> +#include <machine/locore.h> +#include <machine/vm_param.h> +#include <kern/assert.h> +#include <kern/debug.h> +#include <kern/lock.h> +#include <kern/slab.h> +#include <kern/thread.h> +#include <kern/printf.h> +#include <vm/pmap.h> +#include <vm/vm_fault.h> +#include <vm/vm_kern.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> + + + +/* + * Variables exported by this module. + */ + +static struct vm_map kernel_map_store; +vm_map_t kernel_map = &kernel_map_store; +vm_map_t kernel_pageable_map; + +/* + * projected_buffer_allocate + * + * Allocate a wired-down buffer shared between kernel and user task. + * Fresh, zero-filled memory is allocated. + * If persistence is false, this buffer can only be deallocated from + * user task using projected_buffer_deallocate, and deallocation + * from user task also deallocates the buffer from the kernel map. + * projected_buffer_collect is called from vm_map_deallocate to + * automatically deallocate projected buffers on task_deallocate. + * Sharing with more than one user task is achieved by using + * projected_buffer_map for the second and subsequent tasks. + * The user is precluded from manipulating the VM entry of this buffer + * (i.e. changing protection, inheritance or machine attributes). + */ + +kern_return_t +projected_buffer_allocate( + vm_map_t map, + vm_size_t size, + int persistence, + vm_offset_t *kernel_p, + vm_offset_t *user_p, + vm_prot_t protection, + vm_inherit_t inheritance) /*Currently only VM_INHERIT_NONE supported*/ +{ + vm_object_t object; + vm_map_entry_t u_entry, k_entry; + vm_offset_t addr; + phys_addr_t physical_addr; + vm_size_t r_size; + kern_return_t kr; + + if (map == VM_MAP_NULL || map == kernel_map) + return(KERN_INVALID_ARGUMENT); + + /* + * Allocate a new object. 
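+ * The backing object is created up front, before either map is
+ * locked; if setting up the user-side entry fails below, the kernel
+ * entry and the object are simply released again.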
+ */ + + size = round_page(size); + object = vm_object_allocate(size); + + vm_map_lock(kernel_map); + kr = vm_map_find_entry(kernel_map, &addr, size, (vm_offset_t) 0, + VM_OBJECT_NULL, &k_entry); + if (kr != KERN_SUCCESS) { + vm_map_unlock(kernel_map); + vm_object_deallocate(object); + return kr; + } + + k_entry->object.vm_object = object; + if (!persistence) + k_entry->projected_on = (vm_map_entry_t) -1; + /*Mark entry so as to automatically deallocate it when + last corresponding user entry is deallocated*/ + vm_map_unlock(kernel_map); + *kernel_p = addr; + + vm_map_lock(map); + kr = vm_map_find_entry(map, &addr, size, (vm_offset_t) 0, + VM_OBJECT_NULL, &u_entry); + if (kr != KERN_SUCCESS) { + vm_map_unlock(map); + vm_map_lock(kernel_map); + vm_map_entry_delete(kernel_map, k_entry); + vm_map_unlock(kernel_map); + vm_object_deallocate(object); + return kr; + } + + u_entry->object.vm_object = object; + vm_object_reference(object); + u_entry->projected_on = k_entry; + /*Creates coupling with kernel mapping of the buffer, and + also guarantees that user cannot directly manipulate + buffer VM entry*/ + u_entry->protection = protection; + u_entry->max_protection = protection; + u_entry->inheritance = inheritance; + vm_map_unlock(map); + *user_p = addr; + + /* + * Allocate wired-down memory in the object, + * and enter it in the kernel pmap. + */ + kmem_alloc_pages(object, 0, + *kernel_p, *kernel_p + size, + VM_PROT_READ | VM_PROT_WRITE); + memset((void*) *kernel_p, 0, size); /*Zero fill*/ + + /* Set up physical mappings for user pmap */ + + pmap_pageable(map->pmap, *user_p, *user_p + size, FALSE); + for (r_size = 0; r_size < size; r_size += PAGE_SIZE) { + physical_addr = pmap_extract(kernel_pmap, *kernel_p + r_size); + pmap_enter(map->pmap, *user_p + r_size, physical_addr, + protection, TRUE); + } + + return(KERN_SUCCESS); +} + + +/* + * projected_buffer_map + * + * Map an area of kernel memory onto a task's address space. + * No new memory is allocated; the area must previously exist in the + * kernel memory map. 
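+ *
+ * For illustration, sharing a buffer created earlier with
+ * projected_buffer_allocate with a second task might look roughly
+ * like this (second_map and kaddr are hypothetical):
+ *
+ *	vm_offset_t uaddr;
+ *	kern_return_t kr;
+ *
+ *	kr = projected_buffer_map(second_map, kaddr, 4 * PAGE_SIZE,
+ *				  &uaddr,
+ *				  VM_PROT_READ | VM_PROT_WRITE,
+ *				  VM_INHERIT_NONE);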
+ */ + +kern_return_t +projected_buffer_map( + vm_map_t map, + vm_offset_t kernel_addr, + vm_size_t size, + vm_offset_t *user_p, + vm_prot_t protection, + vm_inherit_t inheritance) /*Currently only VM_INHERIT_NONE supported*/ +{ + vm_map_entry_t u_entry, k_entry; + vm_offset_t user_addr; + phys_addr_t physical_addr; + vm_size_t r_size; + kern_return_t kr; + + /* + * Find entry in kernel map + */ + + size = round_page(size); + if (map == VM_MAP_NULL || map == kernel_map || + !vm_map_lookup_entry(kernel_map, kernel_addr, &k_entry) || + kernel_addr + size > k_entry->vme_end) + return(KERN_INVALID_ARGUMENT); + + + /* + * Create entry in user task + */ + + vm_map_lock(map); + kr = vm_map_find_entry(map, &user_addr, size, (vm_offset_t) 0, + VM_OBJECT_NULL, &u_entry); + if (kr != KERN_SUCCESS) { + vm_map_unlock(map); + return kr; + } + + u_entry->object.vm_object = k_entry->object.vm_object; + vm_object_reference(k_entry->object.vm_object); + u_entry->offset = kernel_addr - k_entry->vme_start + k_entry->offset; + u_entry->projected_on = k_entry; + /*Creates coupling with kernel mapping of the buffer, and + also guarantees that user cannot directly manipulate + buffer VM entry*/ + u_entry->protection = protection; + u_entry->max_protection = protection; + u_entry->inheritance = inheritance; + u_entry->wired_count = k_entry->wired_count; + vm_map_unlock(map); + *user_p = user_addr; + + /* Set up physical mappings for user pmap */ + + pmap_pageable(map->pmap, user_addr, user_addr + size, + !k_entry->wired_count); + for (r_size = 0; r_size < size; r_size += PAGE_SIZE) { + physical_addr = pmap_extract(kernel_pmap, kernel_addr + r_size); + pmap_enter(map->pmap, user_addr + r_size, physical_addr, + protection, k_entry->wired_count); + } + + return(KERN_SUCCESS); +} + + +/* + * projected_buffer_deallocate + * + * Unmap projected buffer from task's address space. + * May also unmap buffer from kernel map, if buffer is not + * persistent and only the kernel reference remains. + */ + +kern_return_t +projected_buffer_deallocate( + vm_map_t map, + vm_offset_t start, + vm_offset_t end) +{ + vm_map_entry_t entry, k_entry; + + if (map == VM_MAP_NULL || map == kernel_map) + return KERN_INVALID_ARGUMENT; + + vm_map_lock(map); + if (!vm_map_lookup_entry(map, start, &entry) || + end > entry->vme_end || + /*Check corresponding kernel entry*/ + (k_entry = entry->projected_on) == 0) { + vm_map_unlock(map); + return(KERN_INVALID_ARGUMENT); + } + + /*Prepare for deallocation*/ + if (entry->vme_start < start) + _vm_map_clip_start(&map->hdr, entry, start, 1); + if (entry->vme_end > end) + _vm_map_clip_end(&map->hdr, entry, end, 1); + if (map->first_free == entry) /*Adjust first_free hint*/ + map->first_free = entry->vme_prev; + entry->projected_on = 0; /*Needed to allow deletion*/ + entry->wired_count = 0; /*Avoid unwire fault*/ + vm_map_entry_delete(map, entry); + vm_map_unlock(map); + + /*Check if the buffer is not persistent and only the + kernel mapping remains, and if so delete it*/ + vm_map_lock(kernel_map); + if (k_entry->projected_on == (vm_map_entry_t) -1 && + k_entry->object.vm_object->ref_count == 1) { + if (kernel_map->first_free == k_entry) + kernel_map->first_free = k_entry->vme_prev; + k_entry->projected_on = 0; /*Allow unwire fault*/ + vm_map_entry_delete(kernel_map, k_entry); + } + vm_map_unlock(kernel_map); + return(KERN_SUCCESS); +} + + +/* + * projected_buffer_collect + * + * Unmap all projected buffers from task's address space. 
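+ * Called from vm_map_deallocate(), so any remaining projected
+ * buffers are released automatically when the last reference to the
+ * task's map goes away.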
+ */ + +kern_return_t +projected_buffer_collect(vm_map_t map) +{ + vm_map_entry_t entry, next; + + if (map == VM_MAP_NULL || map == kernel_map) + return(KERN_INVALID_ARGUMENT); + + for (entry = vm_map_first_entry(map); + entry != vm_map_to_entry(map); + entry = next) { + next = entry->vme_next; + if (entry->projected_on != 0) + projected_buffer_deallocate(map, entry->vme_start, entry->vme_end); + } + return(KERN_SUCCESS); +} + + +/* + * projected_buffer_in_range + * + * Verifies whether a projected buffer exists in the address range + * given. + */ + +boolean_t +projected_buffer_in_range( + vm_map_t map, + vm_offset_t start, + vm_offset_t end) +{ + vm_map_entry_t entry; + + if (map == VM_MAP_NULL || map == kernel_map) + return(FALSE); + + /*Find first entry*/ + if (!vm_map_lookup_entry(map, start, &entry)) + entry = entry->vme_next; + + while (entry != vm_map_to_entry(map) && entry->projected_on == 0 && + entry->vme_start <= end) { + entry = entry->vme_next; + } + return(entry != vm_map_to_entry(map) && entry->vme_start <= end); +} + + +/* + * kmem_alloc: + * + * Allocate wired-down memory in the kernel's address map + * or a submap. The memory is not zero-filled. + */ + +kern_return_t +kmem_alloc( + vm_map_t map, + vm_offset_t *addrp, + vm_size_t size) +{ + vm_object_t object; + vm_map_entry_t entry; + vm_offset_t addr; + unsigned int attempts; + kern_return_t kr; + + /* + * Allocate a new object. We must do this before locking + * the map, lest we risk deadlock with the default pager: + * device_read_alloc uses kmem_alloc, + * which tries to allocate an object, + * which uses kmem_alloc_wired to get memory, + * which blocks for pages. + * then the default pager needs to read a block + * to process a memory_object_data_write, + * and device_read_alloc calls kmem_alloc + * and deadlocks on the map lock. + */ + + size = round_page(size); + object = vm_object_allocate(size); + + attempts = 0; + +retry: + vm_map_lock(map); + kr = vm_map_find_entry(map, &addr, size, (vm_offset_t) 0, + VM_OBJECT_NULL, &entry); + if (kr != KERN_SUCCESS) { + vm_map_unlock(map); + + if (attempts == 0) { + attempts++; + slab_collect(); + goto retry; + } + + printf_once("no more room for kmem_alloc in %p (%s)\n", + map, map->name); + vm_object_deallocate(object); + return kr; + } + + entry->object.vm_object = object; + entry->offset = 0; + + /* + * Since we have not given out this address yet, + * it is safe to unlock the map. + */ + vm_map_unlock(map); + + /* + * Allocate wired-down memory in the kernel_object, + * for this entry, and enter it in the kernel pmap. + */ + kmem_alloc_pages(object, 0, + addr, addr + size, + VM_PROT_DEFAULT); + + /* + * Return the memory, not zeroed. + */ + *addrp = addr; + return KERN_SUCCESS; +} + +/* + * kmem_valloc: + * + * Allocate addressing space in the kernel's address map + * or a submap. The adressing space does not map anything. + */ + +kern_return_t +kmem_valloc( + vm_map_t map, + vm_offset_t *addrp, + vm_size_t size) +{ + vm_map_entry_t entry; + vm_offset_t offset; + vm_offset_t addr; + unsigned int attempts; + kern_return_t kr; + + /* + * Use the kernel object for wired-down kernel pages. + * Assume that no region of the kernel object is + * referenced more than once. We want vm_map_find_entry + * to extend an existing entry if possible. 
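+ *
+ * Note that kmem_alloc_wired() below is simply this routine
+ * followed by kmem_alloc_pages() on the returned range.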
+ */ + + size = round_page(size); + attempts = 0; + +retry: + vm_map_lock(map); + kr = vm_map_find_entry(map, &addr, size, (vm_offset_t) 0, + kernel_object, &entry); + if (kr != KERN_SUCCESS) { + vm_map_unlock(map); + + if (attempts == 0) { + attempts++; + slab_collect(); + goto retry; + } + + printf_once("no more room for kmem_valloc in %p (%s)\n", + map, map->name); + return kr; + } + + /* + * Since we didn't know where the new region would + * start, we couldn't supply the correct offset into + * the kernel object. We only initialize the entry + * if we aren't extending an existing entry. + */ + + offset = addr - VM_MIN_KERNEL_ADDRESS; + + if (entry->object.vm_object == VM_OBJECT_NULL) { + vm_object_reference(kernel_object); + + entry->object.vm_object = kernel_object; + entry->offset = offset; + } + + /* + * Since we have not given out this address yet, + * it is safe to unlock the map. + */ + vm_map_unlock(map); + + /* + * Return the memory, not mapped. + */ + *addrp = addr; + return KERN_SUCCESS; +} + +/* + * kmem_alloc_wired: + * + * Allocate wired-down memory in the kernel's address map + * or a submap. The memory is not zero-filled. + * + * The memory is allocated in the kernel_object. + * It may not be copied with vm_map_copy. + */ + +kern_return_t +kmem_alloc_wired( + vm_map_t map, + vm_offset_t *addrp, + vm_size_t size) +{ + vm_offset_t offset; + vm_offset_t addr; + kern_return_t kr; + + kr = kmem_valloc(map, &addr, size); + if (kr != KERN_SUCCESS) + return kr; + + offset = addr - VM_MIN_KERNEL_ADDRESS; + + /* + * Allocate wired-down memory in the kernel_object, + * for this entry, and enter it in the kernel pmap. + */ + kmem_alloc_pages(kernel_object, offset, + addr, addr + size, + VM_PROT_DEFAULT); + + /* + * Return the memory, not zeroed. + */ + *addrp = addr; + return KERN_SUCCESS; +} + +/* + * kmem_alloc_aligned: + * + * Like kmem_alloc_wired, except that the memory is aligned. + * The size should be a power-of-2. + */ + +kern_return_t +kmem_alloc_aligned( + vm_map_t map, + vm_offset_t *addrp, + vm_size_t size) +{ + vm_map_entry_t entry; + vm_offset_t offset; + vm_offset_t addr; + unsigned int attempts; + kern_return_t kr; + + if ((size & (size - 1)) != 0) + panic("kmem_alloc_aligned"); + + /* + * Use the kernel object for wired-down kernel pages. + * Assume that no region of the kernel object is + * referenced more than once. We want vm_map_find_entry + * to extend an existing entry if possible. + */ + + size = round_page(size); + attempts = 0; + +retry: + vm_map_lock(map); + kr = vm_map_find_entry(map, &addr, size, size - 1, + kernel_object, &entry); + if (kr != KERN_SUCCESS) { + vm_map_unlock(map); + + if (attempts == 0) { + attempts++; + slab_collect(); + goto retry; + } + + printf_once("no more room for kmem_alloc_aligned in %p (%s)\n", + map, map->name); + return kr; + } + + /* + * Since we didn't know where the new region would + * start, we couldn't supply the correct offset into + * the kernel object. We only initialize the entry + * if we aren't extending an existing entry. + */ + + offset = addr - VM_MIN_KERNEL_ADDRESS; + + if (entry->object.vm_object == VM_OBJECT_NULL) { + vm_object_reference(kernel_object); + + entry->object.vm_object = kernel_object; + entry->offset = offset; + } + + /* + * Since we have not given out this address yet, + * it is safe to unlock the map. + */ + vm_map_unlock(map); + + /* + * Allocate wired-down memory in the kernel_object, + * for this entry, and enter it in the kernel pmap. 
+ */ + kmem_alloc_pages(kernel_object, offset, + addr, addr + size, + VM_PROT_DEFAULT); + + /* + * Return the memory, not zeroed. + */ + *addrp = addr; + return KERN_SUCCESS; +} + +/* + * kmem_map_aligned_table: map a table or structure in a virtual memory page + * Align the table initial address with the page initial address. + * + * Parameters: + * phys_address: physical address, the start address of the table. + * size: size of the table. + * mode: access mode. VM_PROT_READ for read, VM_PROT_WRITE for write. + * + * Returns a reference to the virtual address if success, NULL if failure. + */ + +void* +kmem_map_aligned_table( + phys_addr_t phys_address, + vm_size_t size, + int mode) +{ + vm_offset_t virt_addr; + kern_return_t ret; + phys_addr_t into_page = phys_address % PAGE_SIZE; + phys_addr_t nearest_page = phys_address - into_page; + + size += into_page; + + ret = kmem_alloc_wired(kernel_map, &virt_addr, + round_page(size)); + + if (ret != KERN_SUCCESS) + return NULL; + + (void) pmap_map_bd(virt_addr, nearest_page, + nearest_page + round_page(size), mode); + + /* XXX remember mapping somewhere so we can free it? */ + + return (void *) (virt_addr + into_page); +} + +/* + * kmem_alloc_pageable: + * + * Allocate pageable memory in the kernel's address map. + */ + +kern_return_t +kmem_alloc_pageable( + vm_map_t map, + vm_offset_t *addrp, + vm_size_t size) +{ + vm_offset_t addr; + kern_return_t kr; + + addr = vm_map_min(map); + kr = vm_map_enter(map, &addr, round_page(size), + (vm_offset_t) 0, TRUE, + VM_OBJECT_NULL, (vm_offset_t) 0, FALSE, + VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT); + if (kr != KERN_SUCCESS) { + printf_once("no more room for kmem_alloc_pageable in %p (%s)\n", + map, map->name); + return kr; + } + + *addrp = addr; + return KERN_SUCCESS; +} + +/* + * kmem_free: + * + * Release a region of kernel virtual memory allocated + * with kmem_alloc, kmem_alloc_wired, or kmem_alloc_pageable, + * and return the physical pages associated with that region. + */ + +void +kmem_free( + vm_map_t map, + vm_offset_t addr, + vm_size_t size) +{ + kern_return_t kr; + + kr = vm_map_remove(map, trunc_page(addr), round_page(addr + size)); + if (kr != KERN_SUCCESS) + panic("kmem_free"); +} + +/* + * Allocate new wired pages in an object. + * The object is assumed to be mapped into the kernel map or + * a submap. + */ +void +kmem_alloc_pages( + vm_object_t object, + vm_offset_t offset, + vm_offset_t start, + vm_offset_t end, + vm_prot_t protection) +{ + /* + * Mark the pmap region as not pageable. + */ + pmap_pageable(kernel_pmap, start, end, FALSE); + + while (start < end) { + vm_page_t mem; + + vm_object_lock(object); + + /* + * Allocate a page + */ + while ((mem = vm_page_alloc(object, offset)) + == VM_PAGE_NULL) { + vm_object_unlock(object); + VM_PAGE_WAIT((void (*)()) 0); + vm_object_lock(object); + } + + /* + * Wire it down + */ + vm_page_lock_queues(); + vm_page_wire(mem); + vm_page_unlock_queues(); + vm_object_unlock(object); + + /* + * Enter it in the kernel pmap + */ + PMAP_ENTER(kernel_pmap, start, mem, + protection, TRUE); + + vm_object_lock(object); + PAGE_WAKEUP_DONE(mem); + vm_object_unlock(object); + + start += PAGE_SIZE; + offset += PAGE_SIZE; + } +} + +/* + * Remap wired pages in an object into a new region. + * The object is assumed to be mapped into the kernel map or + * a submap. + */ +void +kmem_remap_pages( + vm_object_t object, + vm_offset_t offset, + vm_offset_t start, + vm_offset_t end, + vm_prot_t protection) +{ + /* + * Mark the pmap region as not pageable. 
+ */ + pmap_pageable(kernel_pmap, start, end, FALSE); + + while (start < end) { + vm_page_t mem; + + vm_object_lock(object); + + /* + * Find a page + */ + if ((mem = vm_page_lookup(object, offset)) == VM_PAGE_NULL) + panic("kmem_remap_pages"); + + /* + * Wire it down (again) + */ + vm_page_lock_queues(); + vm_page_wire(mem); + vm_page_unlock_queues(); + vm_object_unlock(object); + + /* + * Enter it in the kernel pmap. The page isn't busy, + * but this shouldn't be a problem because it is wired. + */ + PMAP_ENTER(kernel_pmap, start, mem, + protection, TRUE); + + start += PAGE_SIZE; + offset += PAGE_SIZE; + } +} + +/* + * kmem_submap: + * + * Initializes a map to manage a subrange + * of the kernel virtual address space. + * + * Arguments are as follows: + * + * map Map to initialize + * parent Map to take range from + * size Size of range to find + * min, max Returned endpoints of map + * pageable Can the region be paged + */ + +void +kmem_submap( + vm_map_t map, + vm_map_t parent, + vm_offset_t *min, + vm_offset_t *max, + vm_size_t size) +{ + vm_offset_t addr; + kern_return_t kr; + + size = round_page(size); + + /* + * Need reference on submap object because it is internal + * to the vm_system. vm_object_enter will never be called + * on it (usual source of reference for vm_map_enter). + */ + vm_object_reference(vm_submap_object); + + addr = vm_map_min(parent); + kr = vm_map_enter(parent, &addr, size, + (vm_offset_t) 0, TRUE, + vm_submap_object, (vm_offset_t) 0, FALSE, + VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT); + if (kr != KERN_SUCCESS) + panic("kmem_submap"); + + pmap_reference(vm_map_pmap(parent)); + vm_map_setup(map, vm_map_pmap(parent), addr, addr + size); + kr = vm_map_submap(parent, addr, addr + size, map); + if (kr != KERN_SUCCESS) + panic("kmem_submap"); + + *min = addr; + *max = addr + size; +} + +/* + * kmem_init: + * + * Initialize the kernel's virtual memory map, taking + * into account all memory allocated up to this time. + */ +void kmem_init( + vm_offset_t start, + vm_offset_t end) +{ + vm_map_setup(kernel_map, pmap_kernel(), VM_MIN_KERNEL_ADDRESS, end); + + /* + * Reserve virtual memory allocated up to this time. + */ + if (start != VM_MIN_KERNEL_ADDRESS) { + kern_return_t rc; + vm_offset_t addr = VM_MIN_KERNEL_ADDRESS; + rc = vm_map_enter(kernel_map, + &addr, start - VM_MIN_KERNEL_ADDRESS, + (vm_offset_t) 0, TRUE, + VM_OBJECT_NULL, (vm_offset_t) 0, FALSE, + VM_PROT_DEFAULT, VM_PROT_ALL, + VM_INHERIT_DEFAULT); + if (rc) + panic("vm_map_enter failed (%d)\n", rc); + } +} + +/* + * New and improved IO wiring support. + */ + +/* + * kmem_io_map_copyout: + * + * Establish temporary mapping in designated map for the memory + * passed in. Memory format must be a page_list vm_map_copy. + * Mapping is READ-ONLY. + */ + +kern_return_t +kmem_io_map_copyout( + vm_map_t map, + vm_offset_t *addr, /* actual addr of data */ + vm_offset_t *alloc_addr, /* page aligned addr */ + vm_size_t *alloc_size, /* size allocated */ + vm_map_copy_t copy, + vm_size_t min_size) /* Do at least this much */ +{ + vm_offset_t myaddr, offset; + vm_size_t mysize, copy_size; + kern_return_t ret; + vm_page_t *page_list; + vm_map_copy_t new_copy; + int i; + + assert(copy->type == VM_MAP_COPY_PAGE_LIST); + assert(min_size != 0); + + /* + * Figure out the size in vm pages. 
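+ * For example, assuming 4K pages: with copy->offset = 0x1234 and
+ * copy->size = 0x3000, the data occupies pages 0x1000 through
+ * 0x4fff, so mysize becomes 0x4000 (four pages), while the page
+ * offset 0x234 is added back into min_size before it is rounded.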
+ */ + min_size += copy->offset - trunc_page(copy->offset); + min_size = round_page(min_size); + mysize = round_page(copy->offset + copy->size) - + trunc_page(copy->offset); + + /* + * If total size is larger than one page list and + * we don't have to do more than one page list, then + * only do one page list. + * + * XXX Could be much smarter about this ... like trimming length + * XXX if we need more than one page list but not all of them. + */ + + copy_size = ptoa(copy->cpy_npages); + if (mysize > copy_size && copy_size > min_size) + mysize = copy_size; + + /* + * Allocate some address space in the map (must be kernel + * space). + */ + myaddr = vm_map_min(map); + ret = vm_map_enter(map, &myaddr, mysize, + (vm_offset_t) 0, TRUE, + VM_OBJECT_NULL, (vm_offset_t) 0, FALSE, + VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT); + + if (ret != KERN_SUCCESS) + return(ret); + + /* + * Tell the pmap module that this will be wired, and + * enter the mappings. + */ + pmap_pageable(vm_map_pmap(map), myaddr, myaddr + mysize, TRUE); + + *addr = myaddr + (copy->offset - trunc_page(copy->offset)); + *alloc_addr = myaddr; + *alloc_size = mysize; + + offset = myaddr; + page_list = ©->cpy_page_list[0]; + while (TRUE) { + for ( i = 0; i < copy->cpy_npages; i++, offset += PAGE_SIZE) { + PMAP_ENTER(vm_map_pmap(map), offset, *page_list, + VM_PROT_READ, TRUE); + page_list++; + } + + if (offset == (myaddr + mysize)) + break; + + /* + * Onward to the next page_list. The extend_cont + * leaves the current page list's pages alone; + * they'll be cleaned up at discard. Reset this + * copy's continuation to discard the next one. + */ + vm_map_copy_invoke_extend_cont(copy, &new_copy, &ret); + + if (ret != KERN_SUCCESS) { + kmem_io_map_deallocate(map, myaddr, mysize); + return(ret); + } + copy->cpy_cont = vm_map_copy_discard_cont; + copy->cpy_cont_args = (vm_map_copyin_args_t)new_copy; + copy = new_copy; + page_list = ©->cpy_page_list[0]; + } + + return(ret); +} + +/* + * kmem_io_map_deallocate: + * + * Get rid of the mapping established by kmem_io_map_copyout. + * Assumes that addr and size have been rounded to page boundaries. + * (e.g., the alloc_addr and alloc_size returned by kmem_io_map_copyout) + */ + +void +kmem_io_map_deallocate( + vm_map_t map, + vm_offset_t addr, + vm_size_t size) +{ + /* + * Remove the mappings. The pmap_remove is needed. + */ + + pmap_remove(vm_map_pmap(map), addr, addr + size); + vm_map_remove(map, addr, addr + size); +} + +/* + * Routine: copyinmap + * Purpose: + * Like copyin, except that fromaddr is an address + * in the specified VM map. This implementation + * is incomplete; it handles the current user map + * and the kernel map/submaps. + */ + +int copyinmap( + vm_map_t map, + char *fromaddr, + char *toaddr, + int length) +{ + if (vm_map_pmap(map) == kernel_pmap) { + /* assume a correct copy */ + memcpy(toaddr, fromaddr, length); + return 0; + } + + if (current_map() == map) + return copyin( fromaddr, toaddr, length); + + return 1; +} + +/* + * Routine: copyoutmap + * Purpose: + * Like copyout, except that toaddr is an address + * in the specified VM map. This implementation + * is incomplete; it handles the current user map + * and the kernel map/submaps. 
+ */ + +int copyoutmap( + vm_map_t map, + char *fromaddr, + char *toaddr, + int length) +{ + if (vm_map_pmap(map) == kernel_pmap) { + /* assume a correct copy */ + memcpy(toaddr, fromaddr, length); + return 0; + } + + if (current_map() == map) + return copyout(fromaddr, toaddr, length); + + return 1; +} diff --git a/vm/vm_kern.h b/vm/vm_kern.h new file mode 100644 index 0000000..13115ff --- /dev/null +++ b/vm/vm_kern.h @@ -0,0 +1,100 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_kern.h + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * Date: 1985 + * + * Kernel memory management definitions. + */ + +#ifndef _VM_VM_KERN_H_ +#define _VM_VM_KERN_H_ + +#include <mach/kern_return.h> +#include <vm/vm_map.h> + +extern kern_return_t projected_buffer_allocate(vm_map_t, vm_size_t, int, + vm_offset_t *, vm_offset_t *, + vm_prot_t, vm_inherit_t); +extern kern_return_t projected_buffer_deallocate(vm_map_t, vm_offset_t, + vm_offset_t); +extern kern_return_t projected_buffer_map(vm_map_t, vm_offset_t, vm_size_t, + vm_offset_t *, vm_prot_t, + vm_inherit_t); +extern kern_return_t projected_buffer_collect(vm_map_t); + +extern void kmem_init(vm_offset_t, vm_offset_t); + +extern kern_return_t kmem_alloc(vm_map_t, vm_offset_t *, vm_size_t); +extern kern_return_t kmem_alloc_pageable(vm_map_t, vm_offset_t *, + vm_size_t); +extern kern_return_t kmem_valloc(vm_map_t, vm_offset_t *, vm_size_t); +extern kern_return_t kmem_alloc_wired(vm_map_t, vm_offset_t *, vm_size_t); +extern kern_return_t kmem_alloc_aligned(vm_map_t, vm_offset_t *, vm_size_t); +extern void* kmem_map_aligned_table(phys_addr_t, vm_size_t, int); + +extern void kmem_free(vm_map_t, vm_offset_t, vm_size_t); + +extern void kmem_submap(vm_map_t, vm_map_t, vm_offset_t *, + vm_offset_t *, vm_size_t); + +extern kern_return_t kmem_io_map_copyout(vm_map_t, vm_offset_t *, + vm_offset_t *, vm_size_t *, + vm_map_copy_t, vm_size_t); +extern void kmem_io_map_deallocate(vm_map_t, vm_offset_t, + vm_size_t); + +extern int +copyinmap (vm_map_t map, char *fromaddr, char *toaddr, int length); + +extern int +copyoutmap (vm_map_t map, char *fromaddr, char *toaddr, int length); + +extern vm_map_t kernel_map; +extern vm_map_t kernel_pageable_map; +extern vm_map_t ipc_kernel_map; + +extern boolean_t projected_buffer_in_range( + vm_map_t map, + vm_offset_t start, + vm_offset_t end); + +extern void kmem_alloc_pages( + vm_object_t object, + vm_offset_t offset, + vm_offset_t start, + vm_offset_t end, + vm_prot_t protection); + +extern void 
kmem_remap_pages( + vm_object_t object, + vm_offset_t offset, + vm_offset_t start, + vm_offset_t end, + vm_prot_t protection); + +#endif /* _VM_VM_KERN_H_ */ diff --git a/vm/vm_map.c b/vm/vm_map.c new file mode 100644 index 0000000..e454bb2 --- /dev/null +++ b/vm/vm_map.c @@ -0,0 +1,5237 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_map.c + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * Date: 1985 + * + * Virtual memory mapping module. + */ + +#include <kern/printf.h> +#include <mach/kern_return.h> +#include <mach/port.h> +#include <mach/vm_attributes.h> +#include <mach/vm_param.h> +#include <mach/vm_wire.h> +#include <kern/assert.h> +#include <kern/debug.h> +#include <kern/kalloc.h> +#include <kern/mach.server.h> +#include <kern/list.h> +#include <kern/rbtree.h> +#include <kern/slab.h> +#include <kern/mach4.server.h> +#include <vm/pmap.h> +#include <vm/vm_fault.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_resident.h> +#include <vm/vm_kern.h> +#include <vm/memory_object_proxy.h> +#include <ipc/ipc_port.h> +#include <string.h> + +#if MACH_KDB +#include <ddb/db_output.h> +#include <vm/vm_print.h> +#endif /* MACH_KDB */ + +/* + * Macros to copy a vm_map_entry. We must be careful to correctly + * manage the wired page count. vm_map_entry_copy() creates a new + * map entry to the same memory - the wired count in the new entry + * must be set to zero. vm_map_entry_copy_full() creates a new + * entry that is identical to the old entry. This preserves the + * wire count; it's used for map splitting and cache changing in + * vm_map_copyout. + */ +#define vm_map_entry_copy(NEW,OLD) \ +MACRO_BEGIN \ + *(NEW) = *(OLD); \ + (NEW)->is_shared = FALSE; \ + (NEW)->needs_wakeup = FALSE; \ + (NEW)->in_transition = FALSE; \ + (NEW)->wired_count = 0; \ + (NEW)->wired_access = VM_PROT_NONE; \ +MACRO_END + +#define vm_map_entry_copy_full(NEW,OLD) (*(NEW) = *(OLD)) + +/* + * Virtual memory maps provide for the mapping, protection, + * and sharing of virtual memory objects. In addition, + * this module provides for an efficient virtual copy of + * memory from one map to another. + * + * Synchronization is required prior to most operations. + * + * Maps consist of an ordered doubly-linked list of simple + * entries; a hint and a red-black tree are used to speed up lookups. 
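+ * Free space is tracked as well: each entry records the gap between
+ * itself and its successor, and non-empty gaps are kept in a second
+ * red-black tree (the gap tree) so that a large-enough hole can be
+ * found without walking the entire entry list.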
+ * + * Sharing maps have been deleted from this version of Mach. + * All shared objects are now mapped directly into the respective + * maps. This requires a change in the copy on write strategy; + * the asymmetric (delayed) strategy is used for shared temporary + * objects instead of the symmetric (shadow) strategy. This is + * selected by the (new) use_shared_copy bit in the object. See + * vm_object_copy_temporary in vm_object.c for details. All maps + * are now "top level" maps (either task map, kernel map or submap + * of the kernel map). + * + * Since portions of maps are specified by start/end addresses, + * which may not align with existing map entries, all + * routines merely "clip" entries to these start/end values. + * [That is, an entry is split into two, bordering at a + * start or end value.] Note that these clippings may not + * always be necessary (as the two resulting entries are then + * not changed); however, the clipping is done for convenience. + * The entries can later be "glued back together" (coalesced). + * + * The symmetric (shadow) copy strategy implements virtual copy + * by copying VM object references from one map to + * another, and then marking both regions as copy-on-write. + * It is important to note that only one writeable reference + * to a VM object region exists in any map when this strategy + * is used -- this means that shadow object creation can be + * delayed until a write operation occurs. The asymmetric (delayed) + * strategy allows multiple maps to have writeable references to + * the same region of a vm object, and hence cannot delay creating + * its copy objects. See vm_object_copy_temporary() in vm_object.c. + * Copying of permanent objects is completely different; see + * vm_object_copy_strategically() in vm_object.c. + */ + +struct kmem_cache vm_map_cache; /* cache for vm_map structures */ +struct kmem_cache vm_map_entry_cache; /* cache for vm_map_entry structures */ +struct kmem_cache vm_map_copy_cache; /* cache for vm_map_copy structures */ + +/* + * Placeholder object for submap operations. This object is dropped + * into the range by a call to vm_map_find, and removed when + * vm_map_submap creates the submap. + */ + +static struct vm_object vm_submap_object_store; +vm_object_t vm_submap_object = &vm_submap_object_store; + +/* + * vm_map_init: + * + * Initialize the vm_map module. Must be called before + * any other vm_map routines. + * + * Map and entry structures are allocated from caches -- we must + * initialize those caches. + * + * There are two caches of interest: + * + * vm_map_cache: used to allocate maps. + * vm_map_entry_cache: used to allocate map entries. + * + * We make sure the map entry cache allocates memory directly from the + * physical allocator to avoid recursion with this module. + */ + +void vm_map_init(void) +{ + kmem_cache_init(&vm_map_cache, "vm_map", sizeof(struct vm_map), 0, + NULL, 0); + kmem_cache_init(&vm_map_entry_cache, "vm_map_entry", + sizeof(struct vm_map_entry), 0, NULL, + KMEM_CACHE_NOOFFSLAB | KMEM_CACHE_PHYSMEM); + kmem_cache_init(&vm_map_copy_cache, "vm_map_copy", + sizeof(struct vm_map_copy), 0, NULL, 0); + + /* + * Submap object is initialized by vm_object_init. 
+ */ +} + +void vm_map_setup( + vm_map_t map, + pmap_t pmap, + vm_offset_t min, + vm_offset_t max) +{ + vm_map_first_entry(map) = vm_map_to_entry(map); + vm_map_last_entry(map) = vm_map_to_entry(map); + map->hdr.nentries = 0; + rbtree_init(&map->hdr.tree); + rbtree_init(&map->hdr.gap_tree); + + map->size = 0; + map->size_wired = 0; + map->ref_count = 1; + map->pmap = pmap; + map->min_offset = min; + map->max_offset = max; + map->wiring_required = FALSE; + map->wait_for_space = FALSE; + map->first_free = vm_map_to_entry(map); + map->hint = vm_map_to_entry(map); + map->name = NULL; + vm_map_lock_init(map); + simple_lock_init(&map->ref_lock); + simple_lock_init(&map->hint_lock); +} + +/* + * vm_map_create: + * + * Creates and returns a new empty VM map with + * the given physical map structure, and having + * the given lower and upper address bounds. + */ +vm_map_t vm_map_create( + pmap_t pmap, + vm_offset_t min, + vm_offset_t max) +{ + vm_map_t result; + + result = (vm_map_t) kmem_cache_alloc(&vm_map_cache); + if (result == VM_MAP_NULL) + return VM_MAP_NULL; + + vm_map_setup(result, pmap, min, max); + + return(result); +} + +void vm_map_lock(struct vm_map *map) +{ + lock_write(&map->lock); + + /* + * XXX Memory allocation may occur while a map is locked, + * for example when clipping entries. If the system is running + * low on memory, allocating may block until pages are + * available. But if a map used by the default pager is + * kept locked, a deadlock occurs. + * + * This workaround temporarily elevates the current thread + * VM privileges to avoid that particular deadlock, and does + * so regardless of the map for convenience, and because it's + * currently impossible to predict which map the default pager + * may depend on. + * + * This workaround isn't reliable, and only makes exhaustion + * less likely. In particular pageout may cause lots of data + * to be passed between the kernel and the pagers, often + * in the form of large copy maps. Making the minimum + * number of pages depend on the total number of pages + * should make exhaustion even less likely. + */ + + if (current_thread()) { + current_thread()->vm_privilege++; + assert(current_thread()->vm_privilege != 0); + } + + map->timestamp++; +} + +void vm_map_unlock(struct vm_map *map) +{ + if (current_thread()) { + current_thread()->vm_privilege--; + } + + lock_write_done(&map->lock); +} + +/* + * vm_map_entry_create: [ internal use only ] + * + * Allocates a VM map entry for insertion in the + * given map (or map copy). No fields are filled. + */ +#define vm_map_entry_create(map) \ + _vm_map_entry_create(&(map)->hdr) + +#define vm_map_copy_entry_create(copy) \ + _vm_map_entry_create(&(copy)->cpy_hdr) + +static vm_map_entry_t +_vm_map_entry_create(const struct vm_map_header *map_header) +{ + vm_map_entry_t entry; + + entry = (vm_map_entry_t) kmem_cache_alloc(&vm_map_entry_cache); + if (entry == VM_MAP_ENTRY_NULL) + panic("vm_map_entry_create"); + + return(entry); +} + +/* + * vm_map_entry_dispose: [ internal use only ] + * + * Inverse of vm_map_entry_create. 
+ */ +#define vm_map_entry_dispose(map, entry) \ + _vm_map_entry_dispose(&(map)->hdr, (entry)) + +#define vm_map_copy_entry_dispose(map, entry) \ + _vm_map_entry_dispose(&(copy)->cpy_hdr, (entry)) + +static void +_vm_map_entry_dispose(const struct vm_map_header *map_header, + vm_map_entry_t entry) +{ + (void)map_header; + + kmem_cache_free(&vm_map_entry_cache, (vm_offset_t) entry); +} + +/* + * Red-black tree lookup/insert comparison functions + */ +static inline int vm_map_entry_cmp_lookup(vm_offset_t addr, + const struct rbtree_node *node) +{ + struct vm_map_entry *entry; + + entry = rbtree_entry(node, struct vm_map_entry, tree_node); + + if (addr < entry->vme_start) + return -1; + else if (addr < entry->vme_end) + return 0; + else + return 1; +} + +static inline int vm_map_entry_cmp_insert(const struct rbtree_node *a, + const struct rbtree_node *b) +{ + struct vm_map_entry *entry; + + entry = rbtree_entry(a, struct vm_map_entry, tree_node); + return vm_map_entry_cmp_lookup(entry->vme_start, b); +} + +/* + * Gap management functions + */ +static inline int vm_map_entry_gap_cmp_lookup(vm_size_t gap_size, + const struct rbtree_node *node) +{ + struct vm_map_entry *entry; + + entry = rbtree_entry(node, struct vm_map_entry, gap_node); + + if (gap_size < entry->gap_size) + return -1; + else if (gap_size == entry->gap_size) + return 0; + else + return 1; +} + +static inline int vm_map_entry_gap_cmp_insert(const struct rbtree_node *a, + const struct rbtree_node *b) +{ + struct vm_map_entry *entry; + + entry = rbtree_entry(a, struct vm_map_entry, gap_node); + return vm_map_entry_gap_cmp_lookup(entry->gap_size, b); +} + +static int +vm_map_gap_valid(struct vm_map_header *hdr, struct vm_map_entry *entry) +{ + return entry != (struct vm_map_entry *)&hdr->links; +} + +static void +vm_map_gap_compute(struct vm_map_header *hdr, struct vm_map_entry *entry) +{ + struct vm_map_entry *next; + + next = entry->vme_next; + + if (vm_map_gap_valid(hdr, next)) { + entry->gap_size = next->vme_start - entry->vme_end; + } else { + entry->gap_size = hdr->vme_end - entry->vme_end; + } +} + +static void +vm_map_gap_insert_single(struct vm_map_header *hdr, struct vm_map_entry *entry) +{ + struct vm_map_entry *tmp; + struct rbtree_node *node; + unsigned long slot; + + if (!vm_map_gap_valid(hdr, entry)) { + return; + } + + vm_map_gap_compute(hdr, entry); + + if (entry->gap_size == 0) { + return; + } + + node = rbtree_lookup_slot(&hdr->gap_tree, entry->gap_size, + vm_map_entry_gap_cmp_lookup, slot); + + if (node == NULL) { + rbtree_insert_slot(&hdr->gap_tree, slot, &entry->gap_node); + list_init(&entry->gap_list); + entry->in_gap_tree = 1; + } else { + tmp = rbtree_entry(node, struct vm_map_entry, gap_node); + list_insert_tail(&tmp->gap_list, &entry->gap_list); + entry->in_gap_tree = 0; + } +} + +static void +vm_map_gap_remove_single(struct vm_map_header *hdr, struct vm_map_entry *entry) +{ + struct vm_map_entry *tmp; + + if (!vm_map_gap_valid(hdr, entry)) { + return; + } + + if (entry->gap_size == 0) { + return; + } + + if (!entry->in_gap_tree) { + list_remove(&entry->gap_list); + return; + } + + rbtree_remove(&hdr->gap_tree, &entry->gap_node); + + if (list_empty(&entry->gap_list)) { + return; + } + + tmp = list_first_entry(&entry->gap_list, struct vm_map_entry, gap_list); + assert(tmp->gap_size == entry->gap_size); + list_remove(&tmp->gap_list); + list_set_head(&tmp->gap_list, &entry->gap_list); + assert(!tmp->in_gap_tree); + rbtree_insert(&hdr->gap_tree, &tmp->gap_node, + vm_map_entry_gap_cmp_insert); + 
tmp->in_gap_tree = 1; +} + +static void +vm_map_gap_update(struct vm_map_header *hdr, struct vm_map_entry *entry) +{ + vm_map_gap_remove_single(hdr, entry); + vm_map_gap_insert_single(hdr, entry); +} + +static void +vm_map_gap_insert(struct vm_map_header *hdr, struct vm_map_entry *entry) +{ + vm_map_gap_remove_single(hdr, entry->vme_prev); + vm_map_gap_insert_single(hdr, entry->vme_prev); + vm_map_gap_insert_single(hdr, entry); +} + +static void +vm_map_gap_remove(struct vm_map_header *hdr, struct vm_map_entry *entry) +{ + vm_map_gap_remove_single(hdr, entry); + vm_map_gap_remove_single(hdr, entry->vme_prev); + vm_map_gap_insert_single(hdr, entry->vme_prev); +} + +/* + * vm_map_entry_{un,}link: + * + * Insert/remove entries from maps (or map copies). + * + * The start and end addresses of the entries must be properly set + * before using these macros. + */ +#define vm_map_entry_link(map, after_where, entry) \ + _vm_map_entry_link(&(map)->hdr, after_where, entry, 1) + +#define vm_map_copy_entry_link(copy, after_where, entry) \ + _vm_map_entry_link(&(copy)->cpy_hdr, after_where, entry, 0) + +#define _vm_map_entry_link(hdr, after_where, entry, link_gap) \ + MACRO_BEGIN \ + (hdr)->nentries++; \ + (entry)->vme_prev = (after_where); \ + (entry)->vme_next = (after_where)->vme_next; \ + (entry)->vme_prev->vme_next = \ + (entry)->vme_next->vme_prev = (entry); \ + rbtree_insert(&(hdr)->tree, &(entry)->tree_node, \ + vm_map_entry_cmp_insert); \ + if (link_gap) \ + vm_map_gap_insert((hdr), (entry)); \ + MACRO_END + +#define vm_map_entry_unlink(map, entry) \ + _vm_map_entry_unlink(&(map)->hdr, entry, 1) + +#define vm_map_copy_entry_unlink(copy, entry) \ + _vm_map_entry_unlink(&(copy)->cpy_hdr, entry, 0) + +#define _vm_map_entry_unlink(hdr, entry, unlink_gap) \ + MACRO_BEGIN \ + (hdr)->nentries--; \ + (entry)->vme_next->vme_prev = (entry)->vme_prev; \ + (entry)->vme_prev->vme_next = (entry)->vme_next; \ + rbtree_remove(&(hdr)->tree, &(entry)->tree_node); \ + if (unlink_gap) \ + vm_map_gap_remove((hdr), (entry)); \ + MACRO_END + +/* + * vm_map_reference: + * + * Creates another valid reference to the given map. + * + */ +void vm_map_reference(vm_map_t map) +{ + if (map == VM_MAP_NULL) + return; + + simple_lock(&map->ref_lock); + map->ref_count++; + simple_unlock(&map->ref_lock); +} + +/* + * vm_map_deallocate: + * + * Removes a reference from the specified map, + * destroying it if no references remain. + * The map should not be locked. + */ +void vm_map_deallocate(vm_map_t map) +{ + int c; + + if (map == VM_MAP_NULL) + return; + + simple_lock(&map->ref_lock); + c = --map->ref_count; + simple_unlock(&map->ref_lock); + + if (c > 0) { + return; + } + + projected_buffer_collect(map); + (void) vm_map_delete(map, map->min_offset, map->max_offset); + + pmap_destroy(map->pmap); + + kmem_cache_free(&vm_map_cache, (vm_offset_t) map); +} + +/* + * SAVE_HINT: + * + * Saves the specified entry as the hint for + * future lookups. Performs necessary interlocks. + */ +#define SAVE_HINT(map,value) \ + simple_lock(&(map)->hint_lock); \ + (map)->hint = (value); \ + simple_unlock(&(map)->hint_lock); + +/* + * vm_map_lookup_entry: [ internal use only ] + * + * Finds the map entry containing (or + * immediately preceding) the specified address + * in the given map; the entry is returned + * in the "entry" parameter. The boolean + * result indicates whether the address is + * actually contained in the map. 
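+ *
+ * Typical use (illustrative):
+ *
+ *	vm_map_entry_t entry;
+ *
+ *	if (vm_map_lookup_entry(map, addr, &entry)) {
+ *		... addr lies within [entry->vme_start, entry->vme_end) ...
+ *	} else {
+ *		... entry is the entry preceding addr, or the map
+ *		    header if addr is below the first entry ...
+ *	}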
+ */ +boolean_t vm_map_lookup_entry( + vm_map_t map, + vm_offset_t address, + vm_map_entry_t *entry) /* OUT */ +{ + struct rbtree_node *node; + vm_map_entry_t hint; + + /* + * First, make a quick check to see if we are already + * looking at the entry we want (which is often the case). + */ + + simple_lock(&map->hint_lock); + hint = map->hint; + simple_unlock(&map->hint_lock); + + if ((hint != vm_map_to_entry(map)) && (address >= hint->vme_start)) { + if (address < hint->vme_end) { + *entry = hint; + return(TRUE); + } else { + vm_map_entry_t next = hint->vme_next; + + if ((next == vm_map_to_entry(map)) + || (address < next->vme_start)) { + *entry = hint; + return(FALSE); + } + } + } + + /* + * If the hint didn't help, use the red-black tree. + */ + + node = rbtree_lookup_nearest(&map->hdr.tree, address, + vm_map_entry_cmp_lookup, RBTREE_LEFT); + + if (node == NULL) { + *entry = vm_map_to_entry(map); + SAVE_HINT(map, *entry); + return(FALSE); + } else { + *entry = rbtree_entry(node, struct vm_map_entry, tree_node); + SAVE_HINT(map, *entry); + return((address < (*entry)->vme_end) ? TRUE : FALSE); + } +} + +/* + * Find a range of available space from the specified map. + * + * If successful, this function returns the map entry immediately preceding + * the range, and writes the range address in startp. If the map contains + * no entry, the entry returned points to the map header. + * Otherwise, NULL is returned. + * + * If map_locked is true, this function will not wait for more space in case + * of failure. Otherwise, the map is locked. + */ +static struct vm_map_entry * +vm_map_find_entry_anywhere(struct vm_map *map, + vm_size_t size, + vm_offset_t mask, + boolean_t map_locked, + vm_offset_t *startp) +{ + struct vm_map_entry *entry; + struct rbtree_node *node; + vm_size_t max_size; + vm_offset_t start, end; + vm_offset_t max; + + assert(size != 0); + + max = map->max_offset; + if (((mask + 1) & mask) != 0) { + /* We have high bits in addition to the low bits */ + + int first0 = __builtin_ffs(~mask); /* First zero after low bits */ + vm_offset_t lowmask = (1UL << (first0-1)) - 1; /* low bits */ + vm_offset_t himask = mask - lowmask; /* high bits */ + int second1 = __builtin_ffs(himask); /* First one after low bits */ + + max = 1UL << (second1-1); + + if (himask + max != 0) { + /* high bits do not continue up to the end */ + printf("invalid mask %zx\n", mask); + return NULL; + } + + mask = lowmask; + } + + if (!map_locked) { + vm_map_lock(map); + } + +restart: + if (map->hdr.nentries == 0) { + entry = vm_map_to_entry(map); + start = (map->min_offset + mask) & ~mask; + end = start + size; + + if ((start < map->min_offset) || (end <= start) || (end > max)) { + goto error; + } + + *startp = start; + return entry; + } + + entry = map->first_free; + + if (entry != vm_map_to_entry(map)) { + start = (entry->vme_end + mask) & ~mask; + end = start + size; + + if ((start >= entry->vme_end) + && (end > start) + && (end <= max) + && (end <= (entry->vme_end + entry->gap_size))) { + *startp = start; + return entry; + } + } + + max_size = size + mask; + + if (max_size < size) { + printf("max_size %zd got smaller than size %zd with mask %zd\n", + max_size, size, mask); + goto error; + } + + node = rbtree_lookup_nearest(&map->hdr.gap_tree, max_size, + vm_map_entry_gap_cmp_lookup, RBTREE_RIGHT); + + if (node == NULL) { + if (map_locked || !map->wait_for_space) { + goto error; + } + + assert_wait((event_t)map, TRUE); + vm_map_unlock(map); + thread_block(NULL); + vm_map_lock(map); + goto restart; + } + + entry = 
rbtree_entry(node, struct vm_map_entry, gap_node); + assert(entry->in_gap_tree); + + if (!list_empty(&entry->gap_list)) { + entry = list_last_entry(&entry->gap_list, + struct vm_map_entry, gap_list); + } + + assert(entry->gap_size >= max_size); + start = (entry->vme_end + mask) & ~mask; + assert(start >= entry->vme_end); + end = start + size; + assert(end > start); + assert(end <= (entry->vme_end + entry->gap_size)); + if (end > max) { + /* Does not respect the allowed maximum */ + printf("%zx does not respect %zx\n", end, max); + return NULL; + } + *startp = start; + return entry; + +error: + printf("no more room in %p (%s)\n", map, map->name); + return NULL; +} + +/* + * Routine: vm_map_find_entry + * Purpose: + * Allocate a range in the specified virtual address map, + * returning the entry allocated for that range. + * Used by kmem_alloc, etc. Returns wired entries. + * + * The map must be locked. + * + * If an entry is allocated, the object/offset fields + * are initialized to zero. If an object is supplied, + * then an existing entry may be extended. + */ +kern_return_t vm_map_find_entry( + vm_map_t map, + vm_offset_t *address, /* OUT */ + vm_size_t size, + vm_offset_t mask, + vm_object_t object, + vm_map_entry_t *o_entry) /* OUT */ +{ + vm_map_entry_t entry, new_entry; + vm_offset_t start; + vm_offset_t end; + + entry = vm_map_find_entry_anywhere(map, size, mask, TRUE, &start); + + if (entry == NULL) { + return KERN_NO_SPACE; + } + + end = start + size; + + /* + * At this point, + * "start" and "end" should define the endpoints of the + * available new range, and + * "entry" should refer to the region before the new + * range, and + * + * the map should be locked. + */ + + *address = start; + + /* + * See whether we can avoid creating a new entry by + * extending one of our neighbors. [So far, we only attempt to + * extend from below.] + */ + + if ((object != VM_OBJECT_NULL) && + (entry != vm_map_to_entry(map)) && + (entry->vme_end == start) && + (!entry->is_shared) && + (!entry->is_sub_map) && + (entry->object.vm_object == object) && + (entry->needs_copy == FALSE) && + (entry->inheritance == VM_INHERIT_DEFAULT) && + (entry->protection == VM_PROT_DEFAULT) && + (entry->max_protection == VM_PROT_ALL) && + (entry->wired_count != 0) && + (entry->projected_on == 0)) { + /* + * Because this is a special case, + * we don't need to use vm_object_coalesce. 
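 *
 * Editorial worked example, not part of the original source: the start
 * returned by vm_map_find_entry_anywhere() above is computed as
 * (vme_end + mask) & ~mask. With mask = 0xfff (page alignment) and
 * vme_end = 0x12345 this gives (0x12345 + 0xfff) & ~0xfff = 0x13000.
 * When vme_end is already aligned, start == vme_end, the checks above
 * succeed, and the wired kernel entry is simply extended in place
 * instead of allocating a new entry.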
+ */ + + entry->vme_end = end; + vm_map_gap_update(&map->hdr, entry); + new_entry = entry; + } else { + new_entry = vm_map_entry_create(map); + + new_entry->vme_start = start; + new_entry->vme_end = end; + + new_entry->is_shared = FALSE; + new_entry->is_sub_map = FALSE; + new_entry->object.vm_object = VM_OBJECT_NULL; + new_entry->offset = (vm_offset_t) 0; + + new_entry->needs_copy = FALSE; + + new_entry->inheritance = VM_INHERIT_DEFAULT; + new_entry->protection = VM_PROT_DEFAULT; + new_entry->max_protection = VM_PROT_ALL; + new_entry->wired_count = 1; + new_entry->wired_access = VM_PROT_DEFAULT; + + new_entry->in_transition = FALSE; + new_entry->needs_wakeup = FALSE; + new_entry->projected_on = 0; + + /* + * Insert the new entry into the list + */ + + vm_map_entry_link(map, entry, new_entry); + } + + map->size += size; + + /* + * Update the free space hint and the lookup hint + */ + + map->first_free = new_entry; + SAVE_HINT(map, new_entry); + + *o_entry = new_entry; + return(KERN_SUCCESS); +} + +boolean_t vm_map_pmap_enter_print = FALSE; +boolean_t vm_map_pmap_enter_enable = FALSE; + +/* + * Routine: vm_map_pmap_enter + * + * Description: + * Force pages from the specified object to be entered into + * the pmap at the specified address if they are present. + * As soon as a page not found in the object the scan ends. + * + * Returns: + * Nothing. + * + * In/out conditions: + * The source map should not be locked on entry. + */ +static void +vm_map_pmap_enter( + vm_map_t map, + vm_offset_t addr, + vm_offset_t end_addr, + vm_object_t object, + vm_offset_t offset, + vm_prot_t protection) +{ + while (addr < end_addr) { + vm_page_t m; + + vm_object_lock(object); + vm_object_paging_begin(object); + + m = vm_page_lookup(object, offset); + if (m == VM_PAGE_NULL || m->absent) { + vm_object_paging_end(object); + vm_object_unlock(object); + return; + } + + if (vm_map_pmap_enter_print) { + printf("vm_map_pmap_enter:"); + printf("map: %p, addr: %zx, object: %p, offset: %zx\n", + map, addr, object, offset); + } + + m->busy = TRUE; + vm_object_unlock(object); + + PMAP_ENTER(map->pmap, addr, m, + protection, FALSE); + + vm_object_lock(object); + PAGE_WAKEUP_DONE(m); + vm_page_lock_queues(); + if (!m->active && !m->inactive) + vm_page_activate(m); + vm_page_unlock_queues(); + vm_object_paging_end(object); + vm_object_unlock(object); + + offset += PAGE_SIZE; + addr += PAGE_SIZE; + } +} + +/* + * Routine: vm_map_enter + * + * Description: + * Allocate a range in the specified virtual address map. + * The resulting range will refer to memory defined by + * the given memory object and offset into that object. + * + * Arguments are as defined in the vm_map call. 
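 *
 * Editorial usage sketch, not part of the original source: an anonymous
 * "anywhere" allocation, roughly what a vm_allocate-style request would
 * pass (map and size are placeholders here):
 *
 *        vm_offset_t addr = 0;
 *        kern_return_t kr;
 *
 *        kr = vm_map_enter(map, &addr, round_page(size), (vm_offset_t) 0,
 *                          TRUE, VM_OBJECT_NULL, (vm_offset_t) 0, FALSE,
 *                          VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
 *
 * On success, addr holds the start of the new range; the backing
 * zero-fill object is created lazily on first fault.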
+ */ +kern_return_t vm_map_enter( + vm_map_t map, + vm_offset_t *address, /* IN/OUT */ + vm_size_t size, + vm_offset_t mask, + boolean_t anywhere, + vm_object_t object, + vm_offset_t offset, + boolean_t needs_copy, + vm_prot_t cur_protection, + vm_prot_t max_protection, + vm_inherit_t inheritance) +{ + vm_map_entry_t entry; + vm_map_entry_t next_entry; + vm_offset_t start; + vm_offset_t end; + kern_return_t result = KERN_SUCCESS; + +#define RETURN(value) { result = value; goto BailOut; } + + if (size == 0) + return KERN_INVALID_ARGUMENT; + + start = *address; + + if (anywhere) { + entry = vm_map_find_entry_anywhere(map, size, mask, FALSE, &start); + + if (entry == NULL) { + RETURN(KERN_NO_SPACE); + } + + end = start + size; + *address = start; + next_entry = entry->vme_next; + } else { + vm_map_entry_t temp_entry; + + /* + * Verify that: + * the address doesn't itself violate + * the mask requirement. + */ + + if ((start & mask) != 0) + return(KERN_NO_SPACE); + + vm_map_lock(map); + + /* + * ... the address is within bounds + */ + + end = start + size; + + if ((start < map->min_offset) || + (end > map->max_offset) || + (start >= end)) { + RETURN(KERN_INVALID_ADDRESS); + } + + /* + * ... the starting address isn't allocated + */ + + if (vm_map_lookup_entry(map, start, &temp_entry)) + RETURN(KERN_NO_SPACE); + + entry = temp_entry; + next_entry = entry->vme_next; + + /* + * ... the next region doesn't overlap the + * end point. + */ + + if ((next_entry != vm_map_to_entry(map)) && + (next_entry->vme_start < end)) + RETURN(KERN_NO_SPACE); + } + + /* + * At this point, + * "start" and "end" should define the endpoints of the + * available new range, and + * "entry" should refer to the region before the new + * range, and + * + * the map should be locked. + */ + + /* + * See whether we can avoid creating a new entry (and object) by + * extending one of our neighbors. + */ + + if ((entry != vm_map_to_entry(map)) && + (entry->vme_end == start) && + (!entry->is_shared) && + (!entry->is_sub_map) && + (entry->inheritance == inheritance) && + (entry->protection == cur_protection) && + (entry->max_protection == max_protection) && + (entry->wired_count == 0) && + (entry->projected_on == 0)) { + if (vm_object_coalesce(entry->object.vm_object, + object, + entry->offset, + offset, + (vm_size_t)(entry->vme_end - entry->vme_start), + size, + &entry->object.vm_object, + &entry->offset)) { + + /* + * Coalesced the two objects - can extend + * the previous map entry to include the + * new range. + */ + map->size += size; + entry->vme_end = end; + vm_map_gap_update(&map->hdr, entry); + /* + * Now that we did, perhaps we could simplify + * things even further by coalescing the next + * entry into the one we just extended. + */ + vm_map_coalesce_entry(map, next_entry); + RETURN(KERN_SUCCESS); + } + } + if ((next_entry != vm_map_to_entry(map)) && + (next_entry->vme_start == end) && + (!next_entry->is_shared) && + (!next_entry->is_sub_map) && + (next_entry->inheritance == inheritance) && + (next_entry->protection == cur_protection) && + (next_entry->max_protection == max_protection) && + (next_entry->wired_count == 0) && + (next_entry->projected_on == 0)) { + if (vm_object_coalesce(object, + next_entry->object.vm_object, + offset, + next_entry->offset, + size, + (vm_size_t)(next_entry->vme_end - next_entry->vme_start), + &next_entry->object.vm_object, + &next_entry->offset)) { + + /* + * Coalesced the two objects - can extend + * the next map entry to include the + * new range. 
+ */ + map->size += size; + next_entry->vme_start = start; + vm_map_gap_update(&map->hdr, entry); + /* + * Now that we did, perhaps we could simplify + * things even further by coalescing the + * entry into the previous one. + */ + vm_map_coalesce_entry(map, next_entry); + RETURN(KERN_SUCCESS); + } + } + + /* + * Create a new entry + */ + + /**/ { + vm_map_entry_t new_entry; + + new_entry = vm_map_entry_create(map); + + new_entry->vme_start = start; + new_entry->vme_end = end; + + new_entry->is_shared = FALSE; + new_entry->is_sub_map = FALSE; + new_entry->object.vm_object = object; + new_entry->offset = offset; + + new_entry->needs_copy = needs_copy; + + new_entry->inheritance = inheritance; + new_entry->protection = cur_protection; + new_entry->max_protection = max_protection; + new_entry->wired_count = 0; + new_entry->wired_access = VM_PROT_NONE; + + new_entry->in_transition = FALSE; + new_entry->needs_wakeup = FALSE; + new_entry->projected_on = 0; + + /* + * Insert the new entry into the list + */ + + vm_map_entry_link(map, entry, new_entry); + map->size += size; + + /* + * Update the free space hint and the lookup hint + */ + + if ((map->first_free == entry) && + ((entry == vm_map_to_entry(map) ? map->min_offset : entry->vme_end) + >= new_entry->vme_start)) + map->first_free = new_entry; + + SAVE_HINT(map, new_entry); + + if (map->wiring_required) { + /* Returns with the map read-locked if successful */ + result = vm_map_pageable(map, start, end, cur_protection, FALSE, FALSE); + + if (result != KERN_SUCCESS) { + RETURN(KERN_SUCCESS); + } + } + + vm_map_unlock(map); + + if ((object != VM_OBJECT_NULL) && + (vm_map_pmap_enter_enable) && + (!anywhere) && + (!needs_copy) && + (size < (128*1024))) { + vm_map_pmap_enter(map, start, end, + object, offset, cur_protection); + } + + return(result); + /**/ } + + BailOut: ; + + vm_map_unlock(map); + return(result); + +#undef RETURN +} + +/* + * vm_map_clip_start: [ internal use only ] + * + * Asserts that the given entry begins at or after + * the specified address; if necessary, + * it splits the entry into two. + */ +#define vm_map_clip_start(map, entry, startaddr) \ + MACRO_BEGIN \ + if ((startaddr) > (entry)->vme_start) \ + _vm_map_clip_start(&(map)->hdr,(entry),(startaddr),1); \ + MACRO_END + +#define vm_map_copy_clip_start(copy, entry, startaddr) \ + MACRO_BEGIN \ + if ((startaddr) > (entry)->vme_start) \ + _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr),0); \ + MACRO_END + +/* + * This routine is called only when it is known that + * the entry must be split. + */ +void _vm_map_clip_start( + struct vm_map_header *map_header, + vm_map_entry_t entry, + vm_offset_t start, + boolean_t link_gap) +{ + vm_map_entry_t new_entry; + + /* + * Split off the front portion -- + * note that we must insert the new + * entry BEFORE this one, so that + * this entry has the specified starting + * address. + */ + + new_entry = _vm_map_entry_create(map_header); + vm_map_entry_copy_full(new_entry, entry); + + new_entry->vme_end = start; + entry->offset += (start - entry->vme_start); + entry->vme_start = start; + + _vm_map_entry_link(map_header, entry->vme_prev, new_entry, link_gap); + + if (entry->is_sub_map) + vm_map_reference(new_entry->object.sub_map); + else + vm_object_reference(new_entry->object.vm_object); +} + +/* + * vm_map_clip_end: [ internal use only ] + * + * Asserts that the given entry ends at or before + * the specified address; if necessary, + * it splits the entry into two. 
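 *
 * Editorial example, not part of the original source: clipping an entry
 * that covers [0x1000, 0x5000) with vm_map_clip_end(map, entry, 0x3000)
 * shrinks it to [0x1000, 0x3000) and links a new entry for
 * [0x3000, 0x5000) right after it, with the new entry's offset into the
 * backing object advanced by 0x2000. vm_map_clip_start() is the mirror
 * image and splits off the front portion instead.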
+ */ +#define vm_map_clip_end(map, entry, endaddr) \ + MACRO_BEGIN \ + if ((endaddr) < (entry)->vme_end) \ + _vm_map_clip_end(&(map)->hdr,(entry),(endaddr),1); \ + MACRO_END + +#define vm_map_copy_clip_end(copy, entry, endaddr) \ + MACRO_BEGIN \ + if ((endaddr) < (entry)->vme_end) \ + _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr),0); \ + MACRO_END + +/* + * This routine is called only when it is known that + * the entry must be split. + */ +void _vm_map_clip_end( + struct vm_map_header *map_header, + vm_map_entry_t entry, + vm_offset_t end, + boolean_t link_gap) +{ + vm_map_entry_t new_entry; + + /* + * Create a new entry and insert it + * AFTER the specified entry + */ + + new_entry = _vm_map_entry_create(map_header); + vm_map_entry_copy_full(new_entry, entry); + + new_entry->vme_start = entry->vme_end = end; + new_entry->offset += (end - entry->vme_start); + + _vm_map_entry_link(map_header, entry, new_entry, link_gap); + + if (entry->is_sub_map) + vm_map_reference(new_entry->object.sub_map); + else + vm_object_reference(new_entry->object.vm_object); +} + +/* + * VM_MAP_RANGE_CHECK: [ internal use only ] + * + * Asserts that the starting and ending region + * addresses fall within the valid range of the map. + */ +#define VM_MAP_RANGE_CHECK(map, start, end) \ + { \ + if (start < vm_map_min(map)) \ + start = vm_map_min(map); \ + if (end > vm_map_max(map)) \ + end = vm_map_max(map); \ + if (start > end) \ + start = end; \ + } + +/* + * vm_map_submap: [ kernel use only ] + * + * Mark the given range as handled by a subordinate map. + * + * This range must have been created with vm_map_find using + * the vm_submap_object, and no other operations may have been + * performed on this range prior to calling vm_map_submap. + * + * Only a limited number of operations can be performed + * within this rage after calling vm_map_submap: + * vm_fault + * [Don't try vm_map_copyin!] + * + * To remove a submapping, one must first remove the + * range from the superior map, and then destroy the + * submap (if desired). [Better yet, don't try it.] + */ +kern_return_t vm_map_submap( + vm_map_t map, + vm_offset_t start, + vm_offset_t end, + vm_map_t submap) +{ + vm_map_entry_t entry; + kern_return_t result = KERN_INVALID_ARGUMENT; + vm_object_t object; + + vm_map_lock(map); + + VM_MAP_RANGE_CHECK(map, start, end); + + if (vm_map_lookup_entry(map, start, &entry)) { + vm_map_clip_start(map, entry, start); + } + else + entry = entry->vme_next; + + vm_map_clip_end(map, entry, end); + + if ((entry->vme_start == start) && (entry->vme_end == end) && + (!entry->is_sub_map) && + ((object = entry->object.vm_object) == vm_submap_object) && + (object->resident_page_count == 0) && + (object->copy == VM_OBJECT_NULL) && + (object->shadow == VM_OBJECT_NULL) && + (!object->pager_created)) { + entry->object.vm_object = VM_OBJECT_NULL; + vm_object_deallocate(object); + entry->is_sub_map = TRUE; + vm_map_reference(entry->object.sub_map = submap); + result = KERN_SUCCESS; + } + vm_map_unlock(map); + + return(result); +} + +static void +vm_map_entry_inc_wired(vm_map_t map, vm_map_entry_t entry) +{ + /* + * This member is a counter to indicate whether an entry + * should be faulted in (first time it is wired, wired_count + * goes from 0 to 1) or not (other times, wired_count goes + * from 1 to 2 or remains 2). 
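 *
 * Editorial summary, not part of the original source, with
 * len = vme_end - vme_start:
 *
 *        0 -> 1   first wiring: size_wired += len, pages will be faulted in
 *        1 -> 2   nested wiring: no additional fault is needed
 *        2 -> 2   the counter saturates and nothing changes
 *
 * vm_map_entry_reset_wired() below undoes the size_wired accounting and
 * clears the counter in one step.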
+ */ + if (entry->wired_count > 1) { + return; + } + + if (entry->wired_count == 0) { + map->size_wired += entry->vme_end - entry->vme_start; + } + + entry->wired_count++; +} + +static void +vm_map_entry_reset_wired(vm_map_t map, vm_map_entry_t entry) +{ + if (entry->wired_count != 0) { + map->size_wired -= entry->vme_end - entry->vme_start; + entry->wired_count = 0; + } +} + +/* + * vm_map_pageable_scan: scan entries and update wiring as appropriate + * + * This function is used by the VM system after either the wiring + * access or protection of a mapping changes. It scans part or + * all the entries of a map, and either wires, unwires, or skips + * entries depending on their state. + * + * The map must be locked. If wiring faults are performed, the lock + * is downgraded to a read lock. The caller should always consider + * the map read locked on return. + */ +static void +vm_map_pageable_scan(struct vm_map *map, + struct vm_map_entry *start, + struct vm_map_entry *end) +{ + struct vm_map_entry *entry; + boolean_t do_wire_faults; + + /* + * Pass 1. Update counters and prepare wiring faults. + */ + + do_wire_faults = FALSE; + + for (entry = start; entry != end; entry = entry->vme_next) { + + /* + * Unwiring. + * + * Note that unwiring faults can be performed while + * holding a write lock on the map. A wiring fault + * can only be done with a read lock. + */ + + if (entry->wired_access == VM_PROT_NONE) { + if (entry->wired_count != 0) { + vm_map_entry_reset_wired(map, entry); + vm_fault_unwire(map, entry); + } + + continue; + } + + /* + * Wiring. + */ + + if (entry->protection == VM_PROT_NONE) { + + /* + * Make sure entries that cannot be accessed + * because of their protection aren't wired. + */ + + if (entry->wired_count == 0) { + continue; + } + + /* + * This normally occurs after changing the protection of + * a wired region to VM_PROT_NONE. + */ + vm_map_entry_reset_wired(map, entry); + vm_fault_unwire(map, entry); + continue; + } + + /* + * We must do this in two passes: + * + * 1. Holding the write lock, we create any shadow + * or zero-fill objects that need to be created. + * Then we increment the wiring count. + * + * 2. We downgrade to a read lock, and call + * vm_fault_wire to fault in the pages for any + * newly wired area (wired_count is 1). + * + * Downgrading to a read lock for vm_fault_wire avoids + * a possible deadlock with another thread that may have + * faulted on one of the pages to be wired (it would mark + * the page busy, blocking us, then in turn block on the + * map lock that we hold). Because of problems in the + * recursive lock package, we cannot upgrade to a write + * lock in vm_map_lookup. Thus, any actions that require + * the write lock must be done beforehand. Because we + * keep the read lock on the map, the copy-on-write + * status of the entries we modify here cannot change. + */ + + if (entry->wired_count == 0) { + /* + * Perform actions of vm_map_lookup that need + * the write lock on the map: create a shadow + * object for a copy-on-write region, or an + * object for a zero-fill region. 
+ */ + if (entry->needs_copy && + ((entry->protection & VM_PROT_WRITE) != 0)) { + vm_object_shadow(&entry->object.vm_object, + &entry->offset, + (vm_size_t)(entry->vme_end + - entry->vme_start)); + entry->needs_copy = FALSE; + } + + if (entry->object.vm_object == VM_OBJECT_NULL) { + entry->object.vm_object = + vm_object_allocate( + (vm_size_t)(entry->vme_end + - entry->vme_start)); + entry->offset = (vm_offset_t)0; + } + } + + vm_map_entry_inc_wired(map, entry); + + if (entry->wired_count == 1) { + do_wire_faults = TRUE; + } + } + + /* + * Pass 2. Trigger wiring faults. + */ + + if (!do_wire_faults) { + return; + } + + /* + * HACK HACK HACK HACK + * + * If we are wiring in the kernel map or a submap of it, + * unlock the map to avoid deadlocks. We trust that the + * kernel threads are well-behaved, and therefore will + * not do anything destructive to this region of the map + * while we have it unlocked. We cannot trust user threads + * to do the same. + * + * HACK HACK HACK HACK + */ + if (vm_map_pmap(map) == kernel_pmap) { + vm_map_unlock(map); /* trust me ... */ + } else { + vm_map_lock_set_recursive(map); + vm_map_lock_write_to_read(map); + } + + for (entry = start; entry != end; entry = entry->vme_next) { + /* + * The wiring count can only be 1 if it was + * incremented by this function right before + * downgrading the lock. + */ + if (entry->wired_count == 1) { + /* + * XXX This assumes that the faults always succeed. + */ + vm_fault_wire(map, entry); + } + } + + if (vm_map_pmap(map) == kernel_pmap) { + vm_map_lock(map); + } else { + vm_map_lock_clear_recursive(map); + } +} + +/* + * vm_map_protect: + * + * Sets the protection of the specified address + * region in the target map. If "set_max" is + * specified, the maximum protection is to be set; + * otherwise, only the current protection is affected. + */ +kern_return_t vm_map_protect( + vm_map_t map, + vm_offset_t start, + vm_offset_t end, + vm_prot_t new_prot, + boolean_t set_max) +{ + vm_map_entry_t current; + vm_map_entry_t entry; + vm_map_entry_t next; + + vm_map_lock(map); + + VM_MAP_RANGE_CHECK(map, start, end); + + if (vm_map_lookup_entry(map, start, &entry)) { + vm_map_clip_start(map, entry, start); + } + else + entry = entry->vme_next; + + /* + * Make a first pass to check for protection + * violations. + */ + + current = entry; + while ((current != vm_map_to_entry(map)) && + (current->vme_start < end)) { + + if (current->is_sub_map) { + vm_map_unlock(map); + return(KERN_INVALID_ARGUMENT); + } + if ((new_prot & (VM_PROT_NOTIFY | current->max_protection)) + != new_prot) { + vm_map_unlock(map); + return(KERN_PROTECTION_FAILURE); + } + + current = current->vme_next; + } + + /* + * Go back and fix up protections. + * [Note that clipping is not necessary the second time.] + */ + + current = entry; + + while ((current != vm_map_to_entry(map)) && + (current->vme_start < end)) { + + vm_prot_t old_prot; + + vm_map_clip_end(map, current, end); + + old_prot = current->protection; + if (set_max) + current->protection = + (current->max_protection = new_prot) & + old_prot; + else + current->protection = new_prot; + + /* + * Make sure the new protection doesn't conflict + * with the desired wired access if any. + */ + + if ((current->protection != VM_PROT_NONE) && + (current->wired_access != VM_PROT_NONE || + map->wiring_required)) { + current->wired_access = current->protection; + } + + /* + * Update physical map if necessary. 
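 *
 * Editorial usage sketch for vm_map_protect() as a whole, not part of
 * the original source: making a page-aligned range read-only without
 * touching the maximum protection is a call like
 *
 *        kr = vm_map_protect(map, start, start + size,
 *                            VM_PROT_READ, FALSE);
 *
 * after which the code below also narrows the hardware mappings with
 * pmap_protect().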
+ */ + + if (current->protection != old_prot) { + pmap_protect(map->pmap, current->vme_start, + current->vme_end, + current->protection); + } + + next = current->vme_next; + vm_map_coalesce_entry(map, current); + current = next; + } + + next = current->vme_next; + if (vm_map_coalesce_entry(map, current)) + current = next; + + /* Returns with the map read-locked if successful */ + vm_map_pageable_scan(map, entry, current); + + vm_map_unlock(map); + return(KERN_SUCCESS); +} + +/* + * vm_map_inherit: + * + * Sets the inheritance of the specified address + * range in the target map. Inheritance + * affects how the map will be shared with + * child maps at the time of vm_map_fork. + */ +kern_return_t vm_map_inherit( + vm_map_t map, + vm_offset_t start, + vm_offset_t end, + vm_inherit_t new_inheritance) +{ + vm_map_entry_t entry; + vm_map_entry_t temp_entry; + vm_map_entry_t next; + + vm_map_lock(map); + + VM_MAP_RANGE_CHECK(map, start, end); + + if (vm_map_lookup_entry(map, start, &temp_entry)) { + entry = temp_entry; + vm_map_clip_start(map, entry, start); + } + else + entry = temp_entry->vme_next; + + while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) { + vm_map_clip_end(map, entry, end); + + entry->inheritance = new_inheritance; + + next = entry->vme_next; + vm_map_coalesce_entry(map, entry); + entry = next; + } + + vm_map_coalesce_entry(map, entry); + + vm_map_unlock(map); + return(KERN_SUCCESS); +} + +/* + * vm_map_pageable: + * + * Sets the pageability of the specified address + * range in the target map. Regions specified + * as not pageable require locked-down physical + * memory and physical page maps. access_type indicates + * types of accesses that must not generate page faults. + * This is checked against protection of memory being locked-down. + * access_type of VM_PROT_NONE makes memory pageable. + * + * If lock_map is TRUE, the map is locked and unlocked + * by this function. Otherwise, it is assumed the caller + * already holds the lock, in which case the function + * returns with the lock downgraded to a read lock if successful. + * + * If check_range is TRUE, this function fails if it finds + * holes or protection mismatches in the specified range. + * + * A reference must remain to the map throughout the call. + */ + +kern_return_t vm_map_pageable( + vm_map_t map, + vm_offset_t start, + vm_offset_t end, + vm_prot_t access_type, + boolean_t lock_map, + boolean_t check_range) +{ + vm_map_entry_t entry; + vm_map_entry_t start_entry; + vm_map_entry_t end_entry; + + if (lock_map) { + vm_map_lock(map); + } + + VM_MAP_RANGE_CHECK(map, start, end); + + if (!vm_map_lookup_entry(map, start, &start_entry)) { + /* + * Start address is not in map; this is fatal. + */ + if (lock_map) { + vm_map_unlock(map); + } + + return KERN_NO_SPACE; + } + + /* + * Pass 1. Clip entries, check for holes and protection mismatches + * if requested. + */ + + vm_map_clip_start(map, start_entry, start); + + for (entry = start_entry; + (entry != vm_map_to_entry(map)) && + (entry->vme_start < end); + entry = entry->vme_next) { + vm_map_clip_end(map, entry, end); + + if (check_range && + (((entry->vme_end < end) && + ((entry->vme_next == vm_map_to_entry(map)) || + (entry->vme_next->vme_start > entry->vme_end))) || + ((entry->protection & access_type) != access_type))) { + if (lock_map) { + vm_map_unlock(map); + } + + return KERN_NO_SPACE; + } + } + + end_entry = entry; + + /* + * Pass 2. Set the desired wired access. 
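 *
 * Editorial usage sketch for this routine, not part of the original
 * source: wiring a range for read/write access, letting the routine
 * take the map lock itself and check for holes, looks like
 *
 *        kr = vm_map_pageable(map, start, end,
 *                             VM_PROT_READ | VM_PROT_WRITE, TRUE, TRUE);
 *
 * and passing VM_PROT_NONE as the access type makes the same range
 * pageable again.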
+ */ + + for (entry = start_entry; entry != end_entry; entry = entry->vme_next) { + entry->wired_access = access_type; + } + + /* Returns with the map read-locked */ + vm_map_pageable_scan(map, start_entry, end_entry); + + if (lock_map) { + vm_map_unlock(map); + } + + return(KERN_SUCCESS); +} + +/* Update pageability of all the memory currently in the map. + * The map must be locked, and protection mismatch will not be checked, see + * vm_map_pageable(). + */ +static kern_return_t +vm_map_pageable_current(vm_map_t map, vm_prot_t access_type) +{ + struct rbtree_node *node; + vm_offset_t min_address, max_address; + + node = rbtree_first(&map->hdr.tree); + min_address = rbtree_entry(node, struct vm_map_entry, + tree_node)->vme_start; + + node = rbtree_last(&map->hdr.tree); + max_address = rbtree_entry(node, struct vm_map_entry, + tree_node)->vme_end; + + /* Returns with the map read-locked if successful */ + return vm_map_pageable(map, min_address, max_address,access_type, + FALSE, FALSE); +} + + +/* + * vm_map_pageable_all: + * + * Sets the pageability of an entire map. If the VM_WIRE_CURRENT + * flag is set, then all current mappings are locked down. If the + * VM_WIRE_FUTURE flag is set, then all mappings created after the + * call returns are locked down. If no flags are passed + * (i.e. VM_WIRE_NONE), all mappings become pageable again, and + * future mappings aren't automatically locked down any more. + * + * The access type of the mappings match their current protection. + * Null mappings (with protection PROT_NONE) are updated to track + * that they should be wired in case they become accessible. + */ +kern_return_t +vm_map_pageable_all(struct vm_map *map, vm_wire_t flags) +{ + boolean_t wiring_required; + kern_return_t kr; + + if ((flags & ~VM_WIRE_ALL) != 0) { + return KERN_INVALID_ARGUMENT; + } + + vm_map_lock(map); + + if (flags == VM_WIRE_NONE) { + map->wiring_required = FALSE; + + /* Returns with the map read-locked if successful */ + kr = vm_map_pageable_current(map, VM_PROT_NONE); + vm_map_unlock(map); + return kr; + } + + wiring_required = map->wiring_required; + + if (flags & VM_WIRE_FUTURE) { + map->wiring_required = TRUE; + } + + if (flags & VM_WIRE_CURRENT) { + /* Returns with the map read-locked if successful */ + kr = vm_map_pageable_current(map, VM_PROT_READ | VM_PROT_WRITE); + + if (kr != KERN_SUCCESS) { + if (flags & VM_WIRE_FUTURE) { + map->wiring_required = wiring_required; + } + + vm_map_unlock(map); + return kr; + } + } + + vm_map_unlock(map); + + return KERN_SUCCESS; +} + +/* + * vm_map_entry_delete: [ internal use only ] + * + * Deallocate the given entry from the target map. + */ +void vm_map_entry_delete( + vm_map_t map, + vm_map_entry_t entry) +{ + vm_offset_t s, e; + vm_size_t size; + vm_object_t object; + extern vm_object_t kernel_object; + + s = entry->vme_start; + e = entry->vme_end; + size = e - s; + + /*Check if projected buffer*/ + if (map != kernel_map && entry->projected_on != 0) { + /*Check if projected kernel entry is persistent; + may only manipulate directly if it is*/ + if (entry->projected_on->projected_on == 0) + entry->wired_count = 0; /*Avoid unwire fault*/ + else + return; + } + + /* + * Get the object. Null objects cannot have pmap entries. + */ + + if ((object = entry->object.vm_object) != VM_OBJECT_NULL) { + + /* + * Unwire before removing addresses from the pmap; + * otherwise, unwiring will put the entries back in + * the pmap. 
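 *
 * Editorial summary, not part of the original source, of the order
 * enforced below: vm_fault_unwire() first, then pmap_remove() or
 * vm_object_pmap_remove()/vm_object_page_remove() to drop translations
 * and, for unreferenced anonymous objects, the pages themselves, and
 * only then vm_object_deallocate(), so that no pmap entry can still
 * point at pages of a released object.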
+ */ + + if (entry->wired_count != 0) { + vm_map_entry_reset_wired(map, entry); + vm_fault_unwire(map, entry); + } + + /* + * If the object is shared, we must remove + * *all* references to this data, since we can't + * find all of the physical maps which are sharing + * it. + */ + + if (object == kernel_object) { + vm_object_lock(object); + vm_object_page_remove(object, entry->offset, + entry->offset + size); + vm_object_unlock(object); + } else if (entry->is_shared) { + vm_object_pmap_remove(object, + entry->offset, + entry->offset + size); + } else { + pmap_remove(map->pmap, s, e); + /* + * If this object has no pager and our + * reference to the object is the only + * one, we can release the deleted pages + * now. + */ + vm_object_lock(object); + if ((!object->pager_created) && + (object->ref_count == 1) && + (object->paging_in_progress == 0)) { + vm_object_page_remove(object, + entry->offset, + entry->offset + size); + } + vm_object_unlock(object); + } + } + + /* + * Deallocate the object only after removing all + * pmap entries pointing to its pages. + */ + + if (entry->is_sub_map) + vm_map_deallocate(entry->object.sub_map); + else + vm_object_deallocate(entry->object.vm_object); + + vm_map_entry_unlink(map, entry); + map->size -= size; + + vm_map_entry_dispose(map, entry); +} + +/* + * vm_map_delete: [ internal use only ] + * + * Deallocates the given address range from the target + * map. + */ + +kern_return_t vm_map_delete( + vm_map_t map, + vm_offset_t start, + vm_offset_t end) +{ + vm_map_entry_t entry; + vm_map_entry_t first_entry; + + if (map->pmap == kernel_pmap && (start < kernel_virtual_start || end > kernel_virtual_end)) + panic("vm_map_delete(%lx-%lx) falls in physical memory area!\n", (unsigned long) start, (unsigned long) end); + + /* + * Find the start of the region, and clip it + */ + + if (!vm_map_lookup_entry(map, start, &first_entry)) + entry = first_entry->vme_next; + else { + entry = first_entry; + vm_map_clip_start(map, entry, start); + + /* + * Fix the lookup hint now, rather than each + * time though the loop. + */ + + SAVE_HINT(map, entry->vme_prev); + } + + /* + * Save the free space hint + */ + + if (map->first_free->vme_start >= start) + map->first_free = entry->vme_prev; + + /* + * Step through all entries in this region + */ + + while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) { + vm_map_entry_t next; + + vm_map_clip_end(map, entry, end); + + /* + * If the entry is in transition, we must wait + * for it to exit that state. It could be clipped + * while we leave the map unlocked. + */ + if(entry->in_transition) { + /* + * Say that we are waiting, and wait for entry. + */ + entry->needs_wakeup = TRUE; + vm_map_entry_wait(map, FALSE); + vm_map_lock(map); + + /* + * The entry could have been clipped or it + * may not exist anymore. look it up again. + */ + if(!vm_map_lookup_entry(map, start, &entry)) { + entry = entry->vme_next; + } + continue; + } + + next = entry->vme_next; + + vm_map_entry_delete(map, entry); + entry = next; + } + + if (map->wait_for_space) + thread_wakeup((event_t) map); + + return(KERN_SUCCESS); +} + +/* + * vm_map_remove: + * + * Remove the given address range from the target map. + * This is the exported form of vm_map_delete. 
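 *
 * Editorial usage sketch, not part of the original source: deallocating
 * a previously entered range, much as a vm_deallocate-style call would,
 * reduces to
 *
 *        kr = vm_map_remove(map, trunc_page(addr),
 *                           round_page(addr + size));
 *
 * The range is first clamped to the map bounds by VM_MAP_RANGE_CHECK
 * before vm_map_delete() runs.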
+ */ +kern_return_t vm_map_remove( + vm_map_t map, + vm_offset_t start, + vm_offset_t end) +{ + kern_return_t result; + + vm_map_lock(map); + VM_MAP_RANGE_CHECK(map, start, end); + result = vm_map_delete(map, start, end); + vm_map_unlock(map); + + return(result); +} + + +/* + * vm_map_copy_steal_pages: + * + * Steal all the pages from a vm_map_copy page_list by copying ones + * that have not already been stolen. + */ +static void +vm_map_copy_steal_pages(vm_map_copy_t copy) +{ + vm_page_t m, new_m; + int i; + vm_object_t object; + + for (i = 0; i < copy->cpy_npages; i++) { + + /* + * If the page is not tabled, then it's already stolen. + */ + m = copy->cpy_page_list[i]; + if (!m->tabled) + continue; + + /* + * Page was not stolen, get a new + * one and do the copy now. + */ + while ((new_m = vm_page_grab(VM_PAGE_HIGHMEM)) == VM_PAGE_NULL) { + VM_PAGE_WAIT((void(*)()) 0); + } + + vm_page_copy(m, new_m); + + object = m->object; + vm_object_lock(object); + vm_page_lock_queues(); + if (!m->active && !m->inactive) + vm_page_activate(m); + vm_page_unlock_queues(); + PAGE_WAKEUP_DONE(m); + vm_object_paging_end(object); + vm_object_unlock(object); + + copy->cpy_page_list[i] = new_m; + } +} + +/* + * vm_map_copy_page_discard: + * + * Get rid of the pages in a page_list copy. If the pages are + * stolen, they are freed. If the pages are not stolen, they + * are unbusied, and associated state is cleaned up. + */ +void vm_map_copy_page_discard(vm_map_copy_t copy) +{ + while (copy->cpy_npages > 0) { + vm_page_t m; + + if((m = copy->cpy_page_list[--(copy->cpy_npages)]) != + VM_PAGE_NULL) { + + /* + * If it's not in the table, then it's + * a stolen page that goes back + * to the free list. Else it belongs + * to some object, and we hold a + * paging reference on that object. + */ + if (!m->tabled) { + VM_PAGE_FREE(m); + } + else { + vm_object_t object; + + object = m->object; + + vm_object_lock(object); + vm_page_lock_queues(); + if (!m->active && !m->inactive) + vm_page_activate(m); + vm_page_unlock_queues(); + + PAGE_WAKEUP_DONE(m); + vm_object_paging_end(object); + vm_object_unlock(object); + } + } + } +} + +/* + * Routine: vm_map_copy_discard + * + * Description: + * Dispose of a map copy object (returned by + * vm_map_copyin). + */ +void +vm_map_copy_discard(vm_map_copy_t copy) +{ +free_next_copy: + if (copy == VM_MAP_COPY_NULL) + return; + + switch (copy->type) { + case VM_MAP_COPY_ENTRY_LIST: + while (vm_map_copy_first_entry(copy) != + vm_map_copy_to_entry(copy)) { + vm_map_entry_t entry = vm_map_copy_first_entry(copy); + + vm_map_copy_entry_unlink(copy, entry); + vm_object_deallocate(entry->object.vm_object); + vm_map_copy_entry_dispose(copy, entry); + } + break; + case VM_MAP_COPY_OBJECT: + vm_object_deallocate(copy->cpy_object); + break; + case VM_MAP_COPY_PAGE_LIST: + + /* + * To clean this up, we have to unbusy all the pages + * and release the paging references in their objects. + */ + if (copy->cpy_npages > 0) + vm_map_copy_page_discard(copy); + + /* + * If there's a continuation, abort it. The + * abort routine releases any storage. + */ + if (vm_map_copy_has_cont(copy)) { + + /* + * Special case: recognize + * vm_map_copy_discard_cont and optimize + * here to avoid tail recursion. 
+ */ + if (copy->cpy_cont == vm_map_copy_discard_cont) { + vm_map_copy_t new_copy; + + new_copy = (vm_map_copy_t) copy->cpy_cont_args; + kmem_cache_free(&vm_map_copy_cache, (vm_offset_t) copy); + copy = new_copy; + goto free_next_copy; + } + else { + vm_map_copy_abort_cont(copy); + } + } + + break; + } + kmem_cache_free(&vm_map_copy_cache, (vm_offset_t) copy); +} + +/* + * Routine: vm_map_copy_copy + * + * Description: + * Move the information in a map copy object to + * a new map copy object, leaving the old one + * empty. + * + * This is used by kernel routines that need + * to look at out-of-line data (in copyin form) + * before deciding whether to return SUCCESS. + * If the routine returns FAILURE, the original + * copy object will be deallocated; therefore, + * these routines must make a copy of the copy + * object and leave the original empty so that + * deallocation will not fail. + */ +vm_map_copy_t +vm_map_copy_copy(vm_map_copy_t copy) +{ + vm_map_copy_t new_copy; + + if (copy == VM_MAP_COPY_NULL) + return VM_MAP_COPY_NULL; + + /* + * Allocate a new copy object, and copy the information + * from the old one into it. + */ + + new_copy = (vm_map_copy_t) kmem_cache_alloc(&vm_map_copy_cache); + *new_copy = *copy; + + if (copy->type == VM_MAP_COPY_ENTRY_LIST) { + /* + * The links in the entry chain must be + * changed to point to the new copy object. + */ + vm_map_copy_first_entry(copy)->vme_prev + = vm_map_copy_to_entry(new_copy); + vm_map_copy_last_entry(copy)->vme_next + = vm_map_copy_to_entry(new_copy); + } + + /* + * Change the old copy object into one that contains + * nothing to be deallocated. + */ + copy->type = VM_MAP_COPY_OBJECT; + copy->cpy_object = VM_OBJECT_NULL; + + /* + * Return the new object. + */ + return new_copy; +} + +/* + * Routine: vm_map_copy_discard_cont + * + * Description: + * A version of vm_map_copy_discard that can be called + * as a continuation from a vm_map_copy page list. + */ +kern_return_t vm_map_copy_discard_cont( +vm_map_copyin_args_t cont_args, +vm_map_copy_t *copy_result) /* OUT */ +{ + vm_map_copy_discard((vm_map_copy_t) cont_args); + if (copy_result != (vm_map_copy_t *)0) + *copy_result = VM_MAP_COPY_NULL; + return(KERN_SUCCESS); +} + +/* + * Routine: vm_map_copy_overwrite + * + * Description: + * Copy the memory described by the map copy + * object (copy; returned by vm_map_copyin) onto + * the specified destination region (dst_map, dst_addr). + * The destination must be writeable. + * + * Unlike vm_map_copyout, this routine actually + * writes over previously-mapped memory. If the + * previous mapping was to a permanent (user-supplied) + * memory object, it is preserved. + * + * The attributes (protection and inheritance) of the + * destination region are preserved. + * + * If successful, consumes the copy object. + * Otherwise, the caller is responsible for it. + * + * Implementation notes: + * To overwrite temporary virtual memory, it is + * sufficient to remove the previous mapping and insert + * the new copy. This replacement is done either on + * the whole region (if no permanent virtual memory + * objects are embedded in the destination region) or + * in individual map entries. + * + * To overwrite permanent virtual memory, it is + * necessary to copy each page, as the external + * memory management interface currently does not + * provide any optimizations. 
+ * + * Once a page of permanent memory has been overwritten, + * it is impossible to interrupt this function; otherwise, + * the call would be neither atomic nor location-independent. + * The kernel-state portion of a user thread must be + * interruptible. + * + * It may be expensive to forward all requests that might + * overwrite permanent memory (vm_write, vm_copy) to + * uninterruptible kernel threads. This routine may be + * called by interruptible threads; however, success is + * not guaranteed -- if the request cannot be performed + * atomically and interruptibly, an error indication is + * returned. + */ +kern_return_t vm_map_copy_overwrite( + vm_map_t dst_map, + vm_offset_t dst_addr, + vm_map_copy_t copy, + boolean_t interruptible) +{ + vm_size_t size; + vm_offset_t start; + vm_map_entry_t tmp_entry; + vm_map_entry_t entry; + + boolean_t contains_permanent_objects = FALSE; + + interruptible = FALSE; /* XXX */ + + /* + * Check for null copy object. + */ + + if (copy == VM_MAP_COPY_NULL) + return(KERN_SUCCESS); + + /* + * Only works for entry lists at the moment. Will + * support page lists LATER. + */ + + assert(copy->type == VM_MAP_COPY_ENTRY_LIST); + + /* + * Currently this routine only handles page-aligned + * regions. Eventually, it should handle misalignments + * by actually copying pages. + */ + + if (!page_aligned(copy->offset) || + !page_aligned(copy->size) || + !page_aligned(dst_addr)) + return(KERN_INVALID_ARGUMENT); + + size = copy->size; + + if (size == 0) { + vm_map_copy_discard(copy); + return(KERN_SUCCESS); + } + + /* + * Verify that the destination is all writeable + * initially. + */ +start_pass_1: + vm_map_lock(dst_map); + if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) { + vm_map_unlock(dst_map); + return(KERN_INVALID_ADDRESS); + } + vm_map_clip_start(dst_map, tmp_entry, dst_addr); + for (entry = tmp_entry;;) { + vm_size_t sub_size = (entry->vme_end - entry->vme_start); + vm_map_entry_t next = entry->vme_next; + + if ( ! (entry->protection & VM_PROT_WRITE)) { + vm_map_unlock(dst_map); + return(KERN_PROTECTION_FAILURE); + } + + /* + * If the entry is in transition, we must wait + * for it to exit that state. Anything could happen + * when we unlock the map, so start over. + */ + if (entry->in_transition) { + + /* + * Say that we are waiting, and wait for entry. + */ + entry->needs_wakeup = TRUE; + vm_map_entry_wait(dst_map, FALSE); + + goto start_pass_1; + } + + if (size <= sub_size) + break; + + if ((next == vm_map_to_entry(dst_map)) || + (next->vme_start != entry->vme_end)) { + vm_map_unlock(dst_map); + return(KERN_INVALID_ADDRESS); + } + + + /* + * Check for permanent objects in the destination. + */ + + if ((entry->object.vm_object != VM_OBJECT_NULL) && + !entry->object.vm_object->temporary) + contains_permanent_objects = TRUE; + + size -= sub_size; + entry = next; + } + + /* + * If there are permanent objects in the destination, then + * the copy cannot be interrupted. + */ + + if (interruptible && contains_permanent_objects) { + vm_map_unlock(dst_map); + return(KERN_FAILURE); /* XXX */ + } + + /* + * XXXO If there are no permanent objects in the destination, + * XXXO and the destination map entry is not shared, + * XXXO then the map entries can be deleted and replaced + * XXXO with those from the copy. The following code is the + * XXXO basic idea of what to do, but there are lots of annoying + * XXXO little details about getting protection and inheritance + * XXXO right. 
Should add protection, inheritance, and sharing checks + * XXXO to the above pass and make sure that no wiring is involved. + */ +/* + * if (!contains_permanent_objects) { + * + * * + * * Run over copy and adjust entries. Steal code + * * from vm_map_copyout() to do this. + * * + * + * tmp_entry = tmp_entry->vme_prev; + * vm_map_delete(dst_map, dst_addr, dst_addr + copy->size); + * vm_map_copy_insert(dst_map, tmp_entry, copy); + * + * vm_map_unlock(dst_map); + * vm_map_copy_discard(copy); + * } + */ + /* + * + * Make a second pass, overwriting the data + * At the beginning of each loop iteration, + * the next entry to be overwritten is "tmp_entry" + * (initially, the value returned from the lookup above), + * and the starting address expected in that entry + * is "start". + */ + + start = dst_addr; + + while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) { + vm_map_entry_t copy_entry = vm_map_copy_first_entry(copy); + vm_size_t copy_size = (copy_entry->vme_end - copy_entry->vme_start); + vm_object_t object; + + entry = tmp_entry; + size = (entry->vme_end - entry->vme_start); + /* + * Make sure that no holes popped up in the + * address map, and that the protection is + * still valid, in case the map was unlocked + * earlier. + */ + + if (entry->vme_start != start) { + vm_map_unlock(dst_map); + return(KERN_INVALID_ADDRESS); + } + assert(entry != vm_map_to_entry(dst_map)); + + /* + * Check protection again + */ + + if ( ! (entry->protection & VM_PROT_WRITE)) { + vm_map_unlock(dst_map); + return(KERN_PROTECTION_FAILURE); + } + + /* + * Adjust to source size first + */ + + if (copy_size < size) { + vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size); + size = copy_size; + } + + /* + * Adjust to destination size + */ + + if (size < copy_size) { + vm_map_copy_clip_end(copy, copy_entry, + copy_entry->vme_start + size); + copy_size = size; + } + + assert((entry->vme_end - entry->vme_start) == size); + assert((tmp_entry->vme_end - tmp_entry->vme_start) == size); + assert((copy_entry->vme_end - copy_entry->vme_start) == size); + + /* + * If the destination contains temporary unshared memory, + * we can perform the copy by throwing it away and + * installing the source data. + */ + + object = entry->object.vm_object; + if (!entry->is_shared && + ((object == VM_OBJECT_NULL) || object->temporary)) { + vm_object_t old_object = entry->object.vm_object; + vm_offset_t old_offset = entry->offset; + + entry->object = copy_entry->object; + entry->offset = copy_entry->offset; + entry->needs_copy = copy_entry->needs_copy; + vm_map_entry_reset_wired(dst_map, entry); + + vm_map_copy_entry_unlink(copy, copy_entry); + vm_map_copy_entry_dispose(copy, copy_entry); + + vm_object_pmap_protect( + old_object, + old_offset, + size, + dst_map->pmap, + tmp_entry->vme_start, + VM_PROT_NONE); + + vm_object_deallocate(old_object); + + /* + * Set up for the next iteration. The map + * has not been unlocked, so the next + * address should be at the end of this + * entry, and the next map entry should be + * the one following it. + */ + + start = tmp_entry->vme_end; + tmp_entry = tmp_entry->vme_next; + } else { + vm_map_version_t version; + vm_object_t dst_object = entry->object.vm_object; + vm_offset_t dst_offset = entry->offset; + kern_return_t r; + + /* + * Take an object reference, and record + * the map version information so that the + * map can be safely unlocked. 
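 *
 * Editorial note, not part of the original source: the timestamp saved
 * here enables the optimistic check performed after the copy, roughly
 *
 *        version.main_timestamp = dst_map->timestamp;
 *        vm_map_unlock(dst_map);
 *        ...                     copy without holding the map lock
 *        vm_map_lock(dst_map);
 *        if ((version.main_timestamp + 1) == dst_map->timestamp)
 *                ...             only our own unlock/relock has bumped
 *                                the timestamp, so the saved tmp_entry
 *                                is still valid
 *
 * otherwise the destination entry is looked up again from scratch.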
+ */ + + vm_object_reference(dst_object); + + version.main_timestamp = dst_map->timestamp; + + vm_map_unlock(dst_map); + + /* + * Copy as much as possible in one pass + */ + + copy_size = size; + r = vm_fault_copy( + copy_entry->object.vm_object, + copy_entry->offset, + ©_size, + dst_object, + dst_offset, + dst_map, + &version, + FALSE /* XXX interruptible */ ); + + /* + * Release the object reference + */ + + vm_object_deallocate(dst_object); + + /* + * If a hard error occurred, return it now + */ + + if (r != KERN_SUCCESS) + return(r); + + if (copy_size != 0) { + /* + * Dispose of the copied region + */ + + vm_map_copy_clip_end(copy, copy_entry, + copy_entry->vme_start + copy_size); + vm_map_copy_entry_unlink(copy, copy_entry); + vm_object_deallocate(copy_entry->object.vm_object); + vm_map_copy_entry_dispose(copy, copy_entry); + } + + /* + * Pick up in the destination map where we left off. + * + * Use the version information to avoid a lookup + * in the normal case. + */ + + start += copy_size; + vm_map_lock(dst_map); + if ((version.main_timestamp + 1) == dst_map->timestamp) { + /* We can safely use saved tmp_entry value */ + + vm_map_clip_end(dst_map, tmp_entry, start); + tmp_entry = tmp_entry->vme_next; + } else { + /* Must do lookup of tmp_entry */ + + if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) { + vm_map_unlock(dst_map); + return(KERN_INVALID_ADDRESS); + } + vm_map_clip_start(dst_map, tmp_entry, start); + } + } + + } + vm_map_unlock(dst_map); + + /* + * Throw away the vm_map_copy object + */ + vm_map_copy_discard(copy); + + return(KERN_SUCCESS); +} + +/* + * Routine: vm_map_copy_insert + * + * Description: + * Link a copy chain ("copy") into a map at the + * specified location (after "where"). + * Side effects: + * The copy chain is destroyed. + */ +static void +vm_map_copy_insert(struct vm_map *map, struct vm_map_entry *where, + struct vm_map_copy *copy) +{ + struct vm_map_entry *entry; + + assert(copy->type == VM_MAP_COPY_ENTRY_LIST); + + for (;;) { + entry = vm_map_copy_first_entry(copy); + + if (entry == vm_map_copy_to_entry(copy)) { + break; + } + + /* + * TODO Turn copy maps into their own type so they don't + * use any of the tree operations. + */ + vm_map_copy_entry_unlink(copy, entry); + vm_map_entry_link(map, where, entry); + where = entry; + } + + kmem_cache_free(&vm_map_copy_cache, (vm_offset_t)copy); +} + +/* + * Routine: vm_map_copyout + * + * Description: + * Copy out a copy chain ("copy") into newly-allocated + * space in the destination map. + * + * If successful, consumes the copy object. + * Otherwise, the caller is responsible for it. + */ +kern_return_t vm_map_copyout( + vm_map_t dst_map, + vm_offset_t *dst_addr, /* OUT */ + vm_map_copy_t copy) +{ + vm_size_t size; + vm_size_t adjustment; + vm_offset_t start; + vm_offset_t vm_copy_start; + vm_map_entry_t last; + vm_map_entry_t entry; + kern_return_t kr; + + /* + * Check for null copy object. + */ + + if (copy == VM_MAP_COPY_NULL) { + *dst_addr = 0; + return(KERN_SUCCESS); + } + + /* + * Check for special copy object, created + * by vm_map_copyin_object. 
+ */ + + if (copy->type == VM_MAP_COPY_OBJECT) { + vm_object_t object = copy->cpy_object; + vm_size_t offset = copy->offset; + vm_size_t tmp_size = copy->size; + + *dst_addr = 0; + kr = vm_map_enter(dst_map, dst_addr, tmp_size, + (vm_offset_t) 0, TRUE, + object, offset, FALSE, + VM_PROT_DEFAULT, VM_PROT_ALL, + VM_INHERIT_DEFAULT); + if (kr != KERN_SUCCESS) + return(kr); + kmem_cache_free(&vm_map_copy_cache, (vm_offset_t) copy); + return(KERN_SUCCESS); + } + + if (copy->type == VM_MAP_COPY_PAGE_LIST) + return(vm_map_copyout_page_list(dst_map, dst_addr, copy)); + + /* + * Find space for the data + */ + + vm_copy_start = trunc_page(copy->offset); + size = round_page(copy->offset + copy->size) - vm_copy_start; + last = vm_map_find_entry_anywhere(dst_map, size, 0, FALSE, &start); + + if (last == NULL) { + vm_map_unlock(dst_map); + return KERN_NO_SPACE; + } + + /* + * Adjust the addresses in the copy chain, and + * reset the region attributes. + */ + + adjustment = start - vm_copy_start; + for (entry = vm_map_copy_first_entry(copy); + entry != vm_map_copy_to_entry(copy); + entry = entry->vme_next) { + entry->vme_start += adjustment; + entry->vme_end += adjustment; + + /* + * XXX There is no need to update the gap tree here. + * See vm_map_copy_insert. + */ + + entry->inheritance = VM_INHERIT_DEFAULT; + entry->protection = VM_PROT_DEFAULT; + entry->max_protection = VM_PROT_ALL; + entry->projected_on = 0; + + /* + * If the entry is now wired, + * map the pages into the destination map. + */ + if (entry->wired_count != 0) { + vm_offset_t va; + vm_offset_t offset; + vm_object_t object; + + object = entry->object.vm_object; + offset = entry->offset; + va = entry->vme_start; + + pmap_pageable(dst_map->pmap, + entry->vme_start, + entry->vme_end, + TRUE); + + while (va < entry->vme_end) { + vm_page_t m; + + /* + * Look up the page in the object. + * Assert that the page will be found in the + * top object: + * either + * the object was newly created by + * vm_object_copy_slowly, and has + * copies of all of the pages from + * the source object + * or + * the object was moved from the old + * map entry; because the old map + * entry was wired, all of the pages + * were in the top-level object. 
+ * (XXX not true if we wire pages for + * reading) + */ + vm_object_lock(object); + vm_object_paging_begin(object); + + m = vm_page_lookup(object, offset); + if (m == VM_PAGE_NULL || m->wire_count == 0 || + m->absent) + panic("vm_map_copyout: wiring %p", m); + + m->busy = TRUE; + vm_object_unlock(object); + + PMAP_ENTER(dst_map->pmap, va, m, + entry->protection, TRUE); + + vm_object_lock(object); + PAGE_WAKEUP_DONE(m); + /* the page is wired, so we don't have to activate */ + vm_object_paging_end(object); + vm_object_unlock(object); + + offset += PAGE_SIZE; + va += PAGE_SIZE; + } + } + + + } + + /* + * Correct the page alignment for the result + */ + + *dst_addr = start + (copy->offset - vm_copy_start); + + /* + * Update the hints and the map size + */ + + if (dst_map->first_free == last) + dst_map->first_free = vm_map_copy_last_entry(copy); + SAVE_HINT(dst_map, vm_map_copy_last_entry(copy)); + + dst_map->size += size; + + /* + * Link in the copy + */ + + vm_map_copy_insert(dst_map, last, copy); + + if (dst_map->wiring_required) { + /* Returns with the map read-locked if successful */ + kr = vm_map_pageable(dst_map, start, start + size, + VM_PROT_READ | VM_PROT_WRITE, + FALSE, FALSE); + + if (kr != KERN_SUCCESS) { + vm_map_unlock(dst_map); + return kr; + } + } + + vm_map_unlock(dst_map); + + return(KERN_SUCCESS); +} + +/* + * + * vm_map_copyout_page_list: + * + * Version of vm_map_copyout() for page list vm map copies. + * + */ +kern_return_t vm_map_copyout_page_list( + vm_map_t dst_map, + vm_offset_t *dst_addr, /* OUT */ + vm_map_copy_t copy) +{ + vm_size_t size; + vm_offset_t start; + vm_offset_t end; + vm_offset_t offset; + vm_map_entry_t last; + vm_object_t object; + vm_page_t *page_list, m; + vm_map_entry_t entry; + vm_offset_t old_last_offset; + boolean_t cont_invoked, needs_wakeup = FALSE; + kern_return_t result = KERN_SUCCESS; + vm_map_copy_t orig_copy; + vm_offset_t dst_offset; + boolean_t must_wire; + + /* + * Make sure the pages are stolen, because we are + * going to put them in a new object. Assume that + * all pages are identical to first in this regard. + */ + + page_list = ©->cpy_page_list[0]; + if ((*page_list)->tabled) + vm_map_copy_steal_pages(copy); + + /* + * Find space for the data + */ + + size = round_page(copy->offset + copy->size) - + trunc_page(copy->offset); + + vm_map_lock(dst_map); + + last = vm_map_find_entry_anywhere(dst_map, size, 0, TRUE, &start); + + if (last == NULL) { + vm_map_unlock(dst_map); + return KERN_NO_SPACE; + } + + end = start + size; + + must_wire = dst_map->wiring_required; + + /* + * See whether we can avoid creating a new entry (and object) by + * extending one of our neighbors. [So far, we only attempt to + * extend from below.] + * + * The code path below here is a bit twisted. If any of the + * extension checks fails, we branch to create_object. If + * it all works, we fall out the bottom and goto insert_pages. + */ + if (last == vm_map_to_entry(dst_map) || + last->vme_end != start || + last->is_shared != FALSE || + last->is_sub_map != FALSE || + last->inheritance != VM_INHERIT_DEFAULT || + last->protection != VM_PROT_DEFAULT || + last->max_protection != VM_PROT_ALL || + (must_wire ? (last->wired_count == 0) + : (last->wired_count != 0))) { + goto create_object; + } + + /* + * If this entry needs an object, make one. 
+ */ + if (last->object.vm_object == VM_OBJECT_NULL) { + object = vm_object_allocate( + (vm_size_t)(last->vme_end - last->vme_start + size)); + last->object.vm_object = object; + last->offset = 0; + vm_object_lock(object); + } + else { + vm_offset_t prev_offset = last->offset; + vm_size_t prev_size = start - last->vme_start; + vm_size_t new_size; + + /* + * This is basically vm_object_coalesce. + */ + + object = last->object.vm_object; + vm_object_lock(object); + + /* + * Try to collapse the object first + */ + vm_object_collapse(object); + + /* + * Can't coalesce if pages not mapped to + * last may be in use anyway: + * . more than one reference + * . paged out + * . shadows another object + * . has a copy elsewhere + * . paging references (pages might be in page-list) + */ + + if ((object->ref_count > 1) || + object->pager_created || + (object->shadow != VM_OBJECT_NULL) || + (object->copy != VM_OBJECT_NULL) || + (object->paging_in_progress != 0)) { + vm_object_unlock(object); + goto create_object; + } + + /* + * Extend the object if necessary. Don't have to call + * vm_object_page_remove because the pages aren't mapped, + * and vm_page_replace will free up any old ones it encounters. + */ + new_size = prev_offset + prev_size + size; + if (new_size > object->size) + object->size = new_size; + } + + /* + * Coalesced the two objects - can extend + * the previous map entry to include the + * new range. + */ + dst_map->size += size; + last->vme_end = end; + vm_map_gap_update(&dst_map->hdr, last); + + SAVE_HINT(dst_map, last); + + goto insert_pages; + +create_object: + + /* + * Create object + */ + object = vm_object_allocate(size); + + /* + * Create entry + */ + + entry = vm_map_entry_create(dst_map); + + entry->object.vm_object = object; + entry->offset = 0; + + entry->is_shared = FALSE; + entry->is_sub_map = FALSE; + entry->needs_copy = FALSE; + entry->wired_count = 0; + + if (must_wire) { + vm_map_entry_inc_wired(dst_map, entry); + entry->wired_access = VM_PROT_DEFAULT; + } else { + entry->wired_access = VM_PROT_NONE; + } + + entry->in_transition = TRUE; + entry->needs_wakeup = FALSE; + + entry->vme_start = start; + entry->vme_end = start + size; + + entry->inheritance = VM_INHERIT_DEFAULT; + entry->protection = VM_PROT_DEFAULT; + entry->max_protection = VM_PROT_ALL; + entry->projected_on = 0; + + vm_object_lock(object); + + /* + * Update the hints and the map size + */ + if (dst_map->first_free == last) { + dst_map->first_free = entry; + } + SAVE_HINT(dst_map, entry); + dst_map->size += size; + + /* + * Link in the entry + */ + vm_map_entry_link(dst_map, last, entry); + last = entry; + + /* + * Transfer pages into new object. + * Scan page list in vm_map_copy. + */ +insert_pages: + dst_offset = copy->offset & PAGE_MASK; + cont_invoked = FALSE; + orig_copy = copy; + last->in_transition = TRUE; + old_last_offset = last->offset + + (start - last->vme_start); + + vm_page_lock_queues(); + + for (offset = 0; offset < size; offset += PAGE_SIZE) { + m = *page_list; + assert(m && !m->tabled); + + /* + * Must clear busy bit in page before inserting it. + * Ok to skip wakeup logic because nobody else + * can possibly know about this page. + * The page is dirty in its new object. 
+ */ + + assert(!m->wanted); + + m->busy = FALSE; + m->dirty = TRUE; + vm_page_replace(m, object, old_last_offset + offset); + if (must_wire) { + vm_page_wire(m); + PMAP_ENTER(dst_map->pmap, + last->vme_start + m->offset - last->offset, + m, last->protection, TRUE); + } else { + vm_page_activate(m); + } + + *page_list++ = VM_PAGE_NULL; + if (--(copy->cpy_npages) == 0 && + vm_map_copy_has_cont(copy)) { + vm_map_copy_t new_copy; + + /* + * Ok to unlock map because entry is + * marked in_transition. + */ + cont_invoked = TRUE; + vm_page_unlock_queues(); + vm_object_unlock(object); + vm_map_unlock(dst_map); + vm_map_copy_invoke_cont(copy, &new_copy, &result); + + if (result == KERN_SUCCESS) { + + /* + * If we got back a copy with real pages, + * steal them now. Either all of the + * pages in the list are tabled or none + * of them are; mixtures are not possible. + * + * Save original copy for consume on + * success logic at end of routine. + */ + if (copy != orig_copy) + vm_map_copy_discard(copy); + + if ((copy = new_copy) != VM_MAP_COPY_NULL) { + page_list = ©->cpy_page_list[0]; + if ((*page_list)->tabled) + vm_map_copy_steal_pages(copy); + } + } + else { + /* + * Continuation failed. + */ + vm_map_lock(dst_map); + goto error; + } + + vm_map_lock(dst_map); + vm_object_lock(object); + vm_page_lock_queues(); + } + } + + vm_page_unlock_queues(); + vm_object_unlock(object); + + *dst_addr = start + dst_offset; + + /* + * Clear the in transition bits. This is easy if we + * didn't have a continuation. + */ +error: + if (!cont_invoked) { + /* + * We didn't unlock the map, so nobody could + * be waiting. + */ + last->in_transition = FALSE; + assert(!last->needs_wakeup); + needs_wakeup = FALSE; + } + else { + if (!vm_map_lookup_entry(dst_map, start, &entry)) + panic("vm_map_copyout_page_list: missing entry"); + + /* + * Clear transition bit for all constituent entries that + * were in the original entry. Also check for waiters. + */ + while((entry != vm_map_to_entry(dst_map)) && + (entry->vme_start < end)) { + assert(entry->in_transition); + entry->in_transition = FALSE; + if(entry->needs_wakeup) { + entry->needs_wakeup = FALSE; + needs_wakeup = TRUE; + } + entry = entry->vme_next; + } + } + + if (result != KERN_SUCCESS) + vm_map_delete(dst_map, start, end); + + vm_map_unlock(dst_map); + + if (needs_wakeup) + vm_map_entry_wakeup(dst_map); + + /* + * Consume on success logic. + */ + if (copy != orig_copy) { + kmem_cache_free(&vm_map_copy_cache, (vm_offset_t) copy); + } + if (result == KERN_SUCCESS) { + kmem_cache_free(&vm_map_copy_cache, (vm_offset_t) orig_copy); + } + + return(result); +} + +/* + * Routine: vm_map_copyin + * + * Description: + * Copy the specified region (src_addr, len) from the + * source address space (src_map), possibly removing + * the region from the source address space (src_destroy). + * + * Returns: + * A vm_map_copy_t object (copy_result), suitable for + * insertion into another address space (using vm_map_copyout), + * copying over another address space region (using + * vm_map_copy_overwrite). If the copy is unused, it + * should be destroyed (using vm_map_copy_discard). + * + * In/out conditions: + * The source map should not be locked on entry. + */ +kern_return_t vm_map_copyin( + vm_map_t src_map, + vm_offset_t src_addr, + vm_size_t len, + boolean_t src_destroy, + vm_map_copy_t *copy_result) /* OUT */ +{ + vm_map_entry_t tmp_entry; /* Result of last map lookup -- + * in multi-level lookup, this + * entry contains the actual + * vm_object/offset. 
+ */ + + vm_offset_t src_start; /* Start of current entry -- + * where copy is taking place now + */ + vm_offset_t src_end; /* End of entire region to be + * copied */ + + vm_map_copy_t copy; /* Resulting copy */ + + /* + * Check for copies of zero bytes. + */ + + if (len == 0) { + *copy_result = VM_MAP_COPY_NULL; + return(KERN_SUCCESS); + } + + /* + * Check that the end address doesn't overflow + */ + + if ((src_addr + len) <= src_addr) { + return KERN_INVALID_ADDRESS; + } + + /* + * Compute start and end of region + */ + + src_start = trunc_page(src_addr); + src_end = round_page(src_addr + len); + + /* + * XXX VM maps shouldn't end at maximum address + */ + + if (src_end == 0) { + return KERN_INVALID_ADDRESS; + } + + /* + * Allocate a header element for the list. + * + * Use the start and end in the header to + * remember the endpoints prior to rounding. + */ + + copy = (vm_map_copy_t) kmem_cache_alloc(&vm_map_copy_cache); + vm_map_copy_first_entry(copy) = + vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy); + copy->type = VM_MAP_COPY_ENTRY_LIST; + copy->cpy_hdr.nentries = 0; + rbtree_init(©->cpy_hdr.tree); + rbtree_init(©->cpy_hdr.gap_tree); + + copy->offset = src_addr; + copy->size = len; + +#define RETURN(x) \ + MACRO_BEGIN \ + vm_map_unlock(src_map); \ + vm_map_copy_discard(copy); \ + MACRO_RETURN(x); \ + MACRO_END + + /* + * Find the beginning of the region. + */ + + vm_map_lock(src_map); + + if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) + RETURN(KERN_INVALID_ADDRESS); + vm_map_clip_start(src_map, tmp_entry, src_start); + + /* + * Go through entries until we get to the end. + */ + + while (TRUE) { + vm_map_entry_t src_entry = tmp_entry; /* Top-level entry */ + vm_size_t src_size; /* Size of source + * map entry (in both + * maps) + */ + + vm_object_t src_object; /* Object to copy */ + vm_offset_t src_offset; + + boolean_t src_needs_copy; /* Should source map + * be made read-only + * for copy-on-write? + */ + + vm_map_entry_t new_entry; /* Map entry for copy */ + boolean_t new_entry_needs_copy; /* Will new entry be COW? */ + + boolean_t was_wired; /* Was source wired? */ + vm_map_version_t version; /* Version before locks + * dropped to make copy + */ + + /* + * Verify that the region can be read. + */ + + if (! (src_entry->protection & VM_PROT_READ)) + RETURN(KERN_PROTECTION_FAILURE); + + /* + * Clip against the endpoints of the entire region. + */ + + vm_map_clip_end(src_map, src_entry, src_end); + + src_size = src_entry->vme_end - src_start; + src_object = src_entry->object.vm_object; + src_offset = src_entry->offset; + was_wired = (src_entry->wired_count != 0); + + /* + * Create a new address map entry to + * hold the result. Fill in the fields from + * the appropriate source entries. + */ + + new_entry = vm_map_copy_entry_create(copy); + vm_map_entry_copy(new_entry, src_entry); + + /* + * Attempt non-blocking copy-on-write optimizations. + */ + + if (src_destroy && + (src_object == VM_OBJECT_NULL || + (src_object->temporary && !src_object->use_shared_copy))) + { + /* + * If we are destroying the source, and the object + * is temporary, and not shared writable, + * we can move the object reference + * from the source to the copy. The copy is + * copy-on-write only if the source is. + * We make another reference to the object, because + * destroying the source entry will deallocate it. + */ + vm_object_reference(src_object); + + /* + * Copy is always unwired. vm_map_copy_entry + * set its wired count to zero. 
+ */ + + goto CopySuccessful; + } + + if (!was_wired && + vm_object_copy_temporary( + &new_entry->object.vm_object, + &new_entry->offset, + &src_needs_copy, + &new_entry_needs_copy)) { + + new_entry->needs_copy = new_entry_needs_copy; + + /* + * Handle copy-on-write obligations + */ + + if (src_needs_copy && !tmp_entry->needs_copy) { + vm_object_pmap_protect( + src_object, + src_offset, + src_size, + (src_entry->is_shared ? PMAP_NULL + : src_map->pmap), + src_entry->vme_start, + src_entry->protection & + ~VM_PROT_WRITE); + + tmp_entry->needs_copy = TRUE; + } + + /* + * The map has never been unlocked, so it's safe to + * move to the next entry rather than doing another + * lookup. + */ + + goto CopySuccessful; + } + + new_entry->needs_copy = FALSE; + + /* + * Take an object reference, so that we may + * release the map lock(s). + */ + + assert(src_object != VM_OBJECT_NULL); + vm_object_reference(src_object); + + /* + * Record the timestamp for later verification. + * Unlock the map. + */ + + version.main_timestamp = src_map->timestamp; + vm_map_unlock(src_map); + + /* + * Perform the copy + */ + + if (was_wired) { + vm_object_lock(src_object); + (void) vm_object_copy_slowly( + src_object, + src_offset, + src_size, + FALSE, + &new_entry->object.vm_object); + new_entry->offset = 0; + new_entry->needs_copy = FALSE; + } else { + kern_return_t result; + + result = vm_object_copy_strategically(src_object, + src_offset, + src_size, + &new_entry->object.vm_object, + &new_entry->offset, + &new_entry_needs_copy); + + new_entry->needs_copy = new_entry_needs_copy; + + + if (result != KERN_SUCCESS) { + vm_map_copy_entry_dispose(copy, new_entry); + + vm_map_lock(src_map); + RETURN(result); + } + + } + + /* + * Throw away the extra reference + */ + + vm_object_deallocate(src_object); + + /* + * Verify that the map has not substantially + * changed while the copy was being made. + */ + + vm_map_lock(src_map); /* Increments timestamp once! */ + + if ((version.main_timestamp + 1) == src_map->timestamp) + goto CopySuccessful; + + /* + * Simple version comparison failed. + * + * Retry the lookup and verify that the + * same object/offset are still present. + * + * [Note: a memory manager that colludes with + * the calling task can detect that we have + * cheated. While the map was unlocked, the + * mapping could have been changed and restored.] + */ + + if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) { + vm_map_copy_entry_dispose(copy, new_entry); + RETURN(KERN_INVALID_ADDRESS); + } + + src_entry = tmp_entry; + vm_map_clip_start(src_map, src_entry, src_start); + + if ((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) + goto VerificationFailed; + + if (src_entry->vme_end < new_entry->vme_end) + src_size = (new_entry->vme_end = src_entry->vme_end) - src_start; + + if ((src_entry->object.vm_object != src_object) || + (src_entry->offset != src_offset) ) { + + /* + * Verification failed. + * + * Start over with this top-level entry. + */ + + VerificationFailed: ; + + vm_object_deallocate(new_entry->object.vm_object); + vm_map_copy_entry_dispose(copy, new_entry); + tmp_entry = src_entry; + continue; + } + + /* + * Verification succeeded. + */ + + CopySuccessful: ; + + /* + * Link in the new copy entry. + */ + + vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), + new_entry); + + /* + * Determine whether the entire region + * has been copied. 
+ */ + src_start = new_entry->vme_end; + if ((src_start >= src_end) && (src_end != 0)) + break; + + /* + * Verify that there are no gaps in the region + */ + + tmp_entry = src_entry->vme_next; + if (tmp_entry->vme_start != src_start) + RETURN(KERN_INVALID_ADDRESS); + } + + /* + * If the source should be destroyed, do it now, since the + * copy was successful. + */ + if (src_destroy) + (void) vm_map_delete(src_map, trunc_page(src_addr), src_end); + + vm_map_unlock(src_map); + + *copy_result = copy; + return(KERN_SUCCESS); + +#undef RETURN +} + +/* + * vm_map_copyin_object: + * + * Create a copy object from an object. + * Our caller donates an object reference. + */ + +kern_return_t vm_map_copyin_object( + vm_object_t object, + vm_offset_t offset, /* offset of region in object */ + vm_size_t size, /* size of region in object */ + vm_map_copy_t *copy_result) /* OUT */ +{ + vm_map_copy_t copy; /* Resulting copy */ + + /* + * We drop the object into a special copy object + * that contains the object directly. These copy objects + * are distinguished by links. + */ + + copy = (vm_map_copy_t) kmem_cache_alloc(&vm_map_copy_cache); + vm_map_copy_first_entry(copy) = + vm_map_copy_last_entry(copy) = VM_MAP_ENTRY_NULL; + copy->type = VM_MAP_COPY_OBJECT; + copy->cpy_object = object; + copy->offset = offset; + copy->size = size; + + *copy_result = copy; + return(KERN_SUCCESS); +} + +/* + * vm_map_copyin_page_list_cont: + * + * Continuation routine for vm_map_copyin_page_list. + * + * If vm_map_copyin_page_list can't fit the entire vm range + * into a single page list object, it creates a continuation. + * When the target of the operation has used the pages in the + * initial page list, it invokes the continuation, which calls + * this routine. If an error happens, the continuation is aborted + * (abort arg to this routine is TRUE). To avoid deadlocks, the + * pages are discarded from the initial page list before invoking + * the continuation. + * + * NOTE: This is not the same sort of continuation used by + * the scheduler. + */ + +static kern_return_t vm_map_copyin_page_list_cont( + vm_map_copyin_args_t cont_args, + vm_map_copy_t *copy_result) /* OUT */ +{ + kern_return_t result = 0; /* '=0' to quiet gcc warnings */ + boolean_t do_abort, src_destroy, src_destroy_only; + + /* + * Check for cases that only require memory destruction. + */ + do_abort = (copy_result == (vm_map_copy_t *) 0); + src_destroy = (cont_args->destroy_len != (vm_size_t) 0); + src_destroy_only = (cont_args->src_len == (vm_size_t) 0); + + if (do_abort || src_destroy_only) { + if (src_destroy) + result = vm_map_remove(cont_args->map, + cont_args->destroy_addr, + cont_args->destroy_addr + cont_args->destroy_len); + if (!do_abort) + *copy_result = VM_MAP_COPY_NULL; + } + else { + result = vm_map_copyin_page_list(cont_args->map, + cont_args->src_addr, cont_args->src_len, src_destroy, + cont_args->steal_pages, copy_result, TRUE); + + if (src_destroy && !cont_args->steal_pages && + vm_map_copy_has_cont(*copy_result)) { + vm_map_copyin_args_t new_args; + /* + * Transfer old destroy info. + */ + new_args = (vm_map_copyin_args_t) + (*copy_result)->cpy_cont_args; + new_args->destroy_addr = cont_args->destroy_addr; + new_args->destroy_len = cont_args->destroy_len; + } + } + + vm_map_deallocate(cont_args->map); + kfree((vm_offset_t)cont_args, sizeof(vm_map_copyin_args_data_t)); + + return(result); +} + +/* + * vm_map_copyin_page_list: + * + * This is a variant of vm_map_copyin that copies in a list of pages. 
+ * If steal_pages is TRUE, the pages are only in the returned list. + * If steal_pages is FALSE, the pages are busy and still in their + * objects. A continuation may be returned if not all the pages fit: + * the recipient of this copy_result must be prepared to deal with it. + */ + +kern_return_t vm_map_copyin_page_list( + vm_map_t src_map, + vm_offset_t src_addr, + vm_size_t len, + boolean_t src_destroy, + boolean_t steal_pages, + vm_map_copy_t *copy_result, /* OUT */ + boolean_t is_cont) +{ + vm_map_entry_t src_entry; + vm_page_t m; + vm_offset_t src_start; + vm_offset_t src_end; + vm_size_t src_size; + vm_object_t src_object; + vm_offset_t src_offset; + vm_offset_t src_last_offset; + vm_map_copy_t copy; /* Resulting copy */ + kern_return_t result = KERN_SUCCESS; + boolean_t need_map_lookup; + vm_map_copyin_args_t cont_args; + + /* + * If steal_pages is FALSE, this leaves busy pages in + * the object. A continuation must be used if src_destroy + * is true in this case (!steal_pages && src_destroy). + * + * XXX Still have a more general problem of what happens + * XXX if the same page occurs twice in a list. Deadlock + * XXX can happen if vm_fault_page was called. A + * XXX possible solution is to use a continuation if vm_fault_page + * XXX is called and we cross a map entry boundary. + */ + + /* + * Check for copies of zero bytes. + */ + + if (len == 0) { + *copy_result = VM_MAP_COPY_NULL; + return(KERN_SUCCESS); + } + + /* + * Check that the end address doesn't overflow + */ + + if ((src_addr + len) <= src_addr) { + return KERN_INVALID_ADDRESS; + } + + /* + * Compute start and end of region + */ + + src_start = trunc_page(src_addr); + src_end = round_page(src_addr + len); + + /* + * XXX VM maps shouldn't end at maximum address + */ + + if (src_end == 0) { + return KERN_INVALID_ADDRESS; + } + + /* + * Allocate a header element for the page list. + * + * Record original offset and size, as caller may not + * be page-aligned. + */ + + copy = (vm_map_copy_t) kmem_cache_alloc(&vm_map_copy_cache); + copy->type = VM_MAP_COPY_PAGE_LIST; + copy->cpy_npages = 0; + copy->offset = src_addr; + copy->size = len; + copy->cpy_cont = ((kern_return_t (*)()) 0); + copy->cpy_cont_args = VM_MAP_COPYIN_ARGS_NULL; + + /* + * Find the beginning of the region. + */ + +do_map_lookup: + + vm_map_lock(src_map); + + if (!vm_map_lookup_entry(src_map, src_start, &src_entry)) { + result = KERN_INVALID_ADDRESS; + goto error; + } + need_map_lookup = FALSE; + + /* + * Go through entries until we get to the end. + */ + + while (TRUE) { + + if (! (src_entry->protection & VM_PROT_READ)) { + result = KERN_PROTECTION_FAILURE; + goto error; + } + + if (src_end > src_entry->vme_end) + src_size = src_entry->vme_end - src_start; + else + src_size = src_end - src_start; + + src_object = src_entry->object.vm_object; + src_offset = src_entry->offset + + (src_start - src_entry->vme_start); + + /* + * If src_object is NULL, allocate it now; + * we're going to fault on it shortly. + */ + if (src_object == VM_OBJECT_NULL) { + src_object = vm_object_allocate((vm_size_t) + src_entry->vme_end - + src_entry->vme_start); + src_entry->object.vm_object = src_object; + } + + /* + * Iterate over pages. Fault in ones that aren't present. 
+ */ + src_last_offset = src_offset + src_size; + for (; (src_offset < src_last_offset && !need_map_lookup); + src_offset += PAGE_SIZE, src_start += PAGE_SIZE) { + + if (copy->cpy_npages == VM_MAP_COPY_PAGE_LIST_MAX) { +make_continuation: + /* + * At this point we have the max number of + * pages busy for this thread that we're + * willing to allow. Stop here and record + * arguments for the remainder. Note: + * this means that this routine isn't atomic, + * but that's the breaks. Note that only + * the first vm_map_copy_t that comes back + * from this routine has the right offset + * and size; those from continuations are + * page rounded, and short by the amount + * already done. + * + * Reset src_end so the src_destroy + * code at the bottom doesn't do + * something stupid. + */ + + cont_args = (vm_map_copyin_args_t) + kalloc(sizeof(vm_map_copyin_args_data_t)); + cont_args->map = src_map; + vm_map_reference(src_map); + cont_args->src_addr = src_start; + cont_args->src_len = len - (src_start - src_addr); + if (src_destroy) { + cont_args->destroy_addr = cont_args->src_addr; + cont_args->destroy_len = cont_args->src_len; + } + else { + cont_args->destroy_addr = (vm_offset_t) 0; + cont_args->destroy_len = (vm_offset_t) 0; + } + cont_args->steal_pages = steal_pages; + + copy->cpy_cont_args = cont_args; + copy->cpy_cont = vm_map_copyin_page_list_cont; + + src_end = src_start; + vm_map_clip_end(src_map, src_entry, src_end); + break; + } + + /* + * Try to find the page of data. + */ + vm_object_lock(src_object); + vm_object_paging_begin(src_object); + if (((m = vm_page_lookup(src_object, src_offset)) != + VM_PAGE_NULL) && !m->busy && !m->fictitious && + !m->absent && !m->error) { + + /* + * This is the page. Mark it busy + * and keep the paging reference on + * the object whilst we do our thing. + */ + m->busy = TRUE; + + /* + * Also write-protect the page, so + * that the map`s owner cannot change + * the data. The busy bit will prevent + * faults on the page from succeeding + * until the copy is released; after + * that, the page can be re-entered + * as writable, since we didn`t alter + * the map entry. This scheme is a + * cheap copy-on-write. + * + * Don`t forget the protection and + * the page_lock value! + * + * If the source is being destroyed + * AND not shared writable, we don`t + * have to protect the page, since + * we will destroy the (only) + * writable mapping later. + */ + if (!src_destroy || + src_object->use_shared_copy) + { + pmap_page_protect(m->phys_addr, + src_entry->protection + & ~m->page_lock + & ~VM_PROT_WRITE); + } + + } + else { + vm_prot_t result_prot; + vm_page_t top_page; + kern_return_t kr; + + /* + * Have to fault the page in; must + * unlock the map to do so. While + * the map is unlocked, anything + * can happen, we must lookup the + * map entry before continuing. + */ + vm_map_unlock(src_map); + need_map_lookup = TRUE; +retry: + result_prot = VM_PROT_READ; + + kr = vm_fault_page(src_object, src_offset, + VM_PROT_READ, FALSE, FALSE, + &result_prot, &m, &top_page, + FALSE, (void (*)()) 0); + /* + * Cope with what happened. + */ + switch (kr) { + case VM_FAULT_SUCCESS: + break; + case VM_FAULT_INTERRUPTED: /* ??? 
*/ + case VM_FAULT_RETRY: + vm_object_lock(src_object); + vm_object_paging_begin(src_object); + goto retry; + case VM_FAULT_MEMORY_SHORTAGE: + VM_PAGE_WAIT((void (*)()) 0); + vm_object_lock(src_object); + vm_object_paging_begin(src_object); + goto retry; + case VM_FAULT_FICTITIOUS_SHORTAGE: + vm_page_more_fictitious(); + vm_object_lock(src_object); + vm_object_paging_begin(src_object); + goto retry; + case VM_FAULT_MEMORY_ERROR: + /* + * Something broke. If this + * is a continuation, return + * a partial result if possible, + * else fail the whole thing. + * In the continuation case, the + * next continuation call will + * get this error if it persists. + */ + vm_map_lock(src_map); + if (is_cont && + copy->cpy_npages != 0) + goto make_continuation; + + result = KERN_MEMORY_ERROR; + goto error; + } + + if (top_page != VM_PAGE_NULL) { + vm_object_lock(src_object); + VM_PAGE_FREE(top_page); + vm_object_paging_end(src_object); + vm_object_unlock(src_object); + } + + /* + * We do not need to write-protect + * the page, since it cannot have + * been in the pmap (and we did not + * enter it above). The busy bit + * will protect the page from being + * entered as writable until it is + * unlocked. + */ + + } + + /* + * The page is busy, its object is locked, and + * we have a paging reference on it. Either + * the map is locked, or need_map_lookup is + * TRUE. + * + * Put the page in the page list. + */ + copy->cpy_page_list[copy->cpy_npages++] = m; + vm_object_unlock(m->object); + } + + /* + * DETERMINE whether the entire region + * has been copied. + */ + if (src_start >= src_end && src_end != 0) { + if (need_map_lookup) + vm_map_lock(src_map); + break; + } + + /* + * If need_map_lookup is TRUE, have to start over with + * another map lookup. Note that we dropped the map + * lock (to call vm_fault_page) above only in this case. + */ + if (need_map_lookup) + goto do_map_lookup; + + /* + * Verify that there are no gaps in the region + */ + + src_start = src_entry->vme_end; + src_entry = src_entry->vme_next; + if (src_entry->vme_start != src_start) { + result = KERN_INVALID_ADDRESS; + goto error; + } + } + + /* + * If steal_pages is true, make sure all + * pages in the copy are not in any object + * We try to remove them from the original + * object, but we may have to copy them. + * + * At this point every page in the list is busy + * and holds a paging reference to its object. + * When we're done stealing, every page is busy, + * and in no object (m->tabled == FALSE). + */ + src_start = trunc_page(src_addr); + if (steal_pages) { + int i; + vm_offset_t unwire_end; + + unwire_end = src_start; + for (i = 0; i < copy->cpy_npages; i++) { + + /* + * Remove the page from its object if it + * can be stolen. It can be stolen if: + * + * (1) The source is being destroyed, + * the object is temporary, and + * not shared. + * (2) The page is not precious. + * + * The not shared check consists of two + * parts: (a) there are no objects that + * shadow this object. (b) it is not the + * object in any shared map entries (i.e., + * use_shared_copy is not set). + * + * The first check (a) means that we can't + * steal pages from objects that are not + * at the top of their shadow chains. This + * should not be a frequent occurrence. + * + * Stealing wired pages requires telling the + * pmap module to let go of them. + * + * NOTE: stealing clean pages from objects + * whose mappings survive requires a call to + * the pmap module. Maybe later. 
+ */ + m = copy->cpy_page_list[i]; + src_object = m->object; + vm_object_lock(src_object); + + if (src_destroy && + src_object->temporary && + (!src_object->shadowed) && + (!src_object->use_shared_copy) && + !m->precious) { + vm_offset_t page_vaddr; + + page_vaddr = src_start + (i * PAGE_SIZE); + if (m->wire_count > 0) { + + assert(m->wire_count == 1); + /* + * In order to steal a wired + * page, we have to unwire it + * first. We do this inline + * here because we have the page. + * + * Step 1: Unwire the map entry. + * Also tell the pmap module + * that this piece of the + * pmap is pageable. + */ + vm_object_unlock(src_object); + if (page_vaddr >= unwire_end) { + if (!vm_map_lookup_entry(src_map, + page_vaddr, &src_entry)) + panic("vm_map_copyin_page_list: missing wired map entry"); + + vm_map_clip_start(src_map, src_entry, + page_vaddr); + vm_map_clip_end(src_map, src_entry, + src_start + src_size); + + assert(src_entry->wired_count > 0); + vm_map_entry_reset_wired(src_map, src_entry); + unwire_end = src_entry->vme_end; + pmap_pageable(vm_map_pmap(src_map), + page_vaddr, unwire_end, TRUE); + } + + /* + * Step 2: Unwire the page. + * pmap_remove handles this for us. + */ + vm_object_lock(src_object); + } + + /* + * Don't need to remove the mapping; + * vm_map_delete will handle it. + * + * Steal the page. Setting the wire count + * to zero is vm_page_unwire without + * activating the page. + */ + vm_page_lock_queues(); + vm_page_remove(m); + if (m->wire_count > 0) { + m->wire_count = 0; + vm_page_wire_count--; + } else { + VM_PAGE_QUEUES_REMOVE(m); + } + vm_page_unlock_queues(); + } + else { + /* + * Have to copy this page. Have to + * unlock the map while copying, + * hence no further page stealing. + * Hence just copy all the pages. + * Unlock the map while copying; + * This means no further page stealing. + */ + vm_object_unlock(src_object); + vm_map_unlock(src_map); + + vm_map_copy_steal_pages(copy); + + vm_map_lock(src_map); + break; + } + + vm_object_paging_end(src_object); + vm_object_unlock(src_object); + } + + /* + * If the source should be destroyed, do it now, since the + * copy was successful. + */ + + if (src_destroy) { + (void) vm_map_delete(src_map, src_start, src_end); + } + } + else { + /* + * !steal_pages leaves busy pages in the map. + * This will cause src_destroy to hang. Use + * a continuation to prevent this. + */ + if (src_destroy && !vm_map_copy_has_cont(copy)) { + cont_args = (vm_map_copyin_args_t) + kalloc(sizeof(vm_map_copyin_args_data_t)); + vm_map_reference(src_map); + cont_args->map = src_map; + cont_args->src_addr = (vm_offset_t) 0; + cont_args->src_len = (vm_size_t) 0; + cont_args->destroy_addr = src_start; + cont_args->destroy_len = src_end - src_start; + cont_args->steal_pages = FALSE; + + copy->cpy_cont_args = cont_args; + copy->cpy_cont = vm_map_copyin_page_list_cont; + } + + } + + vm_map_unlock(src_map); + + *copy_result = copy; + return(result); + +error: + vm_map_unlock(src_map); + vm_map_copy_discard(copy); + return(result); +} + +/* + * vm_map_fork: + * + * Create and return a new map based on the old + * map, according to the inheritance values on the + * regions in that map. + * + * The source map must not be locked. 
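+ *
+ *	Illustrative caller (a sketch of the task-creation path, not a
+ *	verbatim quote of the task code; "parent_task" and the chosen
+ *	error code are placeholders): the child address space is derived
+ *	from the parent's map, and the result must be checked, since
+ *	VM_MAP_NULL is returned when the new pmap or map cannot be
+ *	allocated.
+ *
+ *		vm_map_t child_map;
+ *
+ *		child_map = vm_map_fork(parent_task->map);
+ *		if (child_map == VM_MAP_NULL)
+ *			return KERN_RESOURCE_SHORTAGE;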
+ */ +vm_map_t vm_map_fork(vm_map_t old_map) +{ + vm_map_t new_map; + vm_map_entry_t old_entry; + vm_map_entry_t new_entry; + pmap_t new_pmap = pmap_create((vm_size_t) 0); + vm_size_t new_size = 0; + vm_size_t entry_size; + vm_object_t object; + + if (new_pmap == PMAP_NULL) + return VM_MAP_NULL; + + vm_map_lock(old_map); + + new_map = vm_map_create(new_pmap, + old_map->min_offset, + old_map->max_offset); + if (new_map == VM_MAP_NULL) { + pmap_destroy(new_pmap); + return VM_MAP_NULL; + } + + for ( + old_entry = vm_map_first_entry(old_map); + old_entry != vm_map_to_entry(old_map); + ) { + if (old_entry->is_sub_map) + panic("vm_map_fork: encountered a submap"); + + entry_size = (old_entry->vme_end - old_entry->vme_start); + + switch (old_entry->inheritance) { + case VM_INHERIT_NONE: + break; + + case VM_INHERIT_SHARE: + /* + * New sharing code. New map entry + * references original object. Temporary + * objects use asynchronous copy algorithm for + * future copies. First make sure we have + * the right object. If we need a shadow, + * or someone else already has one, then + * make a new shadow and share it. + */ + + object = old_entry->object.vm_object; + if (object == VM_OBJECT_NULL) { + object = vm_object_allocate( + (vm_size_t)(old_entry->vme_end - + old_entry->vme_start)); + old_entry->offset = 0; + old_entry->object.vm_object = object; + assert(!old_entry->needs_copy); + } + else if (old_entry->needs_copy || object->shadowed || + (object->temporary && !old_entry->is_shared && + object->size > (vm_size_t)(old_entry->vme_end - + old_entry->vme_start))) { + + assert(object->temporary); + assert(!(object->shadowed && old_entry->is_shared)); + vm_object_shadow( + &old_entry->object.vm_object, + &old_entry->offset, + (vm_size_t) (old_entry->vme_end - + old_entry->vme_start)); + + /* + * If we're making a shadow for other than + * copy on write reasons, then we have + * to remove write permission. + */ + + if (!old_entry->needs_copy && + (old_entry->protection & VM_PROT_WRITE)) { + pmap_protect(vm_map_pmap(old_map), + old_entry->vme_start, + old_entry->vme_end, + old_entry->protection & + ~VM_PROT_WRITE); + } + old_entry->needs_copy = FALSE; + object = old_entry->object.vm_object; + } + + /* + * Set use_shared_copy to indicate that + * object must use shared (delayed) copy-on + * write. This is ignored for permanent objects. + * Bump the reference count for the new entry + */ + + vm_object_lock(object); + object->use_shared_copy = TRUE; + object->ref_count++; + vm_object_unlock(object); + + new_entry = vm_map_entry_create(new_map); + + if (old_entry->projected_on != 0) { + /* + * If entry is projected buffer, clone the + * entry exactly. + */ + + vm_map_entry_copy_full(new_entry, old_entry); + + } else { + /* + * Clone the entry, using object ref from above. + * Mark both entries as shared. + */ + + vm_map_entry_copy(new_entry, old_entry); + old_entry->is_shared = TRUE; + new_entry->is_shared = TRUE; + } + + /* + * Insert the entry into the new map -- we + * know we're inserting at the end of the new + * map. 
+ */ + + vm_map_entry_link( + new_map, + vm_map_last_entry(new_map), + new_entry); + + /* + * Update the physical map + */ + + pmap_copy(new_map->pmap, old_map->pmap, + new_entry->vme_start, + entry_size, + old_entry->vme_start); + + new_size += entry_size; + break; + + case VM_INHERIT_COPY: + if (old_entry->wired_count == 0) { + boolean_t src_needs_copy; + boolean_t new_entry_needs_copy; + + new_entry = vm_map_entry_create(new_map); + vm_map_entry_copy(new_entry, old_entry); + + if (vm_object_copy_temporary( + &new_entry->object.vm_object, + &new_entry->offset, + &src_needs_copy, + &new_entry_needs_copy)) { + + /* + * Handle copy-on-write obligations + */ + + if (src_needs_copy && !old_entry->needs_copy) { + vm_object_pmap_protect( + old_entry->object.vm_object, + old_entry->offset, + entry_size, + (old_entry->is_shared ? + PMAP_NULL : + old_map->pmap), + old_entry->vme_start, + old_entry->protection & + ~VM_PROT_WRITE); + + old_entry->needs_copy = TRUE; + } + + new_entry->needs_copy = new_entry_needs_copy; + + /* + * Insert the entry at the end + * of the map. + */ + + vm_map_entry_link(new_map, + vm_map_last_entry(new_map), + new_entry); + + + new_size += entry_size; + break; + } + + vm_map_entry_dispose(new_map, new_entry); + } + + /* INNER BLOCK (copy cannot be optimized) */ { + + vm_offset_t start = old_entry->vme_start; + vm_map_copy_t copy; + vm_map_entry_t last = vm_map_last_entry(new_map); + + vm_map_unlock(old_map); + if (vm_map_copyin(old_map, + start, + entry_size, + FALSE, + ©) + != KERN_SUCCESS) { + vm_map_lock(old_map); + if (!vm_map_lookup_entry(old_map, start, &last)) + last = last->vme_next; + old_entry = last; + /* + * For some error returns, want to + * skip to the next element. + */ + + continue; + } + + /* + * Insert the copy into the new map + */ + + vm_map_copy_insert(new_map, last, copy); + new_size += entry_size; + + /* + * Pick up the traversal at the end of + * the copied region. + */ + + vm_map_lock(old_map); + start += entry_size; + if (!vm_map_lookup_entry(old_map, start, &last)) + last = last->vme_next; + else + vm_map_clip_start(old_map, last, start); + old_entry = last; + + continue; + /* INNER BLOCK (copy cannot be optimized) */ } + } + old_entry = old_entry->vme_next; + } + + new_map->size = new_size; + vm_map_unlock(old_map); + + return(new_map); +} + +/* + * vm_map_lookup: + * + * Finds the VM object, offset, and + * protection for a given virtual address in the + * specified map, assuming a page fault of the + * type specified. + * + * Returns the (object, offset, protection) for + * this address, whether it is wired down, and whether + * this map has the only reference to the data in question. + * In order to later verify this lookup, a "version" + * is returned. + * + * The map should not be locked; it will not be + * locked on exit. In order to guarantee the + * existence of the returned object, it is returned + * locked. + * + * If a lookup is requested with "write protection" + * specified, the map may be changed to perform virtual + * copying operations, although the data referenced will + * remain the same. + */ +kern_return_t vm_map_lookup( + vm_map_t *var_map, /* IN/OUT */ + vm_offset_t vaddr, + vm_prot_t fault_type, + + vm_map_version_t *out_version, /* OUT */ + vm_object_t *object, /* OUT */ + vm_offset_t *offset, /* OUT */ + vm_prot_t *out_prot, /* OUT */ + boolean_t *wired) /* OUT */ +{ + vm_map_entry_t entry; + vm_map_t map = *var_map; + vm_prot_t prot; + + RetryLookup: ; + + /* + * Lookup the faulting address. 
+ */ + + vm_map_lock_read(map); + +#define RETURN(why) \ + { \ + vm_map_unlock_read(map); \ + return(why); \ + } + + /* + * If the map has an interesting hint, try it before calling + * full blown lookup routine. + */ + + simple_lock(&map->hint_lock); + entry = map->hint; + simple_unlock(&map->hint_lock); + + if ((entry == vm_map_to_entry(map)) || + (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) { + vm_map_entry_t tmp_entry; + + /* + * Entry was either not a valid hint, or the vaddr + * was not contained in the entry, so do a full lookup. + */ + if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) + RETURN(KERN_INVALID_ADDRESS); + + entry = tmp_entry; + } + + /* + * Handle submaps. + */ + + if (entry->is_sub_map) { + vm_map_t old_map = map; + + *var_map = map = entry->object.sub_map; + vm_map_unlock_read(old_map); + goto RetryLookup; + } + + /* + * Check whether this task is allowed to have + * this page. + */ + + prot = entry->protection; + + if ((fault_type & (prot)) != fault_type) { + if ((prot & VM_PROT_NOTIFY) && (fault_type & VM_PROT_WRITE)) { + RETURN(KERN_WRITE_PROTECTION_FAILURE); + } else { + RETURN(KERN_PROTECTION_FAILURE); + } + } + + /* + * If this page is not pageable, we have to get + * it for all possible accesses. + */ + + if ((*wired = (entry->wired_count != 0))) + prot = fault_type = entry->protection; + + /* + * If the entry was copy-on-write, we either ... + */ + + if (entry->needs_copy) { + /* + * If we want to write the page, we may as well + * handle that now since we've got the map locked. + * + * If we don't need to write the page, we just + * demote the permissions allowed. + */ + + if (fault_type & VM_PROT_WRITE) { + /* + * Make a new object, and place it in the + * object chain. Note that no new references + * have appeared -- one just moved from the + * map to the new object. + */ + + if (vm_map_lock_read_to_write(map)) { + goto RetryLookup; + } + map->timestamp++; + + vm_object_shadow( + &entry->object.vm_object, + &entry->offset, + (vm_size_t) (entry->vme_end - entry->vme_start)); + + entry->needs_copy = FALSE; + + vm_map_lock_write_to_read(map); + } + else { + /* + * We're attempting to read a copy-on-write + * page -- don't allow writes. + */ + + prot &= (~VM_PROT_WRITE); + } + } + + /* + * Create an object if necessary. + */ + if (entry->object.vm_object == VM_OBJECT_NULL) { + + if (vm_map_lock_read_to_write(map)) { + goto RetryLookup; + } + + entry->object.vm_object = vm_object_allocate( + (vm_size_t)(entry->vme_end - entry->vme_start)); + entry->offset = 0; + vm_map_lock_write_to_read(map); + } + + /* + * Return the object/offset from this entry. If the entry + * was copy-on-write or empty, it has been fixed up. Also + * return the protection. + */ + + *offset = (vaddr - entry->vme_start) + entry->offset; + *object = entry->object.vm_object; + *out_prot = prot; + + /* + * Lock the object to prevent it from disappearing + */ + + vm_object_lock(*object); + + /* + * Save the version number and unlock the map. + */ + + out_version->main_timestamp = map->timestamp; + + RETURN(KERN_SUCCESS); + +#undef RETURN +} + +/* + * vm_map_verify: + * + * Verifies that the map in question has not changed + * since the given version. If successful, the map + * will not change until vm_map_verify_done() is called. 
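+ *
+ *	Typical use, sketched after the fault-handling pattern (the local
+ *	names below are illustrative, not part of the interface): the
+ *	version recorded by vm_map_lookup is re-checked before committing
+ *	work that was done while the map was unlocked.
+ *
+ *		vm_map_version_t version;
+ *
+ *		kr = vm_map_lookup(&map, vaddr, fault_type, &version,
+ *				   &object, &offset, &prot, &wired);
+ *		... work with the map unlocked ...
+ *		if (!vm_map_verify(map, &version))
+ *			goto Retry;
+ *		... the map remains read-locked here ...
+ *		vm_map_verify_done(map, &version);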
+ */ +boolean_t vm_map_verify( + vm_map_t map, + vm_map_version_t *version) /* REF */ +{ + boolean_t result; + + vm_map_lock_read(map); + result = (map->timestamp == version->main_timestamp); + + if (!result) + vm_map_unlock_read(map); + + return(result); +} + +/* + * vm_map_verify_done: + * + * Releases locks acquired by a vm_map_verify. + * + * This is now a macro in vm/vm_map.h. It does a + * vm_map_unlock_read on the map. + */ + +/* + * vm_region: + * + * User call to obtain information about a region in + * a task's address map. + */ + +kern_return_t vm_region( + vm_map_t map, + vm_offset_t *address, /* IN/OUT */ + vm_size_t *size, /* OUT */ + vm_prot_t *protection, /* OUT */ + vm_prot_t *max_protection, /* OUT */ + vm_inherit_t *inheritance, /* OUT */ + boolean_t *is_shared, /* OUT */ + ipc_port_t *object_name, /* OUT */ + vm_offset_t *offset_in_object) /* OUT */ +{ + vm_map_entry_t tmp_entry; + vm_map_entry_t entry; + vm_offset_t tmp_offset; + vm_offset_t start; + + if (map == VM_MAP_NULL) + return(KERN_INVALID_ARGUMENT); + + start = *address; + + vm_map_lock_read(map); + if (!vm_map_lookup_entry(map, start, &tmp_entry)) { + if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { + vm_map_unlock_read(map); + return(KERN_NO_SPACE); + } + } else { + entry = tmp_entry; + } + + start = entry->vme_start; + *protection = entry->protection; + *max_protection = entry->max_protection; + *inheritance = entry->inheritance; + *address = start; + *size = (entry->vme_end - start); + + tmp_offset = entry->offset; + + + if (entry->is_sub_map) { + *is_shared = FALSE; + *object_name = IP_NULL; + *offset_in_object = tmp_offset; + } else { + *is_shared = entry->is_shared; + *object_name = vm_object_name(entry->object.vm_object); + *offset_in_object = tmp_offset; + } + + vm_map_unlock_read(map); + + return(KERN_SUCCESS); +} + +/* + * vm_region_create_proxy: + * + * Gets a proxy to the region that ADDRESS belongs to, starting at the + * region start, with MAX_PROTECTION and LEN limited by the region ones, + * and returns it in *PORT. + */ +kern_return_t +vm_region_create_proxy (task_t task, vm_address_t address, + vm_prot_t max_protection, vm_size_t len, + ipc_port_t *port) +{ + kern_return_t ret; + vm_map_entry_t entry, tmp_entry; + vm_object_t object; + rpc_vm_offset_t rpc_offset, rpc_start; + rpc_vm_size_t rpc_len = (rpc_vm_size_t) len; + ipc_port_t pager; + + if (task == TASK_NULL) + return(KERN_INVALID_ARGUMENT); + + vm_map_lock_read(task->map); + if (!vm_map_lookup_entry(task->map, address, &tmp_entry)) { + if ((entry = tmp_entry->vme_next) == vm_map_to_entry(task->map)) { + vm_map_unlock_read(task->map); + return(KERN_NO_SPACE); + } + } else { + entry = tmp_entry; + } + + if (entry->is_sub_map) { + vm_map_unlock_read(task->map); + return(KERN_INVALID_ARGUMENT); + } + + /* Limit the allowed protection and range to the entry ones */ + if (len > entry->vme_end - entry->vme_start) { + vm_map_unlock_read(task->map); + return(KERN_INVALID_ARGUMENT); + } + max_protection &= entry->max_protection; + + object = entry->object.vm_object; + vm_object_lock(object); + /* Create a pager in case this is an internal object that does + not yet have one. 
*/ + vm_object_pager_create(object); + pager = ipc_port_copy_send(object->pager); + vm_object_unlock(object); + + rpc_start = (address - entry->vme_start) + entry->offset; + rpc_offset = 0; + + vm_map_unlock_read(task->map); + + ret = memory_object_create_proxy(task->itk_space, max_protection, + &pager, 1, + &rpc_offset, 1, + &rpc_start, 1, + &rpc_len, 1, port); + if (ret) + ipc_port_release_send(pager); + + return ret; +} + +/* + * Routine: vm_map_coalesce_entry + * Purpose: + * Try to coalesce an entry with the preceeding entry in the map. + * Conditions: + * The map is locked. If coalesced, the entry is destroyed + * by the call. + * Returns: + * Whether the entry was coalesced. + */ +boolean_t +vm_map_coalesce_entry( + vm_map_t map, + vm_map_entry_t entry) +{ + vm_map_entry_t prev = entry->vme_prev; + vm_size_t prev_size; + vm_size_t entry_size; + + /* + * Check the basic conditions for coalescing the two entries. + */ + if ((entry == vm_map_to_entry(map)) || + (prev == vm_map_to_entry(map)) || + (prev->vme_end != entry->vme_start) || + (prev->is_shared || entry->is_shared) || + (prev->is_sub_map || entry->is_sub_map) || + (prev->inheritance != entry->inheritance) || + (prev->protection != entry->protection) || + (prev->max_protection != entry->max_protection) || + (prev->needs_copy != entry->needs_copy) || + (prev->in_transition || entry->in_transition) || + (prev->wired_count != entry->wired_count) || + (prev->projected_on != 0) || + (entry->projected_on != 0)) + return FALSE; + + prev_size = prev->vme_end - prev->vme_start; + entry_size = entry->vme_end - entry->vme_start; + assert(prev->gap_size == 0); + + /* + * See if we can coalesce the two objects. + */ + if (!vm_object_coalesce(prev->object.vm_object, + entry->object.vm_object, + prev->offset, + entry->offset, + prev_size, + entry_size, + &prev->object.vm_object, + &prev->offset)) + return FALSE; + + /* + * Update the hints. + */ + if (map->hint == entry) + SAVE_HINT(map, prev); + if (map->first_free == entry) + map->first_free = prev; + + /* + * Get rid of the entry without changing any wirings or the pmap, + * and without altering map->size. + */ + prev->vme_end = entry->vme_end; + vm_map_entry_unlink(map, entry); + vm_map_entry_dispose(map, entry); + + return TRUE; +} + + + +/* + * Routine: vm_map_machine_attribute + * Purpose: + * Provide machine-specific attributes to mappings, + * such as cachability etc. for machines that provide + * them. NUMA architectures and machines with big/strange + * caches will use this. + * Note: + * Responsibilities for locking and checking are handled here, + * everything else in the pmap module. If any non-volatile + * information must be kept, the pmap module should handle + * it itself. [This assumes that attributes do not + * need to be inherited, which seems ok to me] + */ +kern_return_t vm_map_machine_attribute( + vm_map_t map, + vm_offset_t address, + vm_size_t size, + vm_machine_attribute_t attribute, + vm_machine_attribute_val_t* value) /* IN/OUT */ +{ + kern_return_t ret; + + if (address < vm_map_min(map) || + (address + size) > vm_map_max(map)) + return KERN_INVALID_ARGUMENT; + + vm_map_lock(map); + + ret = pmap_attribute(map->pmap, address, size, attribute, value); + + vm_map_unlock(map); + + return ret; +} + +/* + * Routine: vm_map_msync + * Purpose: + * Synchronize out pages of the given map out to their memory + * manager, if any. 
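+ *
+ *	Intended call pattern (a sketch only; as the body below shows,
+ *	the actual write-back is still a TODO and most requests return
+ *	KERN_INVALID_ARGUMENT):
+ *
+ *		kr = vm_map_msync(map, addr, size, VM_SYNC_SYNCHRONOUS);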
+ */ +kern_return_t vm_map_msync( + vm_map_t map, + vm_offset_t address, + vm_size_t size, + vm_sync_t sync_flags) +{ + if (map == VM_MAP_NULL) + return KERN_INVALID_ARGUMENT; + + if ((sync_flags & (VM_SYNC_ASYNCHRONOUS | VM_SYNC_SYNCHRONOUS)) == + (VM_SYNC_ASYNCHRONOUS | VM_SYNC_SYNCHRONOUS)) + return KERN_INVALID_ARGUMENT; + + size = round_page(address + size) - trunc_page(address); + address = trunc_page(address); + + if (size == 0) + return KERN_SUCCESS; + + /* TODO */ + + return KERN_INVALID_ARGUMENT; +} + + + +#if MACH_KDB + +#define printf kdbprintf + +/* + * vm_map_print: [ debug ] + */ +void vm_map_print(db_expr_t addr, boolean_t have_addr, db_expr_t count, const char *modif) +{ + vm_map_t map; + vm_map_entry_t entry; + + if (!have_addr) + map = current_thread()->task->map; + else + map = (vm_map_t)addr; + + iprintf("Map 0x%X: name=\"%s\", pmap=0x%X,", + (vm_offset_t) map, map->name, (vm_offset_t) (map->pmap)); + printf("ref=%d,nentries=%d\n", map->ref_count, map->hdr.nentries); + printf("size=%lu,resident:%lu,wired=%lu\n", map->size, + pmap_resident_count(map->pmap) * PAGE_SIZE, map->size_wired); + printf("version=%d\n", map->timestamp); + indent += 1; + for (entry = vm_map_first_entry(map); + entry != vm_map_to_entry(map); + entry = entry->vme_next) { + static char *inheritance_name[3] = { "share", "copy", "none"}; + + iprintf("map entry 0x%X: ", (vm_offset_t) entry); + printf("start=0x%X, end=0x%X\n", + (vm_offset_t) entry->vme_start, (vm_offset_t) entry->vme_end); + iprintf("prot=%X/%X/%s, ", + entry->protection, + entry->max_protection, + inheritance_name[entry->inheritance]); + if (entry->wired_count != 0) { + printf("wired, "); + } + if (entry->in_transition) { + printf("in transition"); + if (entry->needs_wakeup) + printf("(wake request)"); + printf(", "); + } + if (entry->is_sub_map) { + printf("submap=0x%X, offset=0x%X\n", + (vm_offset_t) entry->object.sub_map, + (vm_offset_t) entry->offset); + } else { + printf("object=0x%X, offset=0x%X", + (vm_offset_t) entry->object.vm_object, + (vm_offset_t) entry->offset); + if (entry->is_shared) + printf(", shared"); + if (entry->needs_copy) + printf(", copy needed"); + printf("\n"); + + if ((entry->vme_prev == vm_map_to_entry(map)) || + (entry->vme_prev->object.vm_object != entry->object.vm_object)) { + indent += 1; + vm_object_print(entry->object.vm_object); + indent -= 1; + } + } + } + indent -= 1; +} + +/* + * Routine: vm_map_copy_print + * Purpose: + * Pretty-print a copy object for ddb. 
+ */ + +void vm_map_copy_print(const vm_map_copy_t copy) +{ + int i, npages; + + printf("copy object 0x%x\n", copy); + + indent += 1; + + iprintf("type=%d", copy->type); + switch (copy->type) { + case VM_MAP_COPY_ENTRY_LIST: + printf("[entry_list]"); + break; + + case VM_MAP_COPY_OBJECT: + printf("[object]"); + break; + + case VM_MAP_COPY_PAGE_LIST: + printf("[page_list]"); + break; + + default: + printf("[bad type]"); + break; + } + printf(", offset=0x%x", copy->offset); + printf(", size=0x%x\n", copy->size); + + switch (copy->type) { + case VM_MAP_COPY_ENTRY_LIST: + /* XXX add stuff here */ + break; + + case VM_MAP_COPY_OBJECT: + iprintf("object=0x%x\n", copy->cpy_object); + break; + + case VM_MAP_COPY_PAGE_LIST: + iprintf("npages=%d", copy->cpy_npages); + printf(", cont=%x", copy->cpy_cont); + printf(", cont_args=%x\n", copy->cpy_cont_args); + if (copy->cpy_npages < 0) { + npages = 0; + } else if (copy->cpy_npages > VM_MAP_COPY_PAGE_LIST_MAX) { + npages = VM_MAP_COPY_PAGE_LIST_MAX; + } else { + npages = copy->cpy_npages; + } + iprintf("copy->cpy_page_list[0..%d] = {", npages); + for (i = 0; i < npages - 1; i++) { + printf("0x%x, ", copy->cpy_page_list[i]); + } + if (npages > 0) { + printf("0x%x", copy->cpy_page_list[npages - 1]); + } + printf("}\n"); + break; + } + + indent -= 1; +} +#endif /* MACH_KDB */ diff --git a/vm/vm_map.h b/vm/vm_map.h new file mode 100644 index 0000000..a4949e4 --- /dev/null +++ b/vm/vm_map.h @@ -0,0 +1,585 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_map.h + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * Date: 1985 + * + * Virtual memory map module definitions. + * + * Contributors: + * avie, dlb, mwyoung + */ + +#ifndef _VM_VM_MAP_H_ +#define _VM_VM_MAP_H_ + +#include <mach/kern_return.h> +#include <mach/boolean.h> +#include <mach/machine/vm_types.h> +#include <mach/vm_attributes.h> +#include <mach/vm_prot.h> +#include <mach/vm_inherit.h> +#include <mach/vm_wire.h> +#include <mach/vm_sync.h> +#include <vm/pmap.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_types.h> +#include <kern/list.h> +#include <kern/lock.h> +#include <kern/rbtree.h> +#include <kern/macros.h> + +/* TODO: make it dynamic */ +#define KENTRY_DATA_SIZE (256*PAGE_SIZE) + +/* + * Types defined: + * + * vm_map_entry_t an entry in an address map. 
+ * vm_map_version_t a timestamp of a map, for use with vm_map_lookup + * vm_map_copy_t represents memory copied from an address map, + * used for inter-map copy operations + */ + +/* + * Type: vm_map_object_t [internal use only] + * + * Description: + * The target of an address mapping, either a virtual + * memory object or a sub map (of the kernel map). + */ +typedef union vm_map_object { + struct vm_object *vm_object; /* object object */ + struct vm_map *sub_map; /* belongs to another map */ +} vm_map_object_t; + +/* + * Type: vm_map_entry_t [internal use only] + * + * Description: + * A single mapping within an address map. + * + * Implementation: + * Address map entries consist of start and end addresses, + * a VM object (or sub map) and offset into that object, + * and user-exported inheritance and protection information. + * Control information for virtual copy operations is also + * stored in the address map entry. + */ +struct vm_map_links { + struct vm_map_entry *prev; /* previous entry */ + struct vm_map_entry *next; /* next entry */ + vm_offset_t start; /* start address */ + vm_offset_t end; /* end address */ +}; + +struct vm_map_entry { + struct vm_map_links links; /* links to other entries */ +#define vme_prev links.prev +#define vme_next links.next +#define vme_start links.start +#define vme_end links.end + struct rbtree_node tree_node; /* links to other entries in tree */ + struct rbtree_node gap_node; /* links to other entries in gap tree */ + struct list gap_list; /* links to other entries with + the same gap size */ + vm_size_t gap_size; /* size of available memory + following this entry */ + union vm_map_object object; /* object I point to */ + vm_offset_t offset; /* offset into object */ + unsigned int + /* boolean_t */ in_gap_tree:1, /* entry is in the gap tree if true, + or linked to other entries with + the same gap size if false */ + /* boolean_t */ is_shared:1, /* region is shared */ + /* boolean_t */ is_sub_map:1, /* Is "object" a submap? */ + /* boolean_t */ in_transition:1, /* Entry being changed */ + /* boolean_t */ needs_wakeup:1, /* Waiters on in_transition */ + /* Only used when object is a vm_object: */ + /* boolean_t */ needs_copy:1; /* does object need to be copied */ + + /* Only in task maps: */ + vm_prot_t protection; /* protection code */ + vm_prot_t max_protection; /* maximum protection */ + vm_inherit_t inheritance; /* inheritance */ + unsigned short wired_count; /* can be paged if = 0 */ + vm_prot_t wired_access; /* wiring access types, as accepted + by vm_map_pageable; used on wiring + scans when protection != VM_PROT_NONE */ + struct vm_map_entry *projected_on; /* 0 for normal map entry + or persistent kernel map projected buffer entry; + -1 for non-persistent kernel map projected buffer entry; + pointer to corresponding kernel map entry for user map + projected buffer entry */ +}; + +typedef struct vm_map_entry *vm_map_entry_t; + +#define VM_MAP_ENTRY_NULL ((vm_map_entry_t) 0) + +/* + * Type: struct vm_map_header + * + * Description: + * Header for a vm_map and a vm_map_copy. + */ +struct vm_map_header { + struct vm_map_links links; /* first, last, min, max */ + struct rbtree tree; /* Sorted tree of entries */ + struct rbtree gap_tree; /* Sorted tree of gap lists + for allocations */ + int nentries; /* Number of entries */ +}; + +/* + * Type: vm_map_t [exported; contents invisible] + * + * Description: + * An address map -- a directory relating valid + * regions of a task's address space to the corresponding + * virtual memory objects. 
+ * + * Implementation: + * Maps are doubly-linked lists of map entries, sorted + * by address. They're also contained in a red-black tree. + * One hint is used to start searches again at the last + * successful search, insertion, or removal. If the hint + * lookup failed (i.e. the hint didn't refer to the requested + * entry), a BST lookup is performed. Another hint is used to + * quickly find free space. + */ +struct vm_map { + lock_data_t lock; /* Lock for map data */ + struct vm_map_header hdr; /* Map entry header */ +#define min_offset hdr.links.start /* start of range */ +#define max_offset hdr.links.end /* end of range */ + pmap_t pmap; /* Physical map */ + vm_size_t size; /* virtual size */ + vm_size_t size_wired; /* wired size */ + int ref_count; /* Reference count */ + decl_simple_lock_data(, ref_lock) /* Lock for ref_count field */ + vm_map_entry_t hint; /* hint for quick lookups */ + decl_simple_lock_data(, hint_lock) /* lock for hint storage */ + vm_map_entry_t first_free; /* First free space hint */ + + /* Flags */ + unsigned int wait_for_space:1, /* Should callers wait + for space? */ + /* boolean_t */ wiring_required:1; /* New mappings are wired? */ + + unsigned int timestamp; /* Version number */ + + const char *name; /* Associated name */ +}; + +#define vm_map_to_entry(map) ((struct vm_map_entry *) &(map)->hdr.links) +#define vm_map_first_entry(map) ((map)->hdr.links.next) +#define vm_map_last_entry(map) ((map)->hdr.links.prev) + +/* + * Type: vm_map_version_t [exported; contents invisible] + * + * Description: + * Map versions may be used to quickly validate a previous + * lookup operation. + * + * Usage note: + * Because they are bulky objects, map versions are usually + * passed by reference. + * + * Implementation: + * Just a timestamp for the main map. + */ +typedef struct vm_map_version { + unsigned int main_timestamp; +} vm_map_version_t; + +/* + * Type: vm_map_copy_t [exported; contents invisible] + * + * Description: + * A map copy object represents a region of virtual memory + * that has been copied from an address map but is still + * in transit. + * + * A map copy object may only be used by a single thread + * at a time. + * + * Implementation: + * There are three formats for map copy objects. + * The first is very similar to the main + * address map in structure, and as a result, some + * of the internal maintenance functions/macros can + * be used with either address maps or map copy objects. + * + * The map copy object contains a header links + * entry onto which the other entries that represent + * the region are chained. + * + * The second format is a single vm object. This is used + * primarily in the pageout path. The third format is a + * list of vm pages. An optional continuation provides + * a hook to be called to obtain more of the memory, + * or perform other operations. The continuation takes 3 + * arguments, a saved arg buffer, a pointer to a new vm_map_copy + * (returned) and an abort flag (abort if TRUE). 
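+ *
+ *	Sketch of how a consumer advances a page list copy once its
+ *	current pages have been used up (modelled on
+ *	vm_map_copyout_page_list in vm/vm_map.c; "kr" and "new_copy" are
+ *	illustrative locals). Note that the continuation may legitimately
+ *	return VM_MAP_COPY_NULL when the region is exhausted.
+ *
+ *		if (copy->cpy_npages == 0 && vm_map_copy_has_cont(copy)) {
+ *			vm_map_copy_invoke_cont(copy, &new_copy, &kr);
+ *			if (kr != KERN_SUCCESS)
+ *				return kr;
+ *			copy = new_copy;
+ *		}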
+ */ + +#define VM_MAP_COPY_PAGE_LIST_MAX 64 + +struct vm_map_copy; +struct vm_map_copyin_args_data; +typedef kern_return_t (*vm_map_copy_cont_fn)(struct vm_map_copyin_args_data*, struct vm_map_copy**); + +typedef struct vm_map_copy { + int type; +#define VM_MAP_COPY_ENTRY_LIST 1 +#define VM_MAP_COPY_OBJECT 2 +#define VM_MAP_COPY_PAGE_LIST 3 + vm_offset_t offset; + vm_size_t size; + union { + struct vm_map_header hdr; /* ENTRY_LIST */ + struct { /* OBJECT */ + vm_object_t object; + } c_o; + struct { /* PAGE_LIST */ + vm_page_t page_list[VM_MAP_COPY_PAGE_LIST_MAX]; + int npages; + vm_map_copy_cont_fn cont; + struct vm_map_copyin_args_data* cont_args; + } c_p; + } c_u; +} *vm_map_copy_t; + +#define cpy_hdr c_u.hdr + +#define cpy_object c_u.c_o.object + +#define cpy_page_list c_u.c_p.page_list +#define cpy_npages c_u.c_p.npages +#define cpy_cont c_u.c_p.cont +#define cpy_cont_args c_u.c_p.cont_args + +#define VM_MAP_COPY_NULL ((vm_map_copy_t) 0) + +/* + * Useful macros for entry list copy objects + */ + +#define vm_map_copy_to_entry(copy) \ + ((struct vm_map_entry *) &(copy)->cpy_hdr.links) +#define vm_map_copy_first_entry(copy) \ + ((copy)->cpy_hdr.links.next) +#define vm_map_copy_last_entry(copy) \ + ((copy)->cpy_hdr.links.prev) + +/* + * Continuation macros for page list copy objects + */ + +#define vm_map_copy_invoke_cont(old_copy, new_copy, result) \ +MACRO_BEGIN \ + vm_map_copy_page_discard(old_copy); \ + *result = (*((old_copy)->cpy_cont))((old_copy)->cpy_cont_args, \ + new_copy); \ + (old_copy)->cpy_cont = (kern_return_t (*)()) 0; \ +MACRO_END + +#define vm_map_copy_invoke_extend_cont(old_copy, new_copy, result) \ +MACRO_BEGIN \ + *result = (*((old_copy)->cpy_cont))((old_copy)->cpy_cont_args, \ + new_copy); \ + (old_copy)->cpy_cont = (kern_return_t (*)()) 0; \ +MACRO_END + +#define vm_map_copy_abort_cont(old_copy) \ +MACRO_BEGIN \ + vm_map_copy_page_discard(old_copy); \ + (*((old_copy)->cpy_cont))((old_copy)->cpy_cont_args, \ + (vm_map_copy_t *) 0); \ + (old_copy)->cpy_cont = (kern_return_t (*)()) 0; \ + (old_copy)->cpy_cont_args = VM_MAP_COPYIN_ARGS_NULL; \ +MACRO_END + +#define vm_map_copy_has_cont(copy) \ + (((copy)->cpy_cont) != (kern_return_t (*)()) 0) + +/* + * Continuation structures for vm_map_copyin_page_list. + */ + +typedef struct vm_map_copyin_args_data { + vm_map_t map; + vm_offset_t src_addr; + vm_size_t src_len; + vm_offset_t destroy_addr; + vm_size_t destroy_len; + boolean_t steal_pages; +} vm_map_copyin_args_data_t, *vm_map_copyin_args_t; + +#define VM_MAP_COPYIN_ARGS_NULL ((vm_map_copyin_args_t) 0) + +/* + * Macros: vm_map_lock, etc. [internal use only] + * Description: + * Perform locking on the data portion of a map. + */ + +#define vm_map_lock_init(map) \ +MACRO_BEGIN \ + lock_init(&(map)->lock, TRUE); \ + (map)->timestamp = 0; \ +MACRO_END + +void vm_map_lock(struct vm_map *map); +void vm_map_unlock(struct vm_map *map); + +#define vm_map_lock_read(map) lock_read(&(map)->lock) +#define vm_map_unlock_read(map) lock_read_done(&(map)->lock) +#define vm_map_lock_write_to_read(map) \ + lock_write_to_read(&(map)->lock) +#define vm_map_lock_read_to_write(map) \ + (lock_read_to_write(&(map)->lock) || (((map)->timestamp++), 0)) +#define vm_map_lock_set_recursive(map) \ + lock_set_recursive(&(map)->lock) +#define vm_map_lock_clear_recursive(map) \ + lock_clear_recursive(&(map)->lock) + +/* + * Exported procedures that operate on vm_map_t. 
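+ *
+ *	The common round trip between two maps, sketched with error
+ *	handling abbreviated: on successful copyout the copy object is
+ *	consumed, otherwise the caller discards it.
+ *
+ *		vm_map_copy_t	copy;
+ *		vm_offset_t	dst_addr;
+ *
+ *		kr = vm_map_copyin(src_map, src_addr, len, FALSE, &copy);
+ *		if (kr != KERN_SUCCESS)
+ *			return kr;
+ *		kr = vm_map_copyout(dst_map, &dst_addr, copy);
+ *		if (kr != KERN_SUCCESS)
+ *			vm_map_copy_discard(copy);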
+ */ + +/* Initialize the module */ +extern void vm_map_init(void); + +/* Initialize an empty map */ +extern void vm_map_setup(vm_map_t, pmap_t, vm_offset_t, vm_offset_t); +/* Create an empty map */ +extern vm_map_t vm_map_create(pmap_t, vm_offset_t, vm_offset_t); +/* Create a map in the image of an existing map */ +extern vm_map_t vm_map_fork(vm_map_t); + +/* Gain a reference to an existing map */ +extern void vm_map_reference(vm_map_t); +/* Lose a reference */ +extern void vm_map_deallocate(vm_map_t); + +/* Enter a mapping */ +extern kern_return_t vm_map_enter(vm_map_t, vm_offset_t *, vm_size_t, + vm_offset_t, boolean_t, vm_object_t, + vm_offset_t, boolean_t, vm_prot_t, + vm_prot_t, vm_inherit_t); +/* Enter a mapping primitive */ +extern kern_return_t vm_map_find_entry(vm_map_t, vm_offset_t *, vm_size_t, + vm_offset_t, vm_object_t, + vm_map_entry_t *); +/* Deallocate a region */ +extern kern_return_t vm_map_remove(vm_map_t, vm_offset_t, vm_offset_t); +/* Change protection */ +extern kern_return_t vm_map_protect(vm_map_t, vm_offset_t, vm_offset_t, + vm_prot_t, boolean_t); +/* Change inheritance */ +extern kern_return_t vm_map_inherit(vm_map_t, vm_offset_t, vm_offset_t, + vm_inherit_t); + +/* Look up an address */ +extern kern_return_t vm_map_lookup(vm_map_t *, vm_offset_t, vm_prot_t, + vm_map_version_t *, vm_object_t *, + vm_offset_t *, vm_prot_t *, boolean_t *); +/* Find a map entry */ +extern boolean_t vm_map_lookup_entry(vm_map_t, vm_offset_t, + vm_map_entry_t *); +/* Verify that a previous lookup is still valid */ +extern boolean_t vm_map_verify(vm_map_t, vm_map_version_t *); +/* vm_map_verify_done is now a macro -- see below */ +/* Make a copy of a region */ +extern kern_return_t vm_map_copyin(vm_map_t, vm_offset_t, vm_size_t, + boolean_t, vm_map_copy_t *); +/* Make a copy of a region using a page list copy */ +extern kern_return_t vm_map_copyin_page_list(vm_map_t, vm_offset_t, + vm_size_t, boolean_t, + boolean_t, vm_map_copy_t *, + boolean_t); +/* Place a copy into a map */ +extern kern_return_t vm_map_copyout(vm_map_t, vm_offset_t *, vm_map_copy_t); +/* Overwrite existing memory with a copy */ +extern kern_return_t vm_map_copy_overwrite(vm_map_t, vm_offset_t, + vm_map_copy_t, boolean_t); +/* Discard a copy without using it */ +extern void vm_map_copy_discard(vm_map_copy_t); +extern void vm_map_copy_page_discard(vm_map_copy_t); +extern vm_map_copy_t vm_map_copy_copy(vm_map_copy_t); +/* Page list continuation version of previous */ +extern kern_return_t vm_map_copy_discard_cont(vm_map_copyin_args_t, + vm_map_copy_t *); + +extern boolean_t vm_map_coalesce_entry(vm_map_t, vm_map_entry_t); + +/* Add or remove machine- dependent attributes from map regions */ +extern kern_return_t vm_map_machine_attribute(vm_map_t, vm_offset_t, + vm_size_t, + vm_machine_attribute_t, + vm_machine_attribute_val_t *); + +extern kern_return_t vm_map_msync(vm_map_t, + vm_offset_t, vm_size_t, vm_sync_t); + +/* Delete entry from map */ +extern void vm_map_entry_delete(vm_map_t, vm_map_entry_t); + +kern_return_t vm_map_delete( + vm_map_t map, + vm_offset_t start, + vm_offset_t end); + +kern_return_t vm_map_copyout_page_list( + vm_map_t dst_map, + vm_offset_t *dst_addr, /* OUT */ + vm_map_copy_t copy); + +void vm_map_copy_page_discard (vm_map_copy_t copy); + +boolean_t vm_map_lookup_entry( + vm_map_t map, + vm_offset_t address, + vm_map_entry_t *entry); /* OUT */ + +static inline void vm_map_set_name(vm_map_t map, const char *name) +{ + map->name = name; +} + + +/* + * Functions implemented as macros + */ 
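+/*
+ * Illustrative sketch (not part of the interface above): vm_map_lookup
+ * fills in a vm_map_version_t that a caller can later revalidate with
+ * vm_map_verify; vm_map_verify_done (a macro defined below) ends the
+ * verified region.  Roughly:
+ *
+ *	kr = vm_map_lookup(&map, addr, VM_PROT_READ, &version,
+ *			   &object, &offset, &prot, &wired);
+ *	... use the returned object and offset ...
+ *	if (vm_map_verify(map, &version)) {
+ *		... the earlier result is still valid ...
+ *		vm_map_verify_done(map, &version);
+ *	}
+ */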
+#define vm_map_min(map) ((map)->min_offset)
+ /* Lowest valid address in
+ * a map */
+
+#define vm_map_max(map) ((map)->max_offset)
+ /* Highest valid address */
+
+#define vm_map_pmap(map) ((map)->pmap)
+ /* Physical map associated
+ * with this address map */
+
+#define vm_map_verify_done(map, version) (vm_map_unlock_read(map))
+ /* Operation that required
+ * a verified lookup is
+ * now complete */
+/*
+ * Pageability functions.
+ */
+extern kern_return_t vm_map_pageable(vm_map_t, vm_offset_t, vm_offset_t,
+ vm_prot_t, boolean_t, boolean_t);
+
+extern kern_return_t vm_map_pageable_all(vm_map_t, vm_wire_t);
+
+/*
+ * Submap object. Must be used to create memory to be put
+ * in a submap by vm_map_submap.
+ */
+extern vm_object_t vm_submap_object;
+
+/*
+ * vm_map_copyin_object:
+ *
+ * Create a copy object from an object.
+ * Our caller donates an object reference.
+ */
+extern kern_return_t vm_map_copyin_object(
+ vm_object_t object,
+ vm_offset_t offset, /* offset of region in object */
+ vm_size_t size, /* size of region in object */
+ vm_map_copy_t *copy_result); /* OUT */
+
+/*
+ * vm_map_submap: [ kernel use only ]
+ *
+ * Mark the given range as handled by a subordinate map.
+ *
+ * This range must have been created with vm_map_find using
+ * the vm_submap_object, and no other operations may have been
+ * performed on this range prior to calling vm_map_submap.
+ *
+ * Only a limited number of operations can be performed
+ * within this range after calling vm_map_submap:
+ * vm_fault
+ * [Don't try vm_map_copyin!]
+ *
+ * To remove a submapping, one must first remove the
+ * range from the superior map, and then destroy the
+ * submap (if desired). [Better yet, don't try it.]
+ */
+extern kern_return_t vm_map_submap(
+ vm_map_t map,
+ vm_offset_t start,
+ vm_offset_t end,
+ vm_map_t submap);
+
+/*
+ * Wait and wakeup macros for in_transition map entries.
+ */
+#define vm_map_entry_wait(map, interruptible) \
+ MACRO_BEGIN \
+ assert_wait((event_t)&(map)->hdr, interruptible); \
+ vm_map_unlock(map); \
+ thread_block((void (*)()) 0); \
+ MACRO_END
+
+#define vm_map_entry_wakeup(map) thread_wakeup((event_t)&(map)->hdr)
+
+/*
+ * This routine is called only when it is known that
+ * the entry must be split.
+ */
+extern void _vm_map_clip_start(
+ struct vm_map_header *map_header,
+ vm_map_entry_t entry,
+ vm_offset_t start,
+ boolean_t link_gap);
+
+/*
+ * vm_map_clip_end: [ internal use only ]
+ *
+ * Asserts that the given entry ends at or before
+ * the specified address; if necessary,
+ * it splits the entry into two.
+ */
+void _vm_map_clip_end(
+ struct vm_map_header *map_header,
+ vm_map_entry_t entry,
+ vm_offset_t end,
+ boolean_t link_gap);
+
+#endif /* _VM_VM_MAP_H_ */
diff --git a/vm/vm_object.c b/vm/vm_object.c
new file mode 100644
index 0000000..c238cce
--- /dev/null
+++ b/vm/vm_object.c
@@ -0,0 +1,2994 @@
+/*
+ * Mach Operating System
+ * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University.
+ * Copyright (c) 1993,1994 The University of Utah and
+ * the Computer Systems Laboratory (CSL).
+ * All rights reserved.
+ *
+ * Permission to use, copy, modify and distribute this software and its
+ * documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF
+ * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY
+ * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF
+ * THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie Mellon
+ * the rights to redistribute these changes.
+ */
+/*
+ * File: vm/vm_object.c
+ * Author: Avadis Tevanian, Jr., Michael Wayne Young
+ *
+ * Virtual memory object module.
+ */
+
+#include <kern/printf.h>
+#include <string.h>
+
+#include <mach/memory_object.h>
+#include <vm/memory_object_default.user.h>
+#include <vm/memory_object_user.user.h>
+#include <machine/vm_param.h>
+#include <ipc/ipc_port.h>
+#include <ipc/ipc_space.h>
+#include <kern/assert.h>
+#include <kern/debug.h>
+#include <kern/mach.server.h>
+#include <kern/lock.h>
+#include <kern/queue.h>
+#include <kern/xpr.h>
+#include <kern/slab.h>
+#include <vm/memory_object.h>
+#include <vm/vm_fault.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+
+#if MACH_KDB
+#include <ddb/db_output.h>
+#endif /* MACH_KDB */
+
+void memory_object_release(
+ ipc_port_t pager,
+ pager_request_t pager_request,
+ ipc_port_t pager_name); /* forward */
+
+/*
+ * Virtual memory objects maintain the actual data
+ * associated with allocated virtual memory. A given
+ * page of memory exists within exactly one object.
+ *
+ * An object is only deallocated when all "references"
+ * are given up. Only one "reference" to a given
+ * region of an object should be writeable.
+ *
+ * Associated with each object is a list of all resident
+ * memory pages belonging to that object; this list is
+ * maintained by the "vm_page" module, but locked by the object's
+ * lock.
+ *
+ * Each object also records the memory object port
+ * that is used by the kernel to request and write
+ * back data (the memory object port, field "pager"),
+ * and the ports provided to the memory manager, the server that
+ * manages that data, to return data and control its
+ * use (the memory object control port, field "pager_request")
+ * and for naming (the memory object name port, field "pager_name").
+ *
+ * Virtual memory objects are allocated to provide
+ * zero-filled memory (vm_allocate) or map a user-defined
+ * memory object into a virtual address space (vm_map).
+ *
+ * Virtual memory objects that refer to a user-defined
+ * memory object are called "permanent", because all changes
+ * made in virtual memory are reflected back to the
+ * memory manager, which may then store them permanently.
+ * Other virtual memory objects are called "temporary",
+ * meaning that changes need be written back only when
+ * necessary to reclaim pages, and that storage associated
+ * with the object can be discarded once it is no longer
+ * mapped.
+ *
+ * A permanent memory object may be mapped into more
+ * than one virtual address space. Moreover, two threads
+ * may attempt to make the first mapping of a memory
+ * object concurrently. Only one thread is allowed to
+ * complete this mapping; all others wait until the
+ * "pager_initialized" field is asserted, indicating
+ * that the first thread has initialized all of the
+ * necessary fields in the virtual memory object structure.
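+ *
+ *	(An illustrative sketch of that wait, as it appears later in
+ *	vm_object_enter; the object is locked on entry:)
+ *
+ *		while (!object->pager_initialized) {
+ *			vm_object_wait(object,
+ *				       VM_OBJECT_EVENT_INITIALIZED,
+ *				       FALSE);
+ *			vm_object_lock(object);
+ *		}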
+ * + * The kernel relies on a *default memory manager* to + * provide backing storage for the zero-filled virtual + * memory objects. The memory object ports associated + * with these temporary virtual memory objects are only + * generated and passed to the default memory manager + * when it becomes necessary. Virtual memory objects + * that depend on the default memory manager are called + * "internal". The "pager_created" field is provided to + * indicate whether these ports have ever been allocated. + * + * The kernel may also create virtual memory objects to + * hold changed pages after a copy-on-write operation. + * In this case, the virtual memory object (and its + * backing storage -- its memory object) only contain + * those pages that have been changed. The "shadow" + * field refers to the virtual memory object that contains + * the remainder of the contents. The "shadow_offset" + * field indicates where in the "shadow" these contents begin. + * The "copy" field refers to a virtual memory object + * to which changed pages must be copied before changing + * this object, in order to implement another form + * of copy-on-write optimization. + * + * The virtual memory object structure also records + * the attributes associated with its memory object. + * The "pager_ready", "can_persist" and "copy_strategy" + * fields represent those attributes. The "cached_list" + * field is used in the implementation of the persistence + * attribute. + * + * ZZZ Continue this comment. + */ + +struct kmem_cache vm_object_cache; /* vm backing store cache */ + +/* + * All wired-down kernel memory belongs to a single virtual + * memory object (kernel_object) to avoid wasting data structures. + */ +static struct vm_object kernel_object_store; +vm_object_t kernel_object = &kernel_object_store; + +/* + * Virtual memory objects that are not referenced by + * any address maps, but that are allowed to persist + * (an attribute specified by the associated memory manager), + * are kept in a queue (vm_object_cached_list). + * + * When an object from this queue is referenced again, + * for example to make another address space mapping, + * it must be removed from the queue. That is, the + * queue contains *only* objects with zero references. + * + * The kernel may choose to terminate objects from this + * queue in order to reclaim storage. The current policy + * is to let memory pressure dynamically adjust the number + * of unreferenced objects. The pageout daemon attempts to + * collect objects after removing pages from them. + * + * A simple lock (accessed by routines + * vm_object_cache_{lock,lock_try,unlock}) governs the + * object cache. It must be held when objects are + * added to or removed from the cache (in vm_object_terminate). + * The routines that acquire a reference to a virtual + * memory object based on one of the memory object ports + * must also lock the cache. + * + * Ideally, the object cache should be more isolated + * from the reference mechanism, so that the lock need + * not be held to make simple references. + */ +queue_head_t vm_object_cached_list; + +def_simple_lock_data(static,vm_object_cached_lock_data) + +#define vm_object_cache_lock() \ + simple_lock(&vm_object_cached_lock_data) +#define vm_object_cache_lock_try() \ + simple_lock_try(&vm_object_cached_lock_data) +#define vm_object_cache_unlock() \ + simple_unlock(&vm_object_cached_lock_data) + +/* + * Number of physical pages referenced by cached objects. 
+ * This counter is protected by its own lock to work around + * lock ordering issues. + */ +int vm_object_cached_pages; + +def_simple_lock_data(static,vm_object_cached_pages_lock_data) + +/* + * Virtual memory objects are initialized from + * a template (see vm_object_allocate). + * + * When adding a new field to the virtual memory + * object structure, be sure to add initialization + * (see vm_object_init). + */ +struct vm_object vm_object_template; + +/* + * vm_object_allocate: + * + * Returns a new object with the given size. + */ + +static void _vm_object_setup( + vm_object_t object, + vm_size_t size) +{ + *object = vm_object_template; + queue_init(&object->memq); + vm_object_lock_init(object); + object->size = size; +} + +static vm_object_t _vm_object_allocate( + vm_size_t size) +{ + vm_object_t object; + + object = (vm_object_t) kmem_cache_alloc(&vm_object_cache); + if (!object) + return 0; + + _vm_object_setup(object, size); + + return object; +} + +vm_object_t vm_object_allocate( + vm_size_t size) +{ + vm_object_t object; + ipc_port_t port; + + object = _vm_object_allocate(size); + if (object == 0) + panic("vm_object_allocate"); + port = ipc_port_alloc_kernel(); + if (port == IP_NULL) + panic("vm_object_allocate"); + object->pager_name = port; + ipc_kobject_set(port, (ipc_kobject_t) object, IKOT_PAGING_NAME); + + return object; +} + +/* + * vm_object_bootstrap: + * + * Initialize the VM objects module. + */ +void vm_object_bootstrap(void) +{ + kmem_cache_init(&vm_object_cache, "vm_object", + sizeof(struct vm_object), 0, NULL, 0); + + queue_init(&vm_object_cached_list); + simple_lock_init(&vm_object_cached_lock_data); + + /* + * Fill in a template object, for quick initialization + */ + + vm_object_template.ref_count = 1; + vm_object_template.size = 0; + vm_object_template.resident_page_count = 0; + vm_object_template.copy = VM_OBJECT_NULL; + vm_object_template.shadow = VM_OBJECT_NULL; + vm_object_template.shadow_offset = (vm_offset_t) 0; + + vm_object_template.pager = IP_NULL; + vm_object_template.paging_offset = 0; + vm_object_template.pager_request = PAGER_REQUEST_NULL; + vm_object_template.pager_name = IP_NULL; + + vm_object_template.pager_created = FALSE; + vm_object_template.pager_initialized = FALSE; + vm_object_template.pager_ready = FALSE; + + vm_object_template.copy_strategy = MEMORY_OBJECT_COPY_NONE; + /* ignored if temporary, will be reset before + * permanent object becomes ready */ + vm_object_template.use_shared_copy = FALSE; + vm_object_template.shadowed = FALSE; + + vm_object_template.absent_count = 0; + vm_object_template.all_wanted = 0; /* all bits FALSE */ + + vm_object_template.paging_in_progress = 0; + vm_object_template.used_for_pageout = FALSE; + vm_object_template.can_persist = FALSE; + vm_object_template.cached = FALSE; + vm_object_template.internal = TRUE; + vm_object_template.temporary = TRUE; + vm_object_template.alive = TRUE; + vm_object_template.lock_in_progress = FALSE; + vm_object_template.lock_restart = FALSE; + vm_object_template.last_alloc = (vm_offset_t) 0; + +#if MACH_PAGEMAP + vm_object_template.existence_info = VM_EXTERNAL_NULL; +#endif /* MACH_PAGEMAP */ + + /* + * Initialize the "kernel object" + */ + + _vm_object_setup(kernel_object, + VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS); + + /* + * Initialize the "submap object". Make it as large as the + * kernel object so that no limit is imposed on submap sizes. 
+ */ + + _vm_object_setup(vm_submap_object, + VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS); + +#if MACH_PAGEMAP + vm_external_module_initialize(); +#endif /* MACH_PAGEMAP */ +} + +void vm_object_init(void) +{ + /* + * Finish initializing the kernel object. + * The submap object doesn't need a name port. + */ + + kernel_object->pager_name = ipc_port_alloc_kernel(); + ipc_kobject_set(kernel_object->pager_name, + (ipc_kobject_t) kernel_object, + IKOT_PAGING_NAME); +} + +/* + * Object cache management functions. + * + * Both the cache and the object must be locked + * before calling these functions. + */ + +static void vm_object_cache_add( + vm_object_t object) +{ + assert(!object->cached); + queue_enter(&vm_object_cached_list, object, vm_object_t, cached_list); + object->cached = TRUE; +} + +static void vm_object_cache_remove( + vm_object_t object) +{ + assert(object->cached); + queue_remove(&vm_object_cached_list, object, vm_object_t, cached_list); + object->cached = FALSE; +} + +void vm_object_collect( + vm_object_t object) +{ + vm_object_unlock(object); + + /* + * The cache lock must be acquired in the proper order. + */ + + vm_object_cache_lock(); + vm_object_lock(object); + + /* + * If the object was referenced while the lock was + * dropped, cancel the termination. + */ + + if (!vm_object_collectable(object)) { + vm_object_unlock(object); + vm_object_cache_unlock(); + return; + } + + vm_object_cache_remove(object); + vm_object_terminate(object); +} + +/* + * vm_object_reference: + * + * Gets another reference to the given object. + */ +void vm_object_reference( + vm_object_t object) +{ + if (object == VM_OBJECT_NULL) + return; + + vm_object_lock(object); + assert(object->ref_count > 0); + object->ref_count++; + vm_object_unlock(object); +} + +/* + * vm_object_deallocate: + * + * Release a reference to the specified object, + * gained either through a vm_object_allocate + * or a vm_object_reference call. When all references + * are gone, storage associated with this object + * may be relinquished. + * + * No object may be locked. + */ +void vm_object_deallocate( + vm_object_t object) +{ + vm_object_t temp; + + while (object != VM_OBJECT_NULL) { + + /* + * The cache holds a reference (uncounted) to + * the object; we must lock it before removing + * the object. + */ + + vm_object_cache_lock(); + + /* + * Lose the reference + */ + vm_object_lock(object); + if (--(object->ref_count) > 0) { + + /* + * If there are still references, then + * we are done. + */ + vm_object_unlock(object); + vm_object_cache_unlock(); + return; + } + + /* + * See whether this object can persist. If so, enter + * it in the cache. + */ + if (object->can_persist && (object->resident_page_count > 0)) { + vm_object_cache_add(object); + vm_object_cache_unlock(); + vm_object_unlock(object); + return; + } + + if (object->pager_created && + !object->pager_initialized) { + + /* + * Have to wait for initialization. + * Put reference back and retry + * when it's initialized. + */ + + object->ref_count++; + vm_object_assert_wait(object, + VM_OBJECT_EVENT_INITIALIZED, FALSE); + vm_object_unlock(object); + vm_object_cache_unlock(); + thread_block((void (*)()) 0); + continue; + } + + /* + * Take the reference to the shadow object + * out of the object to be destroyed. + */ + + temp = object->shadow; + + /* + * Destroy the object; the cache lock will + * be released in the process. 
+ */ + + vm_object_terminate(object); + + /* + * Deallocate the reference to the shadow + * by continuing the loop with that object + * in place of the original. + */ + + object = temp; + } +} + +/* + * Routine: vm_object_terminate + * Purpose: + * Free all resources associated with a vm_object. + * In/out conditions: + * Upon entry, the object and the cache must be locked, + * and the object must have no references. + * + * The shadow object reference is left alone. + * + * Upon exit, the cache will be unlocked, and the + * object will cease to exist. + */ +void vm_object_terminate( + vm_object_t object) +{ + vm_page_t p; + vm_object_t shadow_object; + + /* + * Make sure the object isn't already being terminated + */ + + assert(object->alive); + object->alive = FALSE; + + /* + * Make sure no one can look us up now. + */ + + vm_object_remove(object); + vm_object_cache_unlock(); + + /* + * Detach the object from its shadow if we are the shadow's + * copy. + */ + if ((shadow_object = object->shadow) != VM_OBJECT_NULL) { + vm_object_lock(shadow_object); + assert((shadow_object->copy == object) || + (shadow_object->copy == VM_OBJECT_NULL)); + shadow_object->copy = VM_OBJECT_NULL; + vm_object_unlock(shadow_object); + } + + /* + * The pageout daemon might be playing with our pages. + * Now that the object is dead, it won't touch any more + * pages, but some pages might already be on their way out. + * Hence, we wait until the active paging activities have ceased. + */ + + vm_object_paging_wait(object, FALSE); + + /* + * Clean or free the pages, as appropriate. + * It is possible for us to find busy/absent pages, + * if some faults on this object were aborted. + */ + + if ((object->temporary) || (object->pager == IP_NULL)) { + while (!queue_empty(&object->memq)) { + p = (vm_page_t) queue_first(&object->memq); + + VM_PAGE_CHECK(p); + + VM_PAGE_FREE(p); + } + } else while (!queue_empty(&object->memq)) { + p = (vm_page_t) queue_first(&object->memq); + + VM_PAGE_CHECK(p); + + vm_page_lock_queues(); + VM_PAGE_QUEUES_REMOVE(p); + vm_page_unlock_queues(); + + if (p->absent || p->private) { + + /* + * For private pages, VM_PAGE_FREE just + * leaves the page structure around for + * its owner to clean up. For absent + * pages, the structure is returned to + * the appropriate pool. + */ + + goto free_page; + } + + if (!p->dirty) + p->dirty = pmap_is_modified(p->phys_addr); + + if (p->dirty || p->precious) { + p->busy = TRUE; + vm_pageout_page(p, FALSE, TRUE); /* flush page */ + } else { + free_page: + VM_PAGE_FREE(p); + } + } + + assert(object->ref_count == 0); + assert(object->paging_in_progress == 0); + assert(!object->cached); + + if (!object->internal) { + assert(object->resident_page_count == 0); + + vm_page_lock_queues(); + vm_object_external_count--; + vm_page_unlock_queues(); + } + + /* + * Throw away port rights... note that they may + * already have been thrown away (by vm_object_destroy + * or memory_object_destroy). + * + * Instead of destroying the control and name ports, + * we send all rights off to the memory manager instead, + * using memory_object_terminate. 
+ */ + + vm_object_unlock(object); + + if (object->pager != IP_NULL) { + /* consumes our rights for pager, pager_request, pager_name */ + memory_object_release(object->pager, + object->pager_request, + object->pager_name); + } else if (object->pager_name != IP_NULL) { + /* consumes our right for pager_name */ + ipc_port_dealloc_kernel(object->pager_name); + } + +#if MACH_PAGEMAP + vm_external_destroy(object->existence_info); +#endif /* MACH_PAGEMAP */ + + /* + * Free the space for the object. + */ + + kmem_cache_free(&vm_object_cache, (vm_offset_t) object); +} + +/* + * Routine: vm_object_pager_wakeup + * Purpose: Wake up anyone waiting for IKOT_PAGER_TERMINATING + */ + +void +vm_object_pager_wakeup( + ipc_port_t pager) +{ + boolean_t someone_waiting; + + /* + * If anyone was waiting for the memory_object_terminate + * to be queued, wake them up now. + */ + vm_object_cache_lock(); + assert(ip_kotype(pager) == IKOT_PAGER_TERMINATING); + someone_waiting = (pager->ip_kobject != IKO_NULL); + if (ip_active(pager)) + ipc_kobject_set(pager, IKO_NULL, IKOT_NONE); + vm_object_cache_unlock(); + if (someone_waiting) { + thread_wakeup((event_t) pager); + } +} + +/* + * Routine: memory_object_release + * Purpose: Terminate the pager and release port rights, + * just like memory_object_terminate, except + * that we wake up anyone blocked in vm_object_enter + * waiting for termination message to be queued + * before calling memory_object_init. + */ +void memory_object_release( + ipc_port_t pager, + pager_request_t pager_request, + ipc_port_t pager_name) +{ + + /* + * Keep a reference to pager port; + * the terminate might otherwise release all references. + */ + ip_reference(pager); + + /* + * Terminate the pager. + */ + (void) memory_object_terminate(pager, pager_request, pager_name); + + /* + * Wakeup anyone waiting for this terminate + */ + vm_object_pager_wakeup(pager); + + /* + * Release reference to pager port. + */ + ip_release(pager); +} + +/* + * Routine: vm_object_abort_activity [internal use only] + * Purpose: + * Abort paging requests pending on this object. + * In/out conditions: + * The object is locked on entry and exit. + */ +static void vm_object_abort_activity( + vm_object_t object) +{ + vm_page_t p; + vm_page_t next; + + /* + * Abort all activity that would be waiting + * for a result on this memory object. + * + * We could also choose to destroy all pages + * that we have in memory for this object, but + * we don't. + */ + + p = (vm_page_t) queue_first(&object->memq); + while (!queue_end(&object->memq, (queue_entry_t) p)) { + next = (vm_page_t) queue_next(&p->listq); + + /* + * If it's being paged in, destroy it. + * If an unlock has been requested, start it again. + */ + + if (p->busy && p->absent) { + VM_PAGE_FREE(p); + } + else { + if (p->unlock_request != VM_PROT_NONE) + p->unlock_request = VM_PROT_NONE; + PAGE_WAKEUP(p); + } + + p = next; + } + + /* + * Wake up threads waiting for the memory object to + * become ready. + */ + + object->pager_ready = TRUE; + vm_object_wakeup(object, VM_OBJECT_EVENT_PAGER_READY); +} + +/* + * Routine: memory_object_destroy [user interface] + * Purpose: + * Shut down a memory object, despite the + * presence of address map (or other) references + * to the vm_object. + * Note: + * This routine may be called either from the user interface, + * or from port destruction handling (via vm_object_destroy). 
+ */
+kern_return_t memory_object_destroy(
+ vm_object_t object,
+ kern_return_t reason)
+{
+ ipc_port_t old_object, old_name;
+ pager_request_t old_control;
+
+ if (object == VM_OBJECT_NULL)
+ return KERN_SUCCESS;
+
+ /*
+ * Remove the port associations immediately.
+ *
+ * This will prevent the memory manager from further
+ * meddling. [If it wanted to flush data or make
+ * other changes, it should have done so before performing
+ * the destroy call.]
+ */
+
+ vm_object_cache_lock();
+ vm_object_lock(object);
+ vm_object_remove(object);
+ object->can_persist = FALSE;
+ vm_object_cache_unlock();
+
+ /*
+ * Rip out the ports from the vm_object now... this
+ * will prevent new memory_object calls from succeeding.
+ */
+
+ old_object = object->pager;
+ object->pager = IP_NULL;
+
+ old_control = object->pager_request;
+ object->pager_request = PAGER_REQUEST_NULL;
+
+ old_name = object->pager_name;
+ object->pager_name = IP_NULL;
+
+
+ /*
+ * Wait for existing paging activity (that might
+ * have the old ports) to subside.
+ */
+
+ vm_object_paging_wait(object, FALSE);
+ vm_object_unlock(object);
+
+ /*
+ * Shut down the ports now.
+ *
+ * [Paging operations may be proceeding concurrently --
+ * they'll get the null values established above.]
+ */
+
+ if (old_object != IP_NULL) {
+ /* consumes our rights for object, control, name */
+ memory_object_release(old_object, old_control,
+ old_name);
+ } else if (old_name != IP_NULL) {
+ /* consumes our right for name */
+ ipc_port_dealloc_kernel(old_name);
+ }
+
+ /*
+ * Lose the reference that was donated for this routine
+ */
+
+ vm_object_deallocate(object);
+
+ return KERN_SUCCESS;
+}
+
+/*
+ * Routine: vm_object_pmap_protect
+ *
+ * Purpose:
+ * Reduces the permission for all physical
+ * pages in the specified object range.
+ *
+ * If removing write permission only, it is
+ * sufficient to protect only the pages in
+ * the top-level object; only those pages may
+ * have write permission.
+ *
+ * If removing all access, we must follow the
+ * shadow chain from the top-level object to
+ * remove access to all pages in shadowed objects.
+ *
+ * The object must *not* be locked. The object must
+ * be temporary/internal.
+ *
+ * If pmap is not NULL, this routine assumes that
+ * the only mappings for the pages are in that
+ * pmap.
+ */
+boolean_t vm_object_pmap_protect_by_page = FALSE;
+
+void vm_object_pmap_protect(
+ vm_object_t object,
+ vm_offset_t offset,
+ vm_size_t size,
+ pmap_t pmap,
+ vm_offset_t pmap_start,
+ vm_prot_t prot)
+{
+ if (object == VM_OBJECT_NULL)
+ return;
+
+ vm_object_lock(object);
+
+ assert(object->temporary && object->internal);
+
+ while (TRUE) {
+ if (object->resident_page_count > atop(size) / 2 &&
+ pmap != PMAP_NULL) {
+ vm_object_unlock(object);
+ pmap_protect(pmap, pmap_start, pmap_start + size, prot);
+ return;
+ }
+
+ {
+ vm_page_t p;
+ vm_offset_t end;
+
+ end = offset + size;
+
+ queue_iterate(&object->memq, p, vm_page_t, listq) {
+ if (!p->fictitious &&
+ (offset <= p->offset) &&
+ (p->offset < end)) {
+ if ((pmap == PMAP_NULL) ||
+ vm_object_pmap_protect_by_page) {
+ pmap_page_protect(p->phys_addr,
+ prot & ~p->page_lock);
+ } else {
+ vm_offset_t start =
+ pmap_start +
+ (p->offset - offset);
+
+ pmap_protect(pmap,
+ start,
+ start + PAGE_SIZE,
+ prot);
+ }
+ }
+ }
+ }
+
+ if (prot == VM_PROT_NONE) {
+ /*
+ * Must follow shadow chain to remove access
+ * to pages in shadowed objects.
+ */ + vm_object_t next_object; + + next_object = object->shadow; + if (next_object != VM_OBJECT_NULL) { + offset += object->shadow_offset; + vm_object_lock(next_object); + vm_object_unlock(object); + object = next_object; + } + else { + /* + * End of chain - we are done. + */ + break; + } + } + else { + /* + * Pages in shadowed objects may never have + * write permission - we may stop here. + */ + break; + } + } + + vm_object_unlock(object); +} + +/* + * vm_object_pmap_remove: + * + * Removes all physical pages in the specified + * object range from all physical maps. + * + * The object must *not* be locked. + */ +void vm_object_pmap_remove( + vm_object_t object, + vm_offset_t start, + vm_offset_t end) +{ + vm_page_t p; + + if (object == VM_OBJECT_NULL) + return; + + vm_object_lock(object); + queue_iterate(&object->memq, p, vm_page_t, listq) { + if (!p->fictitious && + (start <= p->offset) && + (p->offset < end)) + pmap_page_protect(p->phys_addr, VM_PROT_NONE); + } + vm_object_unlock(object); +} + +/* + * Routine: vm_object_copy_slowly + * + * Description: + * Copy the specified range of the source + * virtual memory object without using + * protection-based optimizations (such + * as copy-on-write). The pages in the + * region are actually copied. + * + * In/out conditions: + * The caller must hold a reference and a lock + * for the source virtual memory object. The source + * object will be returned *unlocked*. + * + * Results: + * If the copy is completed successfully, KERN_SUCCESS is + * returned. If the caller asserted the interruptible + * argument, and an interruption occurred while waiting + * for a user-generated event, MACH_SEND_INTERRUPTED is + * returned. Other values may be returned to indicate + * hard errors during the copy operation. + * + * A new virtual memory object is returned in a + * parameter (_result_object). The contents of this + * new object, starting at a zero offset, are a copy + * of the source memory region. In the event of + * an error, this parameter will contain the value + * VM_OBJECT_NULL. + */ +kern_return_t vm_object_copy_slowly( + vm_object_t src_object, + vm_offset_t src_offset, + vm_size_t size, + boolean_t interruptible, + vm_object_t *_result_object) /* OUT */ +{ + vm_object_t new_object; + vm_offset_t new_offset; + + if (size == 0) { + vm_object_unlock(src_object); + *_result_object = VM_OBJECT_NULL; + return KERN_INVALID_ARGUMENT; + } + + /* + * Prevent destruction of the source object while we copy. + */ + + assert(src_object->ref_count > 0); + src_object->ref_count++; + vm_object_unlock(src_object); + + /* + * Create a new object to hold the copied pages. + * A few notes: + * We fill the new object starting at offset 0, + * regardless of the input offset. + * We don't bother to lock the new object within + * this routine, since we have the only reference. + */ + + new_object = vm_object_allocate(size); + new_offset = 0; + + assert(size == trunc_page(size)); /* Will the loop terminate? 
*/ + + for ( ; + size != 0 ; + src_offset += PAGE_SIZE, new_offset += PAGE_SIZE, size -= PAGE_SIZE + ) { + vm_page_t new_page; + vm_fault_return_t result; + + while ((new_page = vm_page_alloc(new_object, new_offset)) + == VM_PAGE_NULL) { + VM_PAGE_WAIT((void (*)()) 0); + } + + do { + vm_prot_t prot = VM_PROT_READ; + vm_page_t _result_page; + vm_page_t top_page; + vm_page_t result_page; + + vm_object_lock(src_object); + src_object->paging_in_progress++; + + result = vm_fault_page(src_object, src_offset, + VM_PROT_READ, FALSE, interruptible, + &prot, &_result_page, &top_page, + FALSE, (void (*)()) 0); + + switch(result) { + case VM_FAULT_SUCCESS: + result_page = _result_page; + + /* + * We don't need to hold the object + * lock -- the busy page will be enough. + * [We don't care about picking up any + * new modifications.] + * + * Copy the page to the new object. + * + * POLICY DECISION: + * If result_page is clean, + * we could steal it instead + * of copying. + */ + + vm_object_unlock(result_page->object); + vm_page_copy(result_page, new_page); + + /* + * Let go of both pages (make them + * not busy, perform wakeup, activate). + */ + + new_page->busy = FALSE; + new_page->dirty = TRUE; + vm_object_lock(result_page->object); + PAGE_WAKEUP_DONE(result_page); + + vm_page_lock_queues(); + if (!result_page->active && + !result_page->inactive) + vm_page_activate(result_page); + vm_page_activate(new_page); + vm_page_unlock_queues(); + + /* + * Release paging references and + * top-level placeholder page, if any. + */ + + vm_fault_cleanup(result_page->object, + top_page); + + break; + + case VM_FAULT_RETRY: + break; + + case VM_FAULT_MEMORY_SHORTAGE: + VM_PAGE_WAIT((void (*)()) 0); + break; + + case VM_FAULT_FICTITIOUS_SHORTAGE: + vm_page_more_fictitious(); + break; + + case VM_FAULT_INTERRUPTED: + vm_page_free(new_page); + vm_object_deallocate(new_object); + vm_object_deallocate(src_object); + *_result_object = VM_OBJECT_NULL; + return MACH_SEND_INTERRUPTED; + + case VM_FAULT_MEMORY_ERROR: + /* + * A policy choice: + * (a) ignore pages that we can't + * copy + * (b) return the null object if + * any page fails [chosen] + */ + + vm_page_free(new_page); + vm_object_deallocate(new_object); + vm_object_deallocate(src_object); + *_result_object = VM_OBJECT_NULL; + return KERN_MEMORY_ERROR; + } + } while (result != VM_FAULT_SUCCESS); + } + + /* + * Lose the extra reference, and return our object. + */ + + vm_object_deallocate(src_object); + *_result_object = new_object; + return KERN_SUCCESS; +} + +/* + * Routine: vm_object_copy_temporary + * + * Purpose: + * Copy the specified range of the source virtual + * memory object, if it can be done without blocking. + * + * Results: + * If the copy is successful, the copy is returned in + * the arguments; otherwise, the arguments are not + * affected. + * + * In/out conditions: + * The object should be unlocked on entry and exit. + */ + +boolean_t vm_object_copy_temporary( + vm_object_t *_object, /* INOUT */ + vm_offset_t *_offset, /* INOUT */ + boolean_t *_src_needs_copy, /* OUT */ + boolean_t *_dst_needs_copy) /* OUT */ +{ + vm_object_t object = *_object; + + if (object == VM_OBJECT_NULL) { + *_src_needs_copy = FALSE; + *_dst_needs_copy = FALSE; + return TRUE; + } + + /* + * If the object is temporary, we can perform + * a symmetric copy-on-write without asking. + */ + + vm_object_lock(object); + if (object->temporary) { + + /* + * Shared objects use delayed copy + */ + if (object->use_shared_copy) { + + /* + * Asymmetric copy strategy. 
Destination + * must be copied (to allow copy object reuse). + * Source is unaffected. + */ + vm_object_unlock(object); + object = vm_object_copy_delayed(object); + *_object = object; + *_src_needs_copy = FALSE; + *_dst_needs_copy = TRUE; + return TRUE; + } + + /* + * Make another reference to the object. + * + * Leave object/offset unchanged. + */ + + assert(object->ref_count > 0); + object->ref_count++; + object->shadowed = TRUE; + vm_object_unlock(object); + + /* + * Both source and destination must make + * shadows, and the source must be made + * read-only if not already. + */ + + *_src_needs_copy = TRUE; + *_dst_needs_copy = TRUE; + return TRUE; + } + + if (object->pager_ready && + (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY)) { + /* XXX Do something intelligent (see temporary code above) */ + } + vm_object_unlock(object); + + return FALSE; +} + +/* + * Routine: vm_object_copy_call [internal] + * + * Description: + * Copy the specified (src_offset, size) portion + * of the source object (src_object), using the + * user-managed copy algorithm. + * + * In/out conditions: + * The source object must be locked on entry. It + * will be *unlocked* on exit. + * + * Results: + * If the copy is successful, KERN_SUCCESS is returned. + * This routine is interruptible; if a wait for + * a user-generated event is interrupted, MACH_SEND_INTERRUPTED + * is returned. Other return values indicate hard errors + * in creating the user-managed memory object for the copy. + * + * A new object that represents the copied virtual + * memory is returned in a parameter (*_result_object). + * If the return value indicates an error, this parameter + * is not valid. + */ +static kern_return_t vm_object_copy_call( + vm_object_t src_object, + vm_offset_t src_offset, + vm_size_t size, + vm_object_t *_result_object) /* OUT */ +{ + vm_offset_t src_end = src_offset + size; + ipc_port_t new_memory_object; + vm_object_t new_object; + vm_page_t p; + + /* + * Create a memory object port to be associated + * with this new vm_object. + * + * Since the kernel has the only rights to this + * port, we need not hold the cache lock. + * + * Since we have the only object reference, we + * need not be worried about collapse operations. + * + */ + + new_memory_object = ipc_port_alloc_kernel(); + if (new_memory_object == IP_NULL) + return KERN_RESOURCE_SHORTAGE; + + /* + * Set the backing object for the new + * temporary object. + */ + + assert(src_object->ref_count > 0); + src_object->ref_count++; + vm_object_paging_begin(src_object); + vm_object_unlock(src_object); + + /* we hold a naked receive right for new_memory_object */ + (void) ipc_port_make_send(new_memory_object); + /* now we also hold a naked send right for new_memory_object */ + + /* + * Let the memory manager know that a copy operation + * is in progress. Note that we're using the old + * memory object's ports (for which we're holding + * a paging reference)... the memory manager cannot + * yet affect the new memory object. + */ + + (void) memory_object_copy(src_object->pager, + src_object->pager_request, + src_offset, size, + new_memory_object); + /* no longer hold the naked receive right for new_memory_object */ + + vm_object_lock(src_object); + vm_object_paging_end(src_object); + + /* + * Remove write access from all of the pages of + * the old memory object that we can. 
+ */ + + queue_iterate(&src_object->memq, p, vm_page_t, listq) { + if (!p->fictitious && + (src_offset <= p->offset) && + (p->offset < src_end) && + !(p->page_lock & VM_PROT_WRITE)) { + p->page_lock |= VM_PROT_WRITE; + pmap_page_protect(p->phys_addr, VM_PROT_ALL & ~p->page_lock); + } + } + + vm_object_unlock(src_object); + + /* + * Initialize the rest of the paging stuff + */ + + new_object = vm_object_enter(new_memory_object, size, FALSE); + assert(new_object); + new_object->shadow = src_object; + new_object->shadow_offset = src_offset; + + /* + * Drop the reference for new_memory_object taken above. + */ + + ipc_port_release_send(new_memory_object); + /* no longer hold the naked send right for new_memory_object */ + + *_result_object = new_object; + return KERN_SUCCESS; +} + +/* + * Routine: vm_object_copy_delayed [internal] + * + * Description: + * Copy the specified virtual memory object, using + * the asymmetric copy-on-write algorithm. + * + * In/out conditions: + * The object must be unlocked on entry. + * + * This routine will not block waiting for user-generated + * events. It is not interruptible. + */ +vm_object_t vm_object_copy_delayed( + vm_object_t src_object) +{ + vm_object_t new_copy; + vm_object_t old_copy; + vm_page_t p; + + /* + * The user-level memory manager wants to see + * all of the changes to this object, but it + * has promised not to make any changes on its own. + * + * Perform an asymmetric copy-on-write, as follows: + * Create a new object, called a "copy object" + * to hold pages modified by the new mapping + * (i.e., the copy, not the original mapping). + * Record the original object as the backing + * object for the copy object. If the + * original mapping does not change a page, + * it may be used read-only by the copy. + * Record the copy object in the original + * object. When the original mapping causes + * a page to be modified, it must be copied + * to a new page that is "pushed" to the + * copy object. + * Mark the new mapping (the copy object) + * copy-on-write. This makes the copy + * object itself read-only, allowing it + * to be reused if the original mapping + * makes no changes, and simplifying the + * synchronization required in the "push" + * operation described above. + * + * The copy-on-write is said to be asymmetric because + * the original object is *not* marked copy-on-write. + * A copied page is pushed to the copy object, regardless + * which party attempted to modify the page. + * + * Repeated asymmetric copy operations may be done. + * If the original object has not been changed since + * the last copy, its copy object can be reused. + * Otherwise, a new copy object can be inserted + * between the original object and its previous + * copy object. Since any copy object is read-only, + * this cannot affect the contents of the previous copy + * object. + * + * Note that a copy object is higher in the object + * tree than the original object; therefore, use of + * the copy object recorded in the original object + * must be done carefully, to avoid deadlock. + */ + + /* + * Allocate a new copy object before locking, even + * though we may not need it later. + */ + + new_copy = vm_object_allocate(src_object->size); + + vm_object_lock(src_object); + + /* + * See whether we can reuse the result of a previous + * copy operation. 
+ */ + Retry: + old_copy = src_object->copy; + if (old_copy != VM_OBJECT_NULL) { + /* + * Try to get the locks (out of order) + */ + if (!vm_object_lock_try(old_copy)) { + vm_object_unlock(src_object); + + simple_lock_pause(); /* wait a bit */ + + vm_object_lock(src_object); + goto Retry; + } + + /* + * Determine whether the old copy object has + * been modified. + */ + + if (old_copy->resident_page_count == 0 && + !old_copy->pager_created) { + /* + * It has not been modified. + * + * Return another reference to + * the existing copy-object. + */ + assert(old_copy->ref_count > 0); + old_copy->ref_count++; + vm_object_unlock(old_copy); + vm_object_unlock(src_object); + + vm_object_deallocate(new_copy); + + return old_copy; + } + + /* + * The copy-object is always made large enough to + * completely shadow the original object, since + * it may have several users who want to shadow + * the original object at different points. + */ + + assert((old_copy->shadow == src_object) && + (old_copy->shadow_offset == (vm_offset_t) 0)); + + /* + * Make the old copy-object shadow the new one. + * It will receive no more pages from the original + * object. + */ + + src_object->ref_count--; /* remove ref. from old_copy */ + assert(src_object->ref_count > 0); + old_copy->shadow = new_copy; + assert(new_copy->ref_count > 0); + new_copy->ref_count++; + vm_object_unlock(old_copy); /* done with old_copy */ + } + + /* + * Point the new copy at the existing object. + */ + + new_copy->shadow = src_object; + new_copy->shadow_offset = 0; + new_copy->shadowed = TRUE; /* caller must set needs_copy */ + assert(src_object->ref_count > 0); + src_object->ref_count++; + src_object->copy = new_copy; + + /* + * Mark all pages of the existing object copy-on-write. + * This object may have a shadow chain below it, but + * those pages will already be marked copy-on-write. + */ + + queue_iterate(&src_object->memq, p, vm_page_t, listq) { + if (!p->fictitious) + pmap_page_protect(p->phys_addr, + (VM_PROT_ALL & ~VM_PROT_WRITE & + ~p->page_lock)); + } + + vm_object_unlock(src_object); + + return new_copy; +} + +/* + * Routine: vm_object_copy_strategically + * + * Purpose: + * Perform a copy according to the source object's + * declared strategy. This operation may block, + * and may be interrupted. + */ +kern_return_t vm_object_copy_strategically( + vm_object_t src_object, + vm_offset_t src_offset, + vm_size_t size, + vm_object_t *dst_object, /* OUT */ + vm_offset_t *dst_offset, /* OUT */ + boolean_t *dst_needs_copy) /* OUT */ +{ + kern_return_t result = KERN_SUCCESS; /* to quiet gcc warnings */ + boolean_t interruptible = TRUE; /* XXX */ + + assert(src_object != VM_OBJECT_NULL); + + vm_object_lock(src_object); + + /* XXX assert(!src_object->temporary); JSB FIXME */ + + /* + * The copy strategy is only valid if the memory manager + * is "ready". + */ + + while (!src_object->pager_ready) { + vm_object_wait( src_object, + VM_OBJECT_EVENT_PAGER_READY, + interruptible); + if (interruptible && + (current_thread()->wait_result != THREAD_AWAKENED)) { + *dst_object = VM_OBJECT_NULL; + *dst_offset = 0; + *dst_needs_copy = FALSE; + return MACH_SEND_INTERRUPTED; + } + vm_object_lock(src_object); + } + + /* + * The object may be temporary (even though it is external). + * If so, do a symmetric copy. + */ + + if (src_object->temporary) { + /* + * XXX + * This does not count as intelligent! + * This buys us the object->temporary optimizations, + * but we aren't using a symmetric copy, + * which may confuse the vm code. 
The correct thing + * to do here is to figure out what to call to get + * a temporary shadowing set up. + */ + src_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; + } + + /* + * The object is permanent. Use the appropriate copy strategy. + */ + + switch (src_object->copy_strategy) { + case MEMORY_OBJECT_COPY_NONE: + if ((result = vm_object_copy_slowly( + src_object, + src_offset, + size, + interruptible, + dst_object)) + == KERN_SUCCESS) { + *dst_offset = 0; + *dst_needs_copy = FALSE; + } + break; + + case MEMORY_OBJECT_COPY_CALL: + if ((result = vm_object_copy_call( + src_object, + src_offset, + size, + dst_object)) + == KERN_SUCCESS) { + *dst_offset = 0; + *dst_needs_copy = FALSE; + } + break; + + case MEMORY_OBJECT_COPY_DELAY: + vm_object_unlock(src_object); + *dst_object = vm_object_copy_delayed(src_object); + *dst_offset = src_offset; + *dst_needs_copy = TRUE; + + result = KERN_SUCCESS; + break; + } + + return result; +} + +/* + * vm_object_shadow: + * + * Create a new object which is backed by the + * specified existing object range. The source + * object reference is deallocated. + * + * The new object and offset into that object + * are returned in the source parameters. + */ + +void vm_object_shadow( + vm_object_t *object, /* IN/OUT */ + vm_offset_t *offset, /* IN/OUT */ + vm_size_t length) +{ + vm_object_t source; + vm_object_t result; + + source = *object; + + /* + * Allocate a new object with the given length + */ + + if ((result = vm_object_allocate(length)) == VM_OBJECT_NULL) + panic("vm_object_shadow: no object for shadowing"); + + /* + * The new object shadows the source object, adding + * a reference to it. Our caller changes his reference + * to point to the new object, removing a reference to + * the source object. Net result: no change of reference + * count. + */ + result->shadow = source; + + /* + * Store the offset into the source object, + * and fix up the offset into the new object. + */ + + result->shadow_offset = *offset; + + /* + * Return the new things + */ + + *offset = 0; + *object = result; +} + +/* + * The relationship between vm_object structures and + * the memory_object ports requires careful synchronization. + * + * All associations are created by vm_object_enter. All three + * port fields are filled in, as follows: + * pager: the memory_object port itself, supplied by + * the user requesting a mapping (or the kernel, + * when initializing internal objects); the + * kernel simulates holding send rights by keeping + * a port reference; + * pager_request: + * pager_name: + * the memory object control and name ports, + * created by the kernel; the kernel holds + * receive (and ownership) rights to these + * ports, but no other references. + * All of the ports are referenced by their global names. + * + * When initialization is complete, the "initialized" field + * is asserted. Other mappings using a particular memory object, + * and any references to the vm_object gained through the + * port association must wait for this initialization to occur. + * + * In order to allow the memory manager to set attributes before + * requests (notably virtual copy operations, but also data or + * unlock requests) are made, a "ready" attribute is made available. + * Only the memory manager may affect the value of this attribute. + * Its value does not affect critical kernel functions, such as + * internal object initialization or destruction. 
[Furthermore, + * memory objects created by the kernel are assumed to be ready + * immediately; the default memory manager need not explicitly + * set the "ready" attribute.] + * + * [Both the "initialized" and "ready" attribute wait conditions + * use the "pager" field as the wait event.] + * + * The port associations can be broken down by any of the + * following routines: + * vm_object_terminate: + * No references to the vm_object remain, and + * the object cannot (or will not) be cached. + * This is the normal case, and is done even + * though one of the other cases has already been + * done. + * vm_object_destroy: + * The memory_object port has been destroyed, + * meaning that the kernel cannot flush dirty + * pages or request new data or unlock existing + * data. + * memory_object_destroy: + * The memory manager has requested that the + * kernel relinquish rights to the memory object + * port. [The memory manager may not want to + * destroy the port, but may wish to refuse or + * tear down existing memory mappings.] + * Each routine that breaks an association must break all of + * them at once. At some later time, that routine must clear + * the vm_object port fields and release the port rights. + * [Furthermore, each routine must cope with the simultaneous + * or previous operations of the others.] + * + * In addition to the lock on the object, the vm_object_cache_lock + * governs the port associations. References gained through the + * port association require use of the cache lock. + * + * Because the port fields may be cleared spontaneously, they + * cannot be used to determine whether a memory object has + * ever been associated with a particular vm_object. [This + * knowledge is important to the shadow object mechanism.] + * For this reason, an additional "created" attribute is + * provided. + * + * During various paging operations, the port values found in the + * vm_object must be valid. To prevent these port rights from being + * released, and to prevent the port associations from changing + * (other than being removed, i.e., made null), routines may use + * the vm_object_paging_begin/end routines [actually, macros]. + * The implementation uses the "paging_in_progress" and "wanted" fields. + * [Operations that alter the validity of the port values include the + * termination routines and vm_object_collapse.] 
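+ *
+ *	(Illustrative sketch, not lifted verbatim from any one routine:
+ *	a paging operation that needs the port values to stay valid
+ *	brackets its use of them as follows:)
+ *
+ *		vm_object_lock(object);
+ *		vm_object_paging_begin(object);
+ *		vm_object_unlock(object);
+ *		... use object->pager and object->pager_request ...
+ *		vm_object_lock(object);
+ *		vm_object_paging_end(object);
+ *		vm_object_unlock(object);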
+ */ + +vm_object_t vm_object_lookup( + ipc_port_t port) +{ + vm_object_t object = VM_OBJECT_NULL; + + if (IP_VALID(port)) { + ip_lock(port); + if (ip_active(port) && + (ip_kotype(port) == IKOT_PAGING_REQUEST)) { + vm_object_cache_lock(); + object = (vm_object_t) port->ip_kobject; + vm_object_lock(object); + + assert(object->alive); + + if (object->ref_count == 0) + vm_object_cache_remove(object); + + object->ref_count++; + vm_object_unlock(object); + vm_object_cache_unlock(); + } + ip_unlock(port); + } + + return object; +} + +vm_object_t vm_object_lookup_name( + ipc_port_t port) +{ + vm_object_t object = VM_OBJECT_NULL; + + if (IP_VALID(port)) { + ip_lock(port); + if (ip_active(port) && + (ip_kotype(port) == IKOT_PAGING_NAME)) { + vm_object_cache_lock(); + object = (vm_object_t) port->ip_kobject; + vm_object_lock(object); + + assert(object->alive); + + if (object->ref_count == 0) + vm_object_cache_remove(object); + + object->ref_count++; + vm_object_unlock(object); + vm_object_cache_unlock(); + } + ip_unlock(port); + } + + return object; +} + +void vm_object_destroy( + ipc_port_t pager) +{ + vm_object_t object; + pager_request_t old_request; + ipc_port_t old_name; + + /* + * Perform essentially the same operations as in vm_object_lookup, + * except that this time we look up based on the memory_object + * port, not the control port. + */ + vm_object_cache_lock(); + if (ip_kotype(pager) != IKOT_PAGER) { + vm_object_cache_unlock(); + return; + } + + object = (vm_object_t) pager->ip_kobject; + vm_object_lock(object); + if (object->ref_count == 0) + vm_object_cache_remove(object); + object->ref_count++; + + object->can_persist = FALSE; + + assert(object->pager == pager); + + /* + * Remove the port associations. + * + * Note that the memory_object itself is dead, so + * we don't bother with it. + */ + + object->pager = IP_NULL; + vm_object_remove(object); + + old_request = object->pager_request; + object->pager_request = PAGER_REQUEST_NULL; + + old_name = object->pager_name; + object->pager_name = IP_NULL; + + vm_object_unlock(object); + vm_object_cache_unlock(); + + /* + * Clean up the port references. Note that there's no + * point in trying the memory_object_terminate call + * because the memory_object itself is dead. + */ + + ipc_port_release_send(pager); + if (old_request != IP_NULL) + ipc_port_dealloc_kernel(old_request); + if (old_name != IP_NULL) + ipc_port_dealloc_kernel(old_name); + + /* + * Restart pending page requests + */ + + vm_object_abort_activity(object); + + /* + * Lose the object reference. + */ + + vm_object_deallocate(object); +} + +/* + * Routine: vm_object_enter + * Purpose: + * Find a VM object corresponding to the given + * pager; if no such object exists, create one, + * and initialize the pager. + */ +vm_object_t vm_object_enter( + ipc_port_t pager, + vm_size_t size, + boolean_t internal) +{ + vm_object_t object; + vm_object_t new_object; + boolean_t must_init; + ipc_kobject_type_t po; + +restart: + if (!IP_VALID(pager)) + return vm_object_allocate(size); + + new_object = VM_OBJECT_NULL; + must_init = FALSE; + + /* + * Look for an object associated with this port. + */ + + vm_object_cache_lock(); + for (;;) { + po = ip_kotype(pager); + + /* + * If a previous object is being terminated, + * we must wait for the termination message + * to be queued. + * + * We set kobject to a non-null value to let the + * terminator know that someone is waiting. + * Among the possibilities is that the port + * could die while we're waiting. 
Must restart + * instead of continuing the loop. + */ + + if (po == IKOT_PAGER_TERMINATING) { + pager->ip_kobject = (ipc_kobject_t) pager; + assert_wait((event_t) pager, FALSE); + vm_object_cache_unlock(); + thread_block((void (*)()) 0); + goto restart; + } + + /* + * Bail if there is already a kobject associated + * with the pager port. + */ + if (po != IKOT_NONE) { + break; + } + + /* + * We must unlock to create a new object; + * if we do so, we must try the lookup again. + */ + + if (new_object == VM_OBJECT_NULL) { + vm_object_cache_unlock(); + new_object = vm_object_allocate(size); + vm_object_cache_lock(); + } else { + /* + * Lookup failed twice, and we have something + * to insert; set the object. + */ + + ipc_kobject_set(pager, + (ipc_kobject_t) new_object, + IKOT_PAGER); + new_object = VM_OBJECT_NULL; + must_init = TRUE; + } + } + + if (internal) + must_init = TRUE; + + /* + * It's only good if it's a VM object! + */ + + object = (po == IKOT_PAGER) ? (vm_object_t) pager->ip_kobject + : VM_OBJECT_NULL; + + if ((object != VM_OBJECT_NULL) && !must_init) { + vm_object_lock(object); + if (object->ref_count == 0) + vm_object_cache_remove(object); + object->ref_count++; + vm_object_unlock(object); + + vm_stat.hits++; + } + assert((object == VM_OBJECT_NULL) || (object->ref_count > 0) || + ((object->paging_in_progress != 0) && internal)); + + vm_stat.lookups++; + + vm_object_cache_unlock(); + + /* + * If we raced to create a vm_object but lost, let's + * throw away ours. + */ + + if (new_object != VM_OBJECT_NULL) + vm_object_deallocate(new_object); + + if (object == VM_OBJECT_NULL) + return(object); + + if (must_init) { + /* + * Copy the naked send right we were given. + */ + + pager = ipc_port_copy_send(pager); + if (!IP_VALID(pager)) + panic("vm_object_enter: port died"); /* XXX */ + + object->pager_created = TRUE; + object->pager = pager; + + /* + * Allocate request port. + */ + + object->pager_request = ipc_port_alloc_kernel(); + if (object->pager_request == IP_NULL) + panic("vm_object_enter: pager request alloc"); + + ipc_kobject_set(object->pager_request, + (ipc_kobject_t) object, + IKOT_PAGING_REQUEST); + + /* + * Let the pager know we're using it. + */ + + if (internal) { + /* acquire a naked send right for the DMM */ + ipc_port_t DMM = memory_manager_default_reference(); + + /* mark the object internal */ + object->internal = TRUE; + assert(object->temporary); + + /* default-pager objects are ready immediately */ + object->pager_ready = TRUE; + + /* consumes the naked send right for DMM */ + (void) memory_object_create(DMM, + pager, + object->size, + object->pager_request, + object->pager_name, + PAGE_SIZE); + } else { + /* the object is external and not temporary */ + object->internal = FALSE; + object->temporary = FALSE; + + assert(object->resident_page_count == 0); + vm_object_external_count++; + + /* user pager objects are not ready until marked so */ + object->pager_ready = FALSE; + + (void) memory_object_init(pager, + object->pager_request, + object->pager_name, + PAGE_SIZE); + + } + + vm_object_lock(object); + object->pager_initialized = TRUE; + + vm_object_wakeup(object, VM_OBJECT_EVENT_INITIALIZED); + } else { + vm_object_lock(object); + } + /* + * [At this point, the object must be locked] + */ + + /* + * Wait for the work above to be done by the first + * thread to map this object. 
+ */ + + while (!object->pager_initialized) { + vm_object_wait( object, + VM_OBJECT_EVENT_INITIALIZED, + FALSE); + vm_object_lock(object); + } + vm_object_unlock(object); + + return object; +} + +/* + * Routine: vm_object_pager_create + * Purpose: + * Create a memory object for an internal object. + * In/out conditions: + * The object is locked on entry and exit; + * it may be unlocked within this call. + * Limitations: + * Only one thread may be performing a + * vm_object_pager_create on an object at + * a time. Presumably, only the pageout + * daemon will be using this routine. + */ +void vm_object_pager_create( + vm_object_t object) +{ + ipc_port_t pager; + + if (object->pager_created) { + /* + * Someone else got to it first... + * wait for them to finish initializing + */ + + while (!object->pager_initialized) { + vm_object_wait( object, + VM_OBJECT_EVENT_PAGER_READY, + FALSE); + vm_object_lock(object); + } + return; + } + + /* + * Indicate that a memory object has been assigned + * before dropping the lock, to prevent a race. + */ + + object->pager_created = TRUE; + + /* + * Prevent collapse or termination by + * holding a paging reference + */ + + vm_object_paging_begin(object); + vm_object_unlock(object); + +#if MACH_PAGEMAP + object->existence_info = vm_external_create( + object->size + + object->paging_offset); + assert((object->size + object->paging_offset) >= + object->size); +#endif /* MACH_PAGEMAP */ + + /* + * Create the pager, and associate with it + * this object. + * + * Note that we only make the port association + * so that vm_object_enter can properly look up + * the object to complete the initialization... + * we do not expect any user to ever map this + * object. + * + * Since the kernel has the only rights to the + * port, it's safe to install the association + * without holding the cache lock. + */ + + pager = ipc_port_alloc_kernel(); + if (pager == IP_NULL) + panic("vm_object_pager_create: allocate pager port"); + + (void) ipc_port_make_send(pager); + ipc_kobject_set(pager, (ipc_kobject_t) object, IKOT_PAGER); + + /* + * Initialize the rest of the paging stuff + */ + + if (vm_object_enter(pager, object->size, TRUE) != object) + panic("vm_object_pager_create: mismatch"); + + /* + * Drop the naked send right taken above. + */ + + ipc_port_release_send(pager); + + /* + * Release the paging reference + */ + + vm_object_lock(object); + vm_object_paging_end(object); +} + +/* + * Routine: vm_object_remove + * Purpose: + * Eliminate the pager/object association + * for this pager. + * Conditions: + * The object cache must be locked. + */ +void vm_object_remove( + vm_object_t object) +{ + ipc_port_t port; + + if ((port = object->pager) != IP_NULL) { + if (ip_kotype(port) == IKOT_PAGER) + ipc_kobject_set(port, IKO_NULL, + IKOT_PAGER_TERMINATING); + else if (ip_kotype(port) != IKOT_NONE) + panic("vm_object_remove: bad object port"); + } + if ((port = object->pager_request) != IP_NULL) { + if (ip_kotype(port) == IKOT_PAGING_REQUEST) + ipc_kobject_set(port, IKO_NULL, IKOT_NONE); + else if (ip_kotype(port) != IKOT_NONE) + panic("vm_object_remove: bad request port"); + } + if ((port = object->pager_name) != IP_NULL) { + if (ip_kotype(port) == IKOT_PAGING_NAME) + ipc_kobject_set(port, IKO_NULL, IKOT_NONE); + else if (ip_kotype(port) != IKOT_NONE) + panic("vm_object_remove: bad name port"); + } +} + +/* + * Global variables for vm_object_collapse(): + * + * Counts for normal collapses and bypasses. + * Debugging variables, to watch or disable collapse. 
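+ * (vm_object_collapse_debug levels, per the switch in
+ * vm_object_collapse below: 0 is silent, 1 reports only collapses
+ * that move pager ports, 2 reports every collapse, and values above
+ * 2 also drop into SoftDebugger.)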
+ */ +long object_collapses = 0; +long object_bypasses = 0; + +int vm_object_collapse_debug = 0; +boolean_t vm_object_collapse_allowed = TRUE; +boolean_t vm_object_collapse_bypass_allowed = TRUE; + +/* + * vm_object_collapse: + * + * Collapse an object with the object backing it. + * Pages in the backing object are moved into the + * parent, and the backing object is deallocated. + * + * Requires that the object be locked and the page + * queues be unlocked. May unlock/relock the object, + * so the caller should hold a reference for the object. + */ +void vm_object_collapse( + vm_object_t object) +{ + vm_object_t backing_object; + vm_offset_t backing_offset; + vm_size_t size; + vm_offset_t new_offset; + vm_page_t p, pp; + ipc_port_t old_name_port; + + if (!vm_object_collapse_allowed) + return; + + while (TRUE) { + /* + * Verify that the conditions are right for collapse: + * + * The object exists and no pages in it are currently + * being paged out (or have ever been paged out). + * + * This check is probably overkill -- if a memory + * object has not been created, the fault handler + * shouldn't release the object lock while paging + * is in progress or absent pages exist. + */ + if (object == VM_OBJECT_NULL || + object->pager_created || + object->paging_in_progress != 0 || + object->absent_count != 0) + return; + + /* + * There is a backing object, and + */ + + if ((backing_object = object->shadow) == VM_OBJECT_NULL) + return; + + vm_object_lock(backing_object); + /* + * ... + * The backing object is not read_only, + * and no pages in the backing object are + * currently being paged out. + * The backing object is internal. + * + * XXX It may be sufficient for the backing + * XXX object to be temporary. + */ + + if (!backing_object->internal || + backing_object->paging_in_progress != 0) { + vm_object_unlock(backing_object); + return; + } + + /* + * The backing object can't be a copy-object: + * the shadow_offset for the copy-object must stay + * as 0. Furthermore (for the 'we have all the + * pages' case), if we bypass backing_object and + * just shadow the next object in the chain, old + * pages from that object would then have to be copied + * BOTH into the (former) backing_object and into the + * parent object. + */ + if (backing_object->shadow != VM_OBJECT_NULL && + backing_object->shadow->copy != VM_OBJECT_NULL) { + vm_object_unlock(backing_object); + return; + } + + /* + * We know that we can either collapse the backing + * object (if the parent is the only reference to + * it) or (perhaps) remove the parent's reference + * to it. + */ + + backing_offset = object->shadow_offset; + size = object->size; + + /* + * If there is exactly one reference to the backing + * object, we can collapse it into the parent. + */ + + if (backing_object->ref_count == 1) { + if (!vm_object_cache_lock_try()) { + vm_object_unlock(backing_object); + return; + } + + /* + * We can collapse the backing object. + * + * Move all in-memory pages from backing_object + * to the parent. Pages that have been paged out + * will be overwritten by any of the parent's + * pages that shadow them. + */ + + while (!queue_empty(&backing_object->memq)) { + + p = (vm_page_t) + queue_first(&backing_object->memq); + + new_offset = (p->offset - backing_offset); + + assert(!p->busy || p->absent); + + /* + * If the parent has a page here, or if + * this page falls outside the parent, + * dispose of it. + * + * Otherwise, move it as planned. 
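+ * (Example: with backing_offset 0x2000 and a parent of size 0x4000,
+ * a backing page at offset 0x1000 lies below the window and is
+ * freed, while one at offset 0x3000 is renamed to offset 0x1000 in
+ * the parent, unless the parent already has a non-absent page there.)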
+ */ + + if (p->offset < backing_offset || + new_offset >= size) { + VM_PAGE_FREE(p); + } else { + pp = vm_page_lookup(object, new_offset); + if (pp != VM_PAGE_NULL && !pp->absent) { + /* + * Parent object has a real page. + * Throw away the backing object's + * page. + */ + VM_PAGE_FREE(p); + } + else { + assert(pp == VM_PAGE_NULL || ! + "vm_object_collapse: bad case"); + + /* + * Parent now has no page. + * Move the backing object's page up. + */ + vm_page_rename(p, object, new_offset); + } + } + } + + /* + * Move the pager from backing_object to object. + * + * XXX We're only using part of the paging space + * for keeps now... we ought to discard the + * unused portion. + */ + + switch (vm_object_collapse_debug) { + case 0: + break; + case 1: + if ((backing_object->pager == IP_NULL) && + (backing_object->pager_request == + PAGER_REQUEST_NULL)) + break; + /* Fall through to... */ + + default: + printf("vm_object_collapse: %p (pager %p, request %p) up to %p\n", + backing_object, backing_object->pager, backing_object->pager_request, + object); + if (vm_object_collapse_debug > 2) + SoftDebugger("vm_object_collapse"); + } + + object->pager = backing_object->pager; + if (object->pager != IP_NULL) + ipc_kobject_set(object->pager, + (ipc_kobject_t) object, + IKOT_PAGER); + object->pager_initialized = backing_object->pager_initialized; + object->pager_ready = backing_object->pager_ready; + object->pager_created = backing_object->pager_created; + + object->pager_request = backing_object->pager_request; + if (object->pager_request != IP_NULL) + ipc_kobject_set(object->pager_request, + (ipc_kobject_t) object, + IKOT_PAGING_REQUEST); + old_name_port = object->pager_name; + if (old_name_port != IP_NULL) + ipc_kobject_set(old_name_port, + IKO_NULL, IKOT_NONE); + object->pager_name = backing_object->pager_name; + if (object->pager_name != IP_NULL) + ipc_kobject_set(object->pager_name, + (ipc_kobject_t) object, + IKOT_PAGING_NAME); + + vm_object_cache_unlock(); + + /* + * If there is no pager, leave paging-offset alone. + */ + if (object->pager != IP_NULL) + object->paging_offset = + backing_object->paging_offset + + backing_offset; + +#if MACH_PAGEMAP + assert(object->existence_info == VM_EXTERNAL_NULL); + object->existence_info = backing_object->existence_info; +#endif /* MACH_PAGEMAP */ + + /* + * Object now shadows whatever backing_object did. + * Note that the reference to backing_object->shadow + * moves from within backing_object to within object. + */ + + object->shadow = backing_object->shadow; + object->shadow_offset += backing_object->shadow_offset; + if (object->shadow != VM_OBJECT_NULL && + object->shadow->copy != VM_OBJECT_NULL) { + panic("vm_object_collapse: we collapsed a copy-object!"); + } + /* + * Discard backing_object. + * + * Since the backing object has no pages, no + * pager left, and no object references within it, + * all that is necessary is to dispose of it. 
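+ * (The backing object is handed straight back to the object cache
+ * below; it never goes through vm_object_terminate, which is why its
+ * alive flag is cleared by hand first.)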
+ */ + + assert( + (backing_object->ref_count == 1) && + (backing_object->resident_page_count == 0) && + (backing_object->paging_in_progress == 0) + ); + + assert(backing_object->alive); + assert(!backing_object->cached); + backing_object->alive = FALSE; + vm_object_unlock(backing_object); + + vm_object_unlock(object); + if (old_name_port != IP_NULL) + ipc_port_dealloc_kernel(old_name_port); + kmem_cache_free(&vm_object_cache, (vm_offset_t) backing_object); + vm_object_lock(object); + + object_collapses++; + } + else { + if (!vm_object_collapse_bypass_allowed) { + vm_object_unlock(backing_object); + return; + } + + /* + * If all of the pages in the backing object are + * shadowed by the parent object, the parent + * object no longer has to shadow the backing + * object; it can shadow the next one in the + * chain. + * + * The backing object must not be paged out - we'd + * have to check all of the paged-out pages, as + * well. + */ + + if (backing_object->pager_created) { + vm_object_unlock(backing_object); + return; + } + + /* + * Should have a check for a 'small' number + * of pages here. + */ + + queue_iterate(&backing_object->memq, p, + vm_page_t, listq) + { + new_offset = (p->offset - backing_offset); + + /* + * If the parent has a page here, or if + * this page falls outside the parent, + * keep going. + * + * Otherwise, the backing_object must be + * left in the chain. + */ + + if (p->offset >= backing_offset && + new_offset <= size && + (pp = vm_page_lookup(object, new_offset)) + == VM_PAGE_NULL) { + /* + * Page still needed. + * Can't go any further. + */ + vm_object_unlock(backing_object); + return; + } + } + + /* + * Make the parent shadow the next object + * in the chain. Deallocating backing_object + * will not remove it, since its reference + * count is at least 2. + */ + + vm_object_reference(object->shadow = backing_object->shadow); + object->shadow_offset += backing_object->shadow_offset; + + /* + * Backing object might have had a copy pointer + * to us. If it did, clear it. + */ + if (backing_object->copy == object) + backing_object->copy = VM_OBJECT_NULL; + + /* + * Drop the reference count on backing_object. + * Since its ref_count was at least 2, it + * will not vanish; so we don't need to call + * vm_object_deallocate. + */ + backing_object->ref_count--; + assert(backing_object->ref_count > 0); + vm_object_unlock(backing_object); + + object_bypasses ++; + + } + + /* + * Try again with this object's new backing object. + */ + } +} + +/* + * Routine: vm_object_page_remove: [internal] + * Purpose: + * Removes all physical pages in the specified + * object range from the object's list of pages. + * + * In/out conditions: + * The object must be locked. + */ +unsigned int vm_object_page_remove_lookup = 0; +unsigned int vm_object_page_remove_iterate = 0; + +void vm_object_page_remove( + vm_object_t object, + vm_offset_t start, + vm_offset_t end) +{ + vm_page_t p, next; + + /* + * One and two page removals are most popular. + * The factor of 16 here is somewhat arbitrary. + * It balances vm_object_lookup vs iteration. 
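+ * (Example: for an object with 64 resident pages, removing 2 pages
+ * takes the lookup path since 2 < 64/16, while removing 16 pages
+ * walks the whole memq instead.)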
+ */ + + if (atop(end - start) < object->resident_page_count/16) { + vm_object_page_remove_lookup++; + + for (; start < end; start += PAGE_SIZE) { + p = vm_page_lookup(object, start); + if (p != VM_PAGE_NULL) { + if (!p->fictitious) + pmap_page_protect(p->phys_addr, + VM_PROT_NONE); + VM_PAGE_FREE(p); + } + } + } else { + vm_object_page_remove_iterate++; + + p = (vm_page_t) queue_first(&object->memq); + while (!queue_end(&object->memq, (queue_entry_t) p)) { + next = (vm_page_t) queue_next(&p->listq); + if ((start <= p->offset) && (p->offset < end)) { + if (!p->fictitious) + pmap_page_protect(p->phys_addr, + VM_PROT_NONE); + VM_PAGE_FREE(p); + } + p = next; + } + } +} + +/* + * Routine: vm_object_coalesce + * Purpose: + * Tries to coalesce two objects backing up adjoining + * regions of memory into a single object. + * + * NOTE: Only works at the moment if one of the objects + * is NULL or if the objects are the same - otherwise, + * which object do we lock first? + * Returns: + * TRUE if objects have been coalesced. + * FALSE the objects could not be coalesced. + * Parameters: + * prev_object First object to coalesce + * prev_offset Offset into prev_object + * next_object Second object into coalesce + * next_offset Offset into next_object + * + * prev_size Size of reference to prev_object + * next_size Size of reference to next_object + * + * new_object Resulting colesced object + * new_offset Offset into the resulting object + * Conditions: + * The objects must *not* be locked. + * + * If the objects are coalesced successfully, the caller's + * references for both objects are consumed, and the caller + * gains a reference for the new object. + */ + +boolean_t vm_object_coalesce( + vm_object_t prev_object, + vm_object_t next_object, + vm_offset_t prev_offset, + vm_offset_t next_offset, + vm_size_t prev_size, + vm_size_t next_size, + vm_object_t *new_object, /* OUT */ + vm_offset_t *new_offset) /* OUT */ +{ + vm_object_t object; + vm_size_t newsize; + + if (prev_object == next_object) { + /* + * If neither object actually exists, + * the offsets don't matter. + */ + if (prev_object == VM_OBJECT_NULL) { + *new_object = VM_OBJECT_NULL; + *new_offset = 0; + return TRUE; + } + + if (prev_offset + prev_size == next_offset) { + *new_object = prev_object; + *new_offset = prev_offset; + /* + * Deallocate one of the two references. + */ + vm_object_deallocate(prev_object); + return TRUE; + } + + return FALSE; + } + + if (next_object != VM_OBJECT_NULL) { + /* + * Don't know how to merge two different + * objects yet. + */ + if (prev_object != VM_OBJECT_NULL) + return FALSE; + + object = next_object; + } else { + object = prev_object; + } + + vm_object_lock(object); + + /* + * Try to collapse the object first + */ + vm_object_collapse(object); + + /* + * Can't coalesce if pages not mapped to + * the object may be in use anyway: + * . more than one reference + * . paged out + * . shadows another object + * . has a copy elsewhere + * . paging references (pages might be in page-list) + */ + + if ((object->ref_count > 1) || + object->pager_created || + object->used_for_pageout || + (object->shadow != VM_OBJECT_NULL) || + (object->copy != VM_OBJECT_NULL) || + (object->paging_in_progress != 0)) { + vm_object_unlock(object); + return FALSE; + } + + if (object == prev_object) { + /* + * Remove any pages that may still be in + * the object from a previous deallocation. 
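+ * (Example: if the previous mapping covered [0, 0x4000) of this
+ * object and next_size is 0x2000, offsets [0x4000, 0x6000) are
+ * purged of stale pages, object->size grows to 0x6000, and
+ * *new_offset is left at the original prev_offset of 0.)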
+ */ + vm_object_page_remove(object, + prev_offset + prev_size, + prev_offset + prev_size + next_size); + /* + * Extend the object if necessary. + */ + newsize = prev_offset + prev_size + next_size; + if (newsize > object->size) + object->size = newsize; + + *new_offset = prev_offset; + } else { + /* + * Check if we have enough space in the object + * offset space to insert the new mapping before + * the existing one. + */ + if (next_offset < prev_size) { + vm_object_unlock(object); + return FALSE; + } + /* + * Remove any pages that may still be in + * the object from a previous deallocation. + */ + vm_object_page_remove(object, + next_offset - prev_size, + next_offset); + + *new_offset = next_offset - prev_size; + } + + vm_object_unlock(object); + *new_object = object; + return TRUE; +} + +vm_object_t vm_object_request_object( + ipc_port_t p) +{ + return vm_object_lookup(p); +} + +/* + * Routine: vm_object_name + * Purpose: + * Returns a naked send right to the "name" port associated + * with this object. + */ +ipc_port_t vm_object_name( + vm_object_t object) +{ + ipc_port_t p; + + if (object == VM_OBJECT_NULL) + return IP_NULL; + + vm_object_lock(object); + + while (object->shadow != VM_OBJECT_NULL) { + vm_object_t new_object = object->shadow; + vm_object_lock(new_object); + vm_object_unlock(object); + object = new_object; + } + + p = object->pager_name; + if (p != IP_NULL) + p = ipc_port_make_send(p); + vm_object_unlock(object); + + return p; +} + +/* + * Attach a set of physical pages to an object, so that they can + * be mapped by mapping the object. Typically used to map IO memory. + * + * The mapping function and its private data are used to obtain the + * physical addresses for each page to be mapped. + */ +kern_return_t +vm_object_page_map( + vm_object_t object, + vm_offset_t offset, + vm_size_t size, + phys_addr_t (*map_fn)(void *, vm_offset_t), + void * map_fn_data) /* private to map_fn */ +{ + int num_pages; + int i; + vm_page_t m; + vm_page_t old_page; + phys_addr_t addr; + + num_pages = atop(size); + + for (i = 0; i < num_pages; i++, offset += PAGE_SIZE) { + + addr = (*map_fn)(map_fn_data, offset); + if (addr == vm_page_fictitious_addr) + return KERN_NO_ACCESS; + + while ((m = vm_page_grab_fictitious()) == VM_PAGE_NULL) + vm_page_more_fictitious(); + + vm_object_lock(object); + if ((old_page = vm_page_lookup(object, offset)) + != VM_PAGE_NULL) + { + VM_PAGE_FREE(old_page); + } + + vm_page_init(m); + m->phys_addr = addr; + m->private = TRUE; /* don`t free page */ + m->wire_count = 1; + vm_page_lock_queues(); + vm_page_insert(m, object, offset); + vm_page_unlock_queues(); + + PAGE_WAKEUP_DONE(m); + vm_object_unlock(object); + } + return KERN_SUCCESS; +} + + +#if MACH_KDB +#include <vm/vm_print.h> +#define printf kdbprintf + +boolean_t vm_object_print_pages = FALSE; + +/* + * vm_object_print: [ debug ] + */ +void vm_object_print( + vm_object_t object) +{ + vm_page_t p; + + int count; + + if (object == VM_OBJECT_NULL) + return; + + iprintf("Object 0x%X: size=0x%X, %d references", + (vm_offset_t) object, (vm_offset_t) object->size, + object->ref_count); + printf("\n"); + iprintf("%lu resident pages,", object->resident_page_count); + printf(" %d absent pages,", object->absent_count); + printf(" %d paging ops\n", object->paging_in_progress); + indent += 1; + iprintf("memory object=0x%X (offset=0x%X),", + (vm_offset_t) object->pager, (vm_offset_t) object->paging_offset); + printf("control=0x%X, name=0x%X\n", + (vm_offset_t) object->pager_request, (vm_offset_t) object->pager_name); + 
iprintf("%s%s", + object->pager_ready ? " ready" : "", + object->pager_created ? " created" : ""); + printf("%s,%s ", + object->pager_initialized ? "" : "uninitialized", + object->temporary ? "temporary" : "permanent"); + printf("%s%s,", + object->internal ? "internal" : "external", + object->can_persist ? " cacheable" : ""); + printf("copy_strategy=%d\n", (vm_offset_t)object->copy_strategy); + iprintf("shadow=0x%X (offset=0x%X),", + (vm_offset_t) object->shadow, (vm_offset_t) object->shadow_offset); + printf("copy=0x%X\n", (vm_offset_t) object->copy); + + indent += 1; + + if (vm_object_print_pages) { + count = 0; + p = (vm_page_t) queue_first(&object->memq); + while (!queue_end(&object->memq, (queue_entry_t) p)) { + if (count == 0) iprintf("memory:="); + else if (count == 4) {printf("\n"); iprintf(" ..."); count = 0;} + else printf(","); + count++; + + printf("(off=0x%X,page=0x%X)", p->offset, (vm_offset_t) p); + p = (vm_page_t) queue_next(&p->listq); + } + if (count != 0) + printf("\n"); + } + indent -= 2; +} + +#endif /* MACH_KDB */ diff --git a/vm/vm_object.h b/vm/vm_object.h new file mode 100644 index 0000000..9c17541 --- /dev/null +++ b/vm/vm_object.h @@ -0,0 +1,415 @@ +/* + * Mach Operating System + * Copyright (c) 1993-1987 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm_object.h + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * Date: 1985 + * + * Virtual memory object module definitions. + */ + +#ifndef _VM_VM_OBJECT_H_ +#define _VM_VM_OBJECT_H_ + +#include <sys/types.h> +#include <mach/kern_return.h> +#include <mach/boolean.h> +#include <mach/memory_object.h> +#include <mach/port.h> +#include <mach/vm_prot.h> +#include <mach/machine/vm_types.h> +#include <kern/queue.h> +#include <kern/lock.h> +#include <kern/assert.h> +#include <kern/debug.h> +#include <kern/macros.h> +#include <vm/pmap.h> +#include <ipc/ipc_types.h> + +#if MACH_PAGEMAP +#include <vm/vm_external.h> +#endif /* MACH_PAGEMAP */ + +typedef struct ipc_port * pager_request_t; +#define PAGER_REQUEST_NULL ((pager_request_t) 0) + +/* + * We use "struct ipc_port *" instead of "ipc_port_t" + * to avoid include file circularities. 
+ */ + +struct vm_object { + queue_head_t memq; /* Resident memory */ + decl_simple_lock_data(, Lock) /* Synchronization */ +#if VM_OBJECT_DEBUG + thread_t LockHolder; /* Thread holding Lock */ +#endif /* VM_OBJECT_DEBUG */ + vm_size_t size; /* Object size (only valid + * if internal) + */ + + int ref_count; /* Number of references */ + unsigned long resident_page_count; + /* number of resident pages */ + + struct vm_object *copy; /* Object that should receive + * a copy of my changed pages + */ + struct vm_object *shadow; /* My shadow */ + vm_offset_t shadow_offset; /* Offset into shadow */ + + struct ipc_port *pager; /* Where to get data */ + vm_offset_t paging_offset; /* Offset into memory object */ + pager_request_t pager_request; /* Where data comes back */ + struct ipc_port *pager_name; /* How to identify region */ + + memory_object_copy_strategy_t + copy_strategy; /* How to handle data copy */ + + unsigned int + absent_count; /* The number of pages that + * have been requested but + * not filled. That is, the + * number of pages for which + * the "absent" attribute is + * asserted. + */ + + unsigned int /* boolean_t array */ + all_wanted; /* Bit array of "want to be + * awakened" notations. See + * VM_OBJECT_EVENT_* items + * below + */ + + unsigned int + paging_in_progress:16, + /* The memory object ports are + * being used (e.g., for pagein + * or pageout) -- don't change any + * of these fields (i.e., don't + * collapse, destroy or terminate) + */ + /* boolean_t */ used_for_pageout:1,/* The object carries data sent to + * a memory manager, which signals + * it's done by releasing memory. + * This flag prevents coalescing so + * that unmapping memory immediately + * results in object termination. + */ + /* boolean_t */ pager_created:1,/* Has pager ever been created? */ + /* boolean_t */ pager_initialized:1,/* Are fields ready to use? */ + /* boolean_t */ pager_ready:1, /* Will manager take requests? */ + + /* boolean_t */ can_persist:1, /* The kernel may keep the data + * for this object (and rights to + * the memory object) after all + * address map references are + * deallocated? + */ + /* boolean_t */ internal:1, /* Created by the kernel (and + * therefore, managed by the + * default memory manger) + */ + /* boolean_t */ temporary:1, /* Permanent objects may be changed + * externally by the memory manager, + * and changes made in memory must + * be reflected back to the memory + * manager. Temporary objects lack + * both of these characteristics. + */ + /* boolean_t */ alive:1, /* Not yet terminated (debug) */ + /* boolean_t */ lock_in_progress : 1, + /* Is a multi-page lock + * request in progress? + */ + /* boolean_t */ lock_restart : 1, + /* Should lock request in + * progress restart search? + */ + /* boolean_t */ use_shared_copy : 1,/* Use shared (i.e., + * delayed) copy on write */ + /* boolean_t */ shadowed: 1, /* Shadow may exist */ + + /* boolean_t */ cached: 1; /* Object is cached */ + queue_chain_t cached_list; /* Attachment point for the list + * of objects cached as a result + * of their can_persist value + */ + vm_offset_t last_alloc; /* last allocation offset */ +#if MACH_PAGEMAP + vm_external_t existence_info; +#endif /* MACH_PAGEMAP */ +}; + +extern +vm_object_t kernel_object; /* the single kernel object */ + +/* + * Declare procedures that operate on VM objects. 
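+ *
+ * (vm_object_lookup, vm_object_lookup_name and vm_object_enter all
+ * return the object with a reference already added; callers are
+ * expected to drop it with vm_object_deallocate.)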
+ */ + +extern void vm_object_bootstrap(void); +extern void vm_object_init(void); +extern void vm_object_collect(vm_object_t); +extern void vm_object_terminate(vm_object_t); +extern vm_object_t vm_object_allocate(vm_size_t); +extern void vm_object_reference(vm_object_t); +extern void vm_object_deallocate(vm_object_t); +extern void vm_object_pmap_protect( + vm_object_t object, + vm_offset_t offset, + vm_size_t size, + pmap_t pmap, + vm_offset_t pmap_start, + vm_prot_t prot); +extern void vm_object_pmap_remove( + vm_object_t object, + vm_offset_t start, + vm_offset_t end); +extern void vm_object_page_remove( + vm_object_t object, + vm_offset_t start, + vm_offset_t end); +extern void vm_object_shadow( + vm_object_t *object, /* in/out */ + vm_offset_t *offset, /* in/out */ + vm_size_t length); +extern void vm_object_collapse(vm_object_t); +extern vm_object_t vm_object_lookup(struct ipc_port *); +extern vm_object_t vm_object_lookup_name(struct ipc_port *); +extern struct ipc_port *vm_object_name(vm_object_t); +extern void vm_object_remove(vm_object_t); + +extern boolean_t vm_object_copy_temporary( + vm_object_t *_object, /* in/out */ + vm_offset_t *_offset, /* in/out */ + boolean_t *_src_needs_copy, /* out */ + boolean_t *_dst_needs_copy); /* out */ +extern kern_return_t vm_object_copy_strategically( + vm_object_t src_object, + vm_offset_t src_offset, + vm_size_t size, + vm_object_t *dst_object, /* out */ + vm_offset_t *dst_offset, /* out */ + boolean_t *dst_needs_copy); /* out */ +extern kern_return_t vm_object_copy_slowly( + vm_object_t src_object, + vm_offset_t src_offset, + vm_size_t size, + boolean_t interruptible, + vm_object_t *_result_object); /* out */ + +extern vm_object_t vm_object_enter( + struct ipc_port *pager, + vm_size_t size, + boolean_t internal); +extern void vm_object_pager_create( + vm_object_t object); +extern void vm_object_destroy( + struct ipc_port *pager); + +extern kern_return_t vm_object_page_map( + vm_object_t, + vm_offset_t, + vm_size_t, + phys_addr_t (*)(void *, vm_offset_t), + void *); + +extern vm_object_t vm_object_request_object(struct ipc_port *); + +extern boolean_t vm_object_coalesce( + vm_object_t prev_object, + vm_object_t next_object, + vm_offset_t prev_offset, + vm_offset_t next_offset, + vm_size_t prev_size, + vm_size_t next_size, + vm_object_t *new_object, /* OUT */ + vm_offset_t *new_offset); /* OUT */ + +extern void vm_object_pager_wakeup(ipc_port_t pager); + +void memory_object_release( + ipc_port_t pager, + pager_request_t pager_request, + ipc_port_t pager_name); + +void vm_object_deactivate_pages(vm_object_t); + +vm_object_t vm_object_copy_delayed( + vm_object_t src_object); + +/* + * Event waiting handling + */ + +#define VM_OBJECT_EVENT_INITIALIZED 0 +#define VM_OBJECT_EVENT_PAGER_READY 1 +#define VM_OBJECT_EVENT_PAGING_IN_PROGRESS 2 +#define VM_OBJECT_EVENT_ABSENT_COUNT 3 +#define VM_OBJECT_EVENT_LOCK_IN_PROGRESS 4 + +#define vm_object_wait(object, event, interruptible) \ + MACRO_BEGIN \ + (object)->all_wanted |= 1 << (event); \ + vm_object_sleep(((vm_offset_t) object) + (event), \ + (object), \ + (interruptible)); \ + MACRO_END + +#define vm_object_assert_wait(object, event, interruptible) \ + MACRO_BEGIN \ + (object)->all_wanted |= 1 << (event); \ + assert_wait((event_t)(((vm_offset_t) object) + (event)), (interruptible)); \ + MACRO_END + +#define vm_object_wakeup(object, event) \ + MACRO_BEGIN \ + if ((object)->all_wanted & (1 << (event))) \ + thread_wakeup((event_t)(((vm_offset_t) object) + (event))); \ + (object)->all_wanted &= ~(1 << 
(event)); \ + MACRO_END + +/* + * Routines implemented as macros + */ + +#define vm_object_collectable(object) \ + (((object)->ref_count == 0) \ + && ((object)->resident_page_count == 0)) + +#define vm_object_paging_begin(object) \ + ((object)->paging_in_progress++) + +#define vm_object_paging_end(object) \ + MACRO_BEGIN \ + assert((object)->paging_in_progress != 0); \ + if (--(object)->paging_in_progress == 0) { \ + vm_object_wakeup(object, \ + VM_OBJECT_EVENT_PAGING_IN_PROGRESS); \ + } \ + MACRO_END + +#define vm_object_paging_wait(object, interruptible) \ + MACRO_BEGIN \ + while ((object)->paging_in_progress != 0) { \ + vm_object_wait( (object), \ + VM_OBJECT_EVENT_PAGING_IN_PROGRESS, \ + (interruptible)); \ + vm_object_lock(object); \ + \ + /*XXX if ((interruptible) && */ \ + /*XXX (current_thread()->wait_result != THREAD_AWAKENED))*/ \ + /*XXX break; */ \ + } \ + MACRO_END + +#define vm_object_absent_assert_wait(object, interruptible) \ + MACRO_BEGIN \ + vm_object_assert_wait( (object), \ + VM_OBJECT_EVENT_ABSENT_COUNT, \ + (interruptible)); \ + MACRO_END + + +#define vm_object_absent_release(object) \ + MACRO_BEGIN \ + (object)->absent_count--; \ + vm_object_wakeup((object), \ + VM_OBJECT_EVENT_ABSENT_COUNT); \ + MACRO_END + +/* + * Object locking macros (with and without debugging) + */ + +#if VM_OBJECT_DEBUG +#define vm_object_lock_init(object) \ +MACRO_BEGIN \ + simple_lock_init(&(object)->Lock); \ + (object)->LockHolder = 0; \ +MACRO_END +#define vm_object_lock(object) \ +MACRO_BEGIN \ + simple_lock(&(object)->Lock); \ + (object)->LockHolder = current_thread(); \ +MACRO_END +#define vm_object_unlock(object) \ +MACRO_BEGIN \ + if ((object)->LockHolder != current_thread()) \ + panic("vm_object_unlock 0x%x", (object)); \ + (object)->LockHolder = 0; \ + simple_unlock(&(object)->Lock); \ +MACRO_END +#define vm_object_lock_try(object) \ + (simple_lock_try(&(object)->Lock) \ + ? ( ((object)->LockHolder = current_thread()) , TRUE) \ + : FALSE) +#define vm_object_sleep(event, object, interruptible) \ +MACRO_BEGIN \ + if ((object)->LockHolder != current_thread()) \ + panic("vm_object_sleep %#x", (object)); \ + (object)->LockHolder = 0; \ + thread_sleep((event_t)(event), simple_lock_addr((object)->Lock), \ + (interruptible)); \ +MACRO_END +#define vm_object_lock_taken(object) \ + ((object)->LockHolder == current_thread()) +#else /* VM_OBJECT_DEBUG */ +#define vm_object_lock_init(object) simple_lock_init(&(object)->Lock) +#define vm_object_lock(object) simple_lock(&(object)->Lock) +#define vm_object_unlock(object) simple_unlock(&(object)->Lock) +#define vm_object_lock_try(object) simple_lock_try(&(object)->Lock) +#define vm_object_sleep(event, object, interruptible) \ + thread_sleep((event_t)(event), simple_lock_addr((object)->Lock), \ + (interruptible)) +#define vm_object_lock_taken(object) simple_lock_taken(&(object)->Lock) +#endif /* VM_OBJECT_DEBUG */ + +/* + * Page cache accounting. + * + * The page queues must be locked when changing these counters. + */ +extern int vm_object_external_count; +extern int vm_object_external_pages; + +/* Add a reference to a locked VM object. */ +static inline int +vm_object_reference_locked (vm_object_t obj) +{ + return (++obj->ref_count); +} + +/* Remove a reference from a locked VM object. 
*/ +static inline int +vm_object_unreference_locked (vm_object_t obj) +{ + return (--obj->ref_count); +} + +#endif /* _VM_VM_OBJECT_H_ */ diff --git a/vm/vm_page.c b/vm/vm_page.c new file mode 100644 index 0000000..04decbb --- /dev/null +++ b/vm/vm_page.c @@ -0,0 +1,2164 @@ +/* + * Copyright (c) 2010-2014 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * This implementation uses the binary buddy system to manage its heap. + * Descriptions of the buddy system can be found in the following works : + * - "UNIX Internals: The New Frontiers", by Uresh Vahalia. + * - "Dynamic Storage Allocation: A Survey and Critical Review", + * by Paul R. Wilson, Mark S. Johnstone, Michael Neely, and David Boles. + * + * In addition, this allocator uses per-CPU pools of pages for order 0 + * (i.e. single page) allocations. These pools act as caches (but are named + * differently to avoid confusion with CPU caches) that reduce contention on + * multiprocessor systems. When a pool is empty and cannot provide a page, + * it is filled by transferring multiple pages from the backend buddy system. + * The symmetric case is handled likewise. + * + * TODO Limit number of dirty pages, block allocations above a top limit. + */ + +#include <string.h> +#include <kern/assert.h> +#include <kern/counters.h> +#include <kern/cpu_number.h> +#include <kern/debug.h> +#include <kern/list.h> +#include <kern/lock.h> +#include <kern/macros.h> +#include <kern/printf.h> +#include <kern/thread.h> +#include <mach/vm_param.h> +#include <machine/pmap.h> +#include <sys/types.h> +#include <vm/memory_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> + +#define DEBUG 0 + +#define __init +#define __initdata +#define __read_mostly + +#define thread_pin() +#define thread_unpin() + +/* + * Number of free block lists per segment. + */ +#define VM_PAGE_NR_FREE_LISTS 11 + +/* + * The size of a CPU pool is computed by dividing the number of pages in its + * containing segment by this value. + */ +#define VM_PAGE_CPU_POOL_RATIO 1024 + +/* + * Maximum number of pages in a CPU pool. + */ +#define VM_PAGE_CPU_POOL_MAX_SIZE 128 + +/* + * The transfer size of a CPU pool is computed by dividing the pool size by + * this value. + */ +#define VM_PAGE_CPU_POOL_TRANSFER_RATIO 2 + +/* + * Per-processor cache of pages. + */ +struct vm_page_cpu_pool { + simple_lock_data_t lock; + int size; + int transfer_size; + int nr_pages; + struct list pages; +} __aligned(CPU_L1_SIZE); + +/* + * Special order value for pages that aren't in a free list. Such pages are + * either allocated, or part of a free block of pages but not the head page. + */ +#define VM_PAGE_ORDER_UNLISTED (VM_PAGE_NR_FREE_LISTS + 1) + +/* + * Doubly-linked list of free blocks. 
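+ *
+ * One list exists per order, and only the head page of each
+ * 2^order-page block is linked in. A block's buddy is found by
+ * toggling the order bit of its physical address; for 4 KiB pages,
+ * the order-1 buddy of the block at 0x8000 is 0x8000 ^ 0x2000 = 0xa000.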
+ */ +struct vm_page_free_list { + unsigned long size; + struct list blocks; +}; + +/* + * XXX Because of a potential deadlock involving the default pager (see + * vm_map_lock()), it's currently impossible to reliably determine the + * minimum number of free pages required for successful pageout. Since + * that process is dependent on the amount of physical memory, we scale + * the minimum number of free pages from it, in the hope that memory + * exhaustion happens as rarely as possible... + */ + +/* + * Ratio used to compute the minimum number of pages in a segment. + */ +#define VM_PAGE_SEG_THRESHOLD_MIN_NUM 5 +#define VM_PAGE_SEG_THRESHOLD_MIN_DENOM 100 + +/* + * Number of pages reserved for privileged allocations in a segment. + */ +#define VM_PAGE_SEG_THRESHOLD_MIN 500 + +/* + * Ratio used to compute the threshold below which pageout is started. + */ +#define VM_PAGE_SEG_THRESHOLD_LOW_NUM 6 +#define VM_PAGE_SEG_THRESHOLD_LOW_DENOM 100 + +/* + * Minimum value the low threshold can have for a segment. + */ +#define VM_PAGE_SEG_THRESHOLD_LOW 600 + +#if VM_PAGE_SEG_THRESHOLD_LOW <= VM_PAGE_SEG_THRESHOLD_MIN +#error VM_PAGE_SEG_THRESHOLD_LOW invalid +#endif /* VM_PAGE_SEG_THRESHOLD_LOW >= VM_PAGE_SEG_THRESHOLD_MIN */ + +/* + * Ratio used to compute the threshold above which pageout is stopped. + */ +#define VM_PAGE_SEG_THRESHOLD_HIGH_NUM 10 +#define VM_PAGE_SEG_THRESHOLD_HIGH_DENOM 100 + +/* + * Minimum value the high threshold can have for a segment. + */ +#define VM_PAGE_SEG_THRESHOLD_HIGH 1000 + +#if VM_PAGE_SEG_THRESHOLD_HIGH <= VM_PAGE_SEG_THRESHOLD_LOW +#error VM_PAGE_SEG_THRESHOLD_HIGH invalid +#endif /* VM_PAGE_SEG_THRESHOLD_HIGH <= VM_PAGE_SEG_THRESHOLD_LOW */ + +/* + * Minimum number of pages allowed for a segment. + */ +#define VM_PAGE_SEG_MIN_PAGES 2000 + +#if VM_PAGE_SEG_MIN_PAGES <= VM_PAGE_SEG_THRESHOLD_HIGH +#error VM_PAGE_SEG_MIN_PAGES invalid +#endif /* VM_PAGE_SEG_MIN_PAGES <= VM_PAGE_SEG_THRESHOLD_HIGH */ + +/* + * Ratio used to compute the threshold of active pages beyond which + * to refill the inactive queue. + */ +#define VM_PAGE_HIGH_ACTIVE_PAGE_NUM 1 +#define VM_PAGE_HIGH_ACTIVE_PAGE_DENOM 3 + +/* + * Page cache queue. + * + * XXX The current implementation hardcodes a preference to evict external + * pages first and keep internal ones as much as possible. This is because + * the Hurd default pager implementation suffers from bugs that can easily + * cause the system to freeze. + */ +struct vm_page_queue { + struct list internal_pages; + struct list external_pages; +}; + +/* + * Segment name buffer size. + */ +#define VM_PAGE_NAME_SIZE 16 + +/* + * Segment of contiguous memory. + * + * XXX Per-segment locking is probably useless, since one or both of the + * page queues lock and the free page queue lock is held on any access. + * However it should first be made clear which lock protects access to + * which members of a segment. 
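+ *
+ * As an example of the thresholds kept below: a 256 MiB segment
+ * (65536 pages of 4 KiB) gets min_free_pages = 3276, low_free_pages
+ * = 3932 and high_free_pages = 6553; for much smaller segments the
+ * floors of 500, 600 and 1000 pages apply instead.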
+ */ +struct vm_page_seg { + struct vm_page_cpu_pool cpu_pools[NCPUS]; + + phys_addr_t start; + phys_addr_t end; + struct vm_page *pages; + struct vm_page *pages_end; + simple_lock_data_t lock; + struct vm_page_free_list free_lists[VM_PAGE_NR_FREE_LISTS]; + unsigned long nr_free_pages; + + /* Free memory thresholds */ + unsigned long min_free_pages; /* Privileged allocations only */ + unsigned long low_free_pages; /* Pageout daemon starts scanning */ + unsigned long high_free_pages; /* Pageout daemon stops scanning, + unprivileged allocations resume */ + + /* Page cache related data */ + struct vm_page_queue active_pages; + unsigned long nr_active_pages; + unsigned long high_active_pages; + struct vm_page_queue inactive_pages; + unsigned long nr_inactive_pages; +}; + +/* + * Bootstrap information about a segment. + */ +struct vm_page_boot_seg { + phys_addr_t start; + phys_addr_t end; + boolean_t heap_present; + phys_addr_t avail_start; + phys_addr_t avail_end; +}; + +static int vm_page_is_ready __read_mostly; + +/* + * Segment table. + * + * The system supports a maximum of 4 segments : + * - DMA: suitable for DMA + * - DMA32: suitable for DMA when devices support 32-bits addressing + * - DIRECTMAP: direct physical mapping, allows direct access from + * the kernel with a simple offset translation + * - HIGHMEM: must be mapped before it can be accessed + * + * Segments are ordered by priority, 0 being the lowest priority. Their + * relative priorities are DMA < DMA32 < DIRECTMAP < HIGHMEM or + * DMA < DIRECTMAP < DMA32 < HIGHMEM. + * Some segments may actually be aliases for others, e.g. if DMA is always + * possible from the direct physical mapping, DMA and DMA32 are aliases for + * DIRECTMAP, in which case the segment table contains DIRECTMAP and HIGHMEM + * only. + */ +static struct vm_page_seg vm_page_segs[VM_PAGE_MAX_SEGS]; + +/* + * Bootstrap segment table. + */ +static struct vm_page_boot_seg vm_page_boot_segs[VM_PAGE_MAX_SEGS] __initdata; + +/* + * Number of loaded segments. + */ +static unsigned int vm_page_segs_size __read_mostly; + +/* + * If true, unprivileged allocations are blocked, disregarding any other + * condition. + * + * This variable is also used to resume clients once pages are available. + * + * The free page queue lock must be held when accessing this variable. + */ +static boolean_t vm_page_alloc_paused; + +static void __init +vm_page_init_pa(struct vm_page *page, unsigned short seg_index, phys_addr_t pa) +{ + memset(page, 0, sizeof(*page)); + vm_page_init(page); /* vm_resident members */ + page->type = VM_PT_RESERVED; + page->seg_index = seg_index; + page->order = VM_PAGE_ORDER_UNLISTED; + page->priv = NULL; + page->phys_addr = pa; +} + +void +vm_page_set_type(struct vm_page *page, unsigned int order, unsigned short type) +{ + unsigned int i, nr_pages; + + nr_pages = 1 << order; + + for (i = 0; i < nr_pages; i++) + page[i].type = type; +} + +static boolean_t +vm_page_pageable(const struct vm_page *page) +{ + return (page->object != NULL) + && (page->wire_count == 0) + && (page->active || page->inactive); +} + +static boolean_t +vm_page_can_move(const struct vm_page *page) +{ + /* + * This function is called on pages pulled from the page queues, + * implying they're pageable, which is why the wire count isn't + * checked here. 
+ */ + + return !page->busy + && !page->wanted + && !page->absent + && page->object->alive; +} + +static void +vm_page_remove_mappings(struct vm_page *page) +{ + page->busy = TRUE; + pmap_page_protect(page->phys_addr, VM_PROT_NONE); + + if (!page->dirty) { + page->dirty = pmap_is_modified(page->phys_addr); + } +} + +static void __init +vm_page_free_list_init(struct vm_page_free_list *free_list) +{ + free_list->size = 0; + list_init(&free_list->blocks); +} + +static inline void +vm_page_free_list_insert(struct vm_page_free_list *free_list, + struct vm_page *page) +{ + assert(page->order == VM_PAGE_ORDER_UNLISTED); + + free_list->size++; + list_insert_head(&free_list->blocks, &page->node); +} + +static inline void +vm_page_free_list_remove(struct vm_page_free_list *free_list, + struct vm_page *page) +{ + assert(page->order != VM_PAGE_ORDER_UNLISTED); + + free_list->size--; + list_remove(&page->node); +} + +static struct vm_page * +vm_page_seg_alloc_from_buddy(struct vm_page_seg *seg, unsigned int order) +{ + struct vm_page_free_list *free_list = free_list; + struct vm_page *page, *buddy; + unsigned int i; + + assert(order < VM_PAGE_NR_FREE_LISTS); + + if (vm_page_alloc_paused && current_thread() + && !current_thread()->vm_privilege) { + return NULL; + } else if (seg->nr_free_pages <= seg->low_free_pages) { + vm_pageout_start(); + + if ((seg->nr_free_pages <= seg->min_free_pages) + && current_thread() && !current_thread()->vm_privilege) { + vm_page_alloc_paused = TRUE; + return NULL; + } + } + + for (i = order; i < VM_PAGE_NR_FREE_LISTS; i++) { + free_list = &seg->free_lists[i]; + + if (free_list->size != 0) + break; + } + + if (i == VM_PAGE_NR_FREE_LISTS) + return NULL; + + page = list_first_entry(&free_list->blocks, struct vm_page, node); + vm_page_free_list_remove(free_list, page); + page->order = VM_PAGE_ORDER_UNLISTED; + + while (i > order) { + i--; + buddy = &page[1 << i]; + vm_page_free_list_insert(&seg->free_lists[i], buddy); + buddy->order = i; + } + + seg->nr_free_pages -= (1 << order); + + if (seg->nr_free_pages < seg->min_free_pages) { + vm_page_alloc_paused = TRUE; + } + + return page; +} + +static void +vm_page_seg_free_to_buddy(struct vm_page_seg *seg, struct vm_page *page, + unsigned int order) +{ + struct vm_page *buddy; + phys_addr_t pa, buddy_pa; + unsigned int nr_pages; + + assert(page >= seg->pages); + assert(page < seg->pages_end); + assert(page->order == VM_PAGE_ORDER_UNLISTED); + assert(order < VM_PAGE_NR_FREE_LISTS); + + nr_pages = (1 << order); + pa = page->phys_addr; + + while (order < (VM_PAGE_NR_FREE_LISTS - 1)) { + buddy_pa = pa ^ vm_page_ptoa(1ULL << order); + + if ((buddy_pa < seg->start) || (buddy_pa >= seg->end)) + break; + + buddy = &seg->pages[vm_page_atop(buddy_pa - seg->start)]; + + if (buddy->order != order) + break; + + vm_page_free_list_remove(&seg->free_lists[order], buddy); + buddy->order = VM_PAGE_ORDER_UNLISTED; + order++; + pa &= -vm_page_ptoa(1ULL << order); + page = &seg->pages[vm_page_atop(pa - seg->start)]; + } + + vm_page_free_list_insert(&seg->free_lists[order], page); + page->order = order; + seg->nr_free_pages += nr_pages; +} + +static void __init +vm_page_cpu_pool_init(struct vm_page_cpu_pool *cpu_pool, int size) +{ + simple_lock_init(&cpu_pool->lock); + cpu_pool->size = size; + cpu_pool->transfer_size = (size + VM_PAGE_CPU_POOL_TRANSFER_RATIO - 1) + / VM_PAGE_CPU_POOL_TRANSFER_RATIO; + cpu_pool->nr_pages = 0; + list_init(&cpu_pool->pages); +} + +static inline struct vm_page_cpu_pool * +vm_page_cpu_pool_get(struct vm_page_seg *seg) +{ + 
return &seg->cpu_pools[cpu_number()]; +} + +static inline struct vm_page * +vm_page_cpu_pool_pop(struct vm_page_cpu_pool *cpu_pool) +{ + struct vm_page *page; + + assert(cpu_pool->nr_pages != 0); + cpu_pool->nr_pages--; + page = list_first_entry(&cpu_pool->pages, struct vm_page, node); + list_remove(&page->node); + return page; +} + +static inline void +vm_page_cpu_pool_push(struct vm_page_cpu_pool *cpu_pool, struct vm_page *page) +{ + assert(cpu_pool->nr_pages < cpu_pool->size); + cpu_pool->nr_pages++; + list_insert_head(&cpu_pool->pages, &page->node); +} + +static int +vm_page_cpu_pool_fill(struct vm_page_cpu_pool *cpu_pool, + struct vm_page_seg *seg) +{ + struct vm_page *page; + int i; + + assert(cpu_pool->nr_pages == 0); + + simple_lock(&seg->lock); + + for (i = 0; i < cpu_pool->transfer_size; i++) { + page = vm_page_seg_alloc_from_buddy(seg, 0); + + if (page == NULL) + break; + + vm_page_cpu_pool_push(cpu_pool, page); + } + + simple_unlock(&seg->lock); + + return i; +} + +static void +vm_page_cpu_pool_drain(struct vm_page_cpu_pool *cpu_pool, + struct vm_page_seg *seg) +{ + struct vm_page *page; + int i; + + assert(cpu_pool->nr_pages == cpu_pool->size); + + simple_lock(&seg->lock); + + for (i = cpu_pool->transfer_size; i > 0; i--) { + page = vm_page_cpu_pool_pop(cpu_pool); + vm_page_seg_free_to_buddy(seg, page, 0); + } + + simple_unlock(&seg->lock); +} + +static void +vm_page_queue_init(struct vm_page_queue *queue) +{ + list_init(&queue->internal_pages); + list_init(&queue->external_pages); +} + +static void +vm_page_queue_push(struct vm_page_queue *queue, struct vm_page *page) +{ + if (page->external) { + list_insert_tail(&queue->external_pages, &page->node); + } else { + list_insert_tail(&queue->internal_pages, &page->node); + } +} + +static void +vm_page_queue_remove(struct vm_page_queue *queue, struct vm_page *page) +{ + (void)queue; + list_remove(&page->node); +} + +static struct vm_page * +vm_page_queue_first(struct vm_page_queue *queue, boolean_t external_only) +{ + struct vm_page *page; + + if (!list_empty(&queue->external_pages)) { + page = list_first_entry(&queue->external_pages, struct vm_page, node); + return page; + } + + if (!external_only && !list_empty(&queue->internal_pages)) { + page = list_first_entry(&queue->internal_pages, struct vm_page, node); + return page; + } + + return NULL; +} + +static struct vm_page_seg * +vm_page_seg_get(unsigned short index) +{ + assert(index < vm_page_segs_size); + return &vm_page_segs[index]; +} + +static unsigned int +vm_page_seg_index(const struct vm_page_seg *seg) +{ + unsigned int index; + + index = seg - vm_page_segs; + assert(index < vm_page_segs_size); + return index; +} + +static phys_addr_t __init +vm_page_seg_size(struct vm_page_seg *seg) +{ + return seg->end - seg->start; +} + +static int __init +vm_page_seg_compute_pool_size(struct vm_page_seg *seg) +{ + phys_addr_t size; + + size = vm_page_atop(vm_page_seg_size(seg)) / VM_PAGE_CPU_POOL_RATIO; + + if (size == 0) + size = 1; + else if (size > VM_PAGE_CPU_POOL_MAX_SIZE) + size = VM_PAGE_CPU_POOL_MAX_SIZE; + + return size; +} + +static void __init +vm_page_seg_compute_pageout_thresholds(struct vm_page_seg *seg) +{ + unsigned long nr_pages; + + nr_pages = vm_page_atop(vm_page_seg_size(seg)); + + if (nr_pages < VM_PAGE_SEG_MIN_PAGES) { + panic("vm_page: segment too small"); + } + + seg->min_free_pages = nr_pages * VM_PAGE_SEG_THRESHOLD_MIN_NUM + / VM_PAGE_SEG_THRESHOLD_MIN_DENOM; + + if (seg->min_free_pages < VM_PAGE_SEG_THRESHOLD_MIN) { + seg->min_free_pages = 
VM_PAGE_SEG_THRESHOLD_MIN; + } + + seg->low_free_pages = nr_pages * VM_PAGE_SEG_THRESHOLD_LOW_NUM + / VM_PAGE_SEG_THRESHOLD_LOW_DENOM; + + if (seg->low_free_pages < VM_PAGE_SEG_THRESHOLD_LOW) { + seg->low_free_pages = VM_PAGE_SEG_THRESHOLD_LOW; + } + + seg->high_free_pages = nr_pages * VM_PAGE_SEG_THRESHOLD_HIGH_NUM + / VM_PAGE_SEG_THRESHOLD_HIGH_DENOM; + + if (seg->high_free_pages < VM_PAGE_SEG_THRESHOLD_HIGH) { + seg->high_free_pages = VM_PAGE_SEG_THRESHOLD_HIGH; + } +} + +static void __init +vm_page_seg_init(struct vm_page_seg *seg, phys_addr_t start, phys_addr_t end, + struct vm_page *pages) +{ + phys_addr_t pa; + int pool_size; + unsigned int i; + + seg->start = start; + seg->end = end; + pool_size = vm_page_seg_compute_pool_size(seg); + + for (i = 0; i < ARRAY_SIZE(seg->cpu_pools); i++) + vm_page_cpu_pool_init(&seg->cpu_pools[i], pool_size); + + seg->pages = pages; + seg->pages_end = pages + vm_page_atop(vm_page_seg_size(seg)); + simple_lock_init(&seg->lock); + + for (i = 0; i < ARRAY_SIZE(seg->free_lists); i++) + vm_page_free_list_init(&seg->free_lists[i]); + + seg->nr_free_pages = 0; + + vm_page_seg_compute_pageout_thresholds(seg); + + vm_page_queue_init(&seg->active_pages); + seg->nr_active_pages = 0; + vm_page_queue_init(&seg->inactive_pages); + seg->nr_inactive_pages = 0; + + i = vm_page_seg_index(seg); + + for (pa = seg->start; pa < seg->end; pa += PAGE_SIZE) + vm_page_init_pa(&pages[vm_page_atop(pa - seg->start)], i, pa); +} + +static struct vm_page * +vm_page_seg_alloc(struct vm_page_seg *seg, unsigned int order, + unsigned short type) +{ + struct vm_page_cpu_pool *cpu_pool; + struct vm_page *page; + int filled; + + assert(order < VM_PAGE_NR_FREE_LISTS); + + if (order == 0) { + thread_pin(); + cpu_pool = vm_page_cpu_pool_get(seg); + simple_lock(&cpu_pool->lock); + + if (cpu_pool->nr_pages == 0) { + filled = vm_page_cpu_pool_fill(cpu_pool, seg); + + if (!filled) { + simple_unlock(&cpu_pool->lock); + thread_unpin(); + return NULL; + } + } + + page = vm_page_cpu_pool_pop(cpu_pool); + simple_unlock(&cpu_pool->lock); + thread_unpin(); + } else { + simple_lock(&seg->lock); + page = vm_page_seg_alloc_from_buddy(seg, order); + simple_unlock(&seg->lock); + + if (page == NULL) + return NULL; + } + + assert(page->type == VM_PT_FREE); + vm_page_set_type(page, order, type); + return page; +} + +static void +vm_page_seg_free(struct vm_page_seg *seg, struct vm_page *page, + unsigned int order) +{ + struct vm_page_cpu_pool *cpu_pool; + + assert(page->type != VM_PT_FREE); + assert(order < VM_PAGE_NR_FREE_LISTS); + + vm_page_set_type(page, order, VM_PT_FREE); + + if (order == 0) { + thread_pin(); + cpu_pool = vm_page_cpu_pool_get(seg); + simple_lock(&cpu_pool->lock); + + if (cpu_pool->nr_pages == cpu_pool->size) + vm_page_cpu_pool_drain(cpu_pool, seg); + + vm_page_cpu_pool_push(cpu_pool, page); + simple_unlock(&cpu_pool->lock); + thread_unpin(); + } else { + simple_lock(&seg->lock); + vm_page_seg_free_to_buddy(seg, page, order); + simple_unlock(&seg->lock); + } +} + +static void +vm_page_seg_add_active_page(struct vm_page_seg *seg, struct vm_page *page) +{ + assert(page->object != NULL); + assert(page->seg_index == vm_page_seg_index(seg)); + assert(page->type != VM_PT_FREE); + assert(page->order == VM_PAGE_ORDER_UNLISTED); + assert(!page->free && !page->active && !page->inactive); + page->active = TRUE; + page->reference = TRUE; + vm_page_queue_push(&seg->active_pages, page); + seg->nr_active_pages++; + vm_page_active_count++; +} + +static void +vm_page_seg_remove_active_page(struct 
vm_page_seg *seg, struct vm_page *page) +{ + assert(page->object != NULL); + assert(page->seg_index == vm_page_seg_index(seg)); + assert(page->type != VM_PT_FREE); + assert(page->order == VM_PAGE_ORDER_UNLISTED); + assert(!page->free && page->active && !page->inactive); + page->active = FALSE; + vm_page_queue_remove(&seg->active_pages, page); + seg->nr_active_pages--; + vm_page_active_count--; +} + +static void +vm_page_seg_add_inactive_page(struct vm_page_seg *seg, struct vm_page *page) +{ + assert(page->object != NULL); + assert(page->seg_index == vm_page_seg_index(seg)); + assert(page->type != VM_PT_FREE); + assert(page->order == VM_PAGE_ORDER_UNLISTED); + assert(!page->free && !page->active && !page->inactive); + page->inactive = TRUE; + vm_page_queue_push(&seg->inactive_pages, page); + seg->nr_inactive_pages++; + vm_page_inactive_count++; +} + +static void +vm_page_seg_remove_inactive_page(struct vm_page_seg *seg, struct vm_page *page) +{ + assert(page->object != NULL); + assert(page->seg_index == vm_page_seg_index(seg)); + assert(page->type != VM_PT_FREE); + assert(page->order == VM_PAGE_ORDER_UNLISTED); + assert(!page->free && !page->active && page->inactive); + page->inactive = FALSE; + vm_page_queue_remove(&seg->inactive_pages, page); + seg->nr_inactive_pages--; + vm_page_inactive_count--; +} + +/* + * Attempt to pull an active page. + * + * If successful, the object containing the page is locked. + */ +static struct vm_page * +vm_page_seg_pull_active_page(struct vm_page_seg *seg, boolean_t external_only) +{ + struct vm_page *page, *first; + boolean_t locked; + + first = NULL; + + for (;;) { + page = vm_page_queue_first(&seg->active_pages, external_only); + + if (page == NULL) { + break; + } else if (first == NULL) { + first = page; + } else if (first == page) { + break; + } + + vm_page_seg_remove_active_page(seg, page); + locked = vm_object_lock_try(page->object); + + if (!locked) { + vm_page_seg_add_active_page(seg, page); + continue; + } + + if (!vm_page_can_move(page)) { + vm_page_seg_add_active_page(seg, page); + vm_object_unlock(page->object); + continue; + } + + return page; + } + + return NULL; +} + +/* + * Attempt to pull an inactive page. + * + * If successful, the object containing the page is locked. + * + * XXX See vm_page_seg_pull_active_page (duplicated code). + */ +static struct vm_page * +vm_page_seg_pull_inactive_page(struct vm_page_seg *seg, boolean_t external_only) +{ + struct vm_page *page, *first; + boolean_t locked; + + first = NULL; + + for (;;) { + page = vm_page_queue_first(&seg->inactive_pages, external_only); + + if (page == NULL) { + break; + } else if (first == NULL) { + first = page; + } else if (first == page) { + break; + } + + vm_page_seg_remove_inactive_page(seg, page); + locked = vm_object_lock_try(page->object); + + if (!locked) { + vm_page_seg_add_inactive_page(seg, page); + continue; + } + + if (!vm_page_can_move(page)) { + vm_page_seg_add_inactive_page(seg, page); + vm_object_unlock(page->object); + continue; + } + + return page; + } + + return NULL; +} + +/* + * Attempt to pull a page cache page. + * + * If successful, the object containing the page is locked. 
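+ *
+ * Inactive pages are tried first; *was_active tells the caller which
+ * queue the page was taken from, which the eviction and balancing
+ * paths use when deciding how to re-queue the page later.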
+ */ +static struct vm_page * +vm_page_seg_pull_cache_page(struct vm_page_seg *seg, + boolean_t external_only, + boolean_t *was_active) +{ + struct vm_page *page; + + page = vm_page_seg_pull_inactive_page(seg, external_only); + + if (page != NULL) { + *was_active = FALSE; + return page; + } + + page = vm_page_seg_pull_active_page(seg, external_only); + + if (page != NULL) { + *was_active = TRUE; + return page; + } + + return NULL; +} + +static boolean_t +vm_page_seg_page_available(const struct vm_page_seg *seg) +{ + return (seg->nr_free_pages > seg->high_free_pages); +} + +static boolean_t +vm_page_seg_usable(const struct vm_page_seg *seg) +{ + if ((seg->nr_active_pages + seg->nr_inactive_pages) == 0) { + /* Nothing to page out, assume segment is usable */ + return TRUE; + } + + return (seg->nr_free_pages >= seg->high_free_pages); +} + +static void +vm_page_seg_double_lock(struct vm_page_seg *seg1, struct vm_page_seg *seg2) +{ + assert(seg1 != seg2); + + if (seg1 < seg2) { + simple_lock(&seg1->lock); + simple_lock(&seg2->lock); + } else { + simple_lock(&seg2->lock); + simple_lock(&seg1->lock); + } +} + +static void +vm_page_seg_double_unlock(struct vm_page_seg *seg1, struct vm_page_seg *seg2) +{ + simple_unlock(&seg1->lock); + simple_unlock(&seg2->lock); +} + +/* + * Attempt to balance a segment by moving one page to another segment. + * + * Return TRUE if a page was actually moved. + */ +static boolean_t +vm_page_seg_balance_page(struct vm_page_seg *seg, + struct vm_page_seg *remote_seg) +{ + struct vm_page *src, *dest; + vm_object_t object; + vm_offset_t offset; + boolean_t was_active; + + vm_page_lock_queues(); + simple_lock(&vm_page_queue_free_lock); + vm_page_seg_double_lock(seg, remote_seg); + + if (vm_page_seg_usable(seg) + || !vm_page_seg_page_available(remote_seg)) { + goto error; + } + + src = vm_page_seg_pull_cache_page(seg, FALSE, &was_active); + + if (src == NULL) { + goto error; + } + + assert(src->object != NULL); + assert(!src->fictitious && !src->private); + assert(src->wire_count == 0); + assert(src->type != VM_PT_FREE); + assert(src->order == VM_PAGE_ORDER_UNLISTED); + + dest = vm_page_seg_alloc_from_buddy(remote_seg, 0); + assert(dest != NULL); + + vm_page_seg_double_unlock(seg, remote_seg); + simple_unlock(&vm_page_queue_free_lock); + + if (!was_active && !src->reference && pmap_is_referenced(src->phys_addr)) { + src->reference = TRUE; + } + + object = src->object; + offset = src->offset; + vm_page_remove(src); + + vm_page_remove_mappings(src); + + vm_page_set_type(dest, 0, src->type); + memcpy(&dest->vm_page_header, &src->vm_page_header, + VM_PAGE_BODY_SIZE); + vm_page_copy(src, dest); + + if (!src->dirty) { + pmap_clear_modify(dest->phys_addr); + } + + dest->busy = FALSE; + + simple_lock(&vm_page_queue_free_lock); + vm_page_init(src); + src->free = TRUE; + simple_lock(&seg->lock); + vm_page_set_type(src, 0, VM_PT_FREE); + vm_page_seg_free_to_buddy(seg, src, 0); + simple_unlock(&seg->lock); + simple_unlock(&vm_page_queue_free_lock); + + vm_object_lock(object); + vm_page_insert(dest, object, offset); + vm_object_unlock(object); + + if (was_active) { + vm_page_activate(dest); + } else { + vm_page_deactivate(dest); + } + + vm_page_unlock_queues(); + + return TRUE; + +error: + vm_page_seg_double_unlock(seg, remote_seg); + simple_unlock(&vm_page_queue_free_lock); + vm_page_unlock_queues(); + return FALSE; +} + +static boolean_t +vm_page_seg_balance(struct vm_page_seg *seg) +{ + struct vm_page_seg *remote_seg; + unsigned int i; + boolean_t balanced; + + /* + * It's 
important here that pages are moved to lower priority + * segments first. + */ + + for (i = vm_page_segs_size - 1; i < vm_page_segs_size; i--) { + remote_seg = vm_page_seg_get(i); + + if (remote_seg == seg) { + continue; + } + + balanced = vm_page_seg_balance_page(seg, remote_seg); + + if (balanced) { + return TRUE; + } + } + + return FALSE; +} + +static boolean_t +vm_page_seg_evict(struct vm_page_seg *seg, boolean_t external_only, + boolean_t alloc_paused) +{ + struct vm_page *page; + boolean_t reclaim, double_paging; + vm_object_t object; + boolean_t was_active; + + page = NULL; + object = NULL; + double_paging = FALSE; + +restart: + vm_page_lock_queues(); + simple_lock(&seg->lock); + + if (page != NULL) { + vm_object_lock(page->object); + } else { + page = vm_page_seg_pull_cache_page(seg, external_only, &was_active); + + if (page == NULL) { + goto out; + } + } + + assert(page->object != NULL); + assert(!page->fictitious && !page->private); + assert(page->wire_count == 0); + assert(page->type != VM_PT_FREE); + assert(page->order == VM_PAGE_ORDER_UNLISTED); + + object = page->object; + + if (!was_active + && (page->reference || pmap_is_referenced(page->phys_addr))) { + vm_page_seg_add_active_page(seg, page); + simple_unlock(&seg->lock); + vm_object_unlock(object); + vm_stat.reactivations++; + current_task()->reactivations++; + vm_page_unlock_queues(); + page = NULL; + goto restart; + } + + vm_page_remove_mappings(page); + + if (!page->dirty && !page->precious) { + reclaim = TRUE; + goto out; + } + + reclaim = FALSE; + + /* + * If we are very low on memory, then we can't rely on an external + * pager to clean a dirty page, because external pagers are not + * vm-privileged. + * + * The laundry bit tells vm_pageout_setup not to do any special + * processing of this page since it's immediately going to be + * double paged out to the default pager. The laundry bit is + * reset and the page is inserted into an internal object by + * vm_pageout_setup before the second double paging pass. + * + * There is one important special case: the default pager can + * back external memory objects. When receiving the first + * pageout request, where the page is no longer present, a + * fault could occur, during which the map would be locked. + * This fault would cause a new paging request to the default + * pager. Receiving that request would deadlock when trying to + * lock the map again. Instead, the page isn't double paged + * and vm_pageout_setup wires the page down, trusting the + * default pager as for internal pages. + */ + + assert(!page->laundry); + assert(!(double_paging && page->external)); + + if (object->internal || !alloc_paused || + memory_manager_default_port(object->pager)) { + double_paging = FALSE; + } else { + double_paging = page->laundry = TRUE; + } + +out: + simple_unlock(&seg->lock); + + if (object == NULL) { + vm_page_unlock_queues(); + return FALSE; + } + + if (reclaim) { + vm_page_free(page); + vm_page_unlock_queues(); + + if (vm_object_collectable(object)) { + vm_object_collect(object); + } else { + vm_object_unlock(object); + } + + return TRUE; + } + + vm_page_unlock_queues(); + + /* + * If there is no memory object for the page, create one and hand it + * to the default pager. First try to collapse, so we don't create + * one unnecessarily. 
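+ *
+ * If the object still has no pager after the collapse attempt, one is
+ * created explicitly; ending up without an initialized pager is fatal.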
+ */ + + if (!object->pager_initialized) { + vm_object_collapse(object); + } + + if (!object->pager_initialized) { + vm_object_pager_create(object); + } + + if (!object->pager_initialized) { + panic("vm_page_seg_evict"); + } + + vm_pageout_page(page, FALSE, TRUE); /* flush it */ + vm_object_unlock(object); + + if (double_paging) { + goto restart; + } + + return TRUE; +} + +static void +vm_page_seg_compute_high_active_page(struct vm_page_seg *seg) +{ + unsigned long nr_pages; + + nr_pages = seg->nr_active_pages + seg->nr_inactive_pages; + seg->high_active_pages = nr_pages * VM_PAGE_HIGH_ACTIVE_PAGE_NUM + / VM_PAGE_HIGH_ACTIVE_PAGE_DENOM; +} + +static void +vm_page_seg_refill_inactive(struct vm_page_seg *seg) +{ + struct vm_page *page; + + simple_lock(&seg->lock); + + vm_page_seg_compute_high_active_page(seg); + + while (seg->nr_active_pages > seg->high_active_pages) { + page = vm_page_seg_pull_active_page(seg, FALSE); + + if (page == NULL) { + break; + } + + page->reference = FALSE; + pmap_clear_reference(page->phys_addr); + vm_page_seg_add_inactive_page(seg, page); + vm_object_unlock(page->object); + } + + simple_unlock(&seg->lock); +} + +void __init +vm_page_load(unsigned int seg_index, phys_addr_t start, phys_addr_t end) +{ + struct vm_page_boot_seg *seg; + + assert(seg_index < ARRAY_SIZE(vm_page_boot_segs)); + assert(vm_page_aligned(start)); + assert(vm_page_aligned(end)); + assert(start < end); + assert(vm_page_segs_size < ARRAY_SIZE(vm_page_boot_segs)); + + seg = &vm_page_boot_segs[seg_index]; + seg->start = start; + seg->end = end; + seg->heap_present = FALSE; + +#if DEBUG + printf("vm_page: load: %s: %llx:%llx\n", + vm_page_seg_name(seg_index), + (unsigned long long)start, (unsigned long long)end); +#endif + + vm_page_segs_size++; +} + +void +vm_page_load_heap(unsigned int seg_index, phys_addr_t start, phys_addr_t end) +{ + struct vm_page_boot_seg *seg; + + assert(seg_index < ARRAY_SIZE(vm_page_boot_segs)); + assert(vm_page_aligned(start)); + assert(vm_page_aligned(end)); + + seg = &vm_page_boot_segs[seg_index]; + + assert(seg->start <= start); + assert(end <= seg-> end); + + seg->avail_start = start; + seg->avail_end = end; + seg->heap_present = TRUE; + +#if DEBUG + printf("vm_page: heap: %s: %llx:%llx\n", + vm_page_seg_name(seg_index), + (unsigned long long)start, (unsigned long long)end); +#endif +} + +int +vm_page_ready(void) +{ + return vm_page_is_ready; +} + +static unsigned int +vm_page_select_alloc_seg(unsigned int selector) +{ + unsigned int seg_index; + + switch (selector) { + case VM_PAGE_SEL_DMA: + seg_index = VM_PAGE_SEG_DMA; + break; + case VM_PAGE_SEL_DMA32: + seg_index = VM_PAGE_SEG_DMA32; + break; + case VM_PAGE_SEL_DIRECTMAP: + seg_index = VM_PAGE_SEG_DIRECTMAP; + break; + case VM_PAGE_SEL_HIGHMEM: + seg_index = VM_PAGE_SEG_HIGHMEM; + break; + default: + panic("vm_page: invalid selector"); + } + + return MIN(vm_page_segs_size - 1, seg_index); +} + +static int __init +vm_page_boot_seg_loaded(const struct vm_page_boot_seg *seg) +{ + return (seg->end != 0); +} + +static void __init +vm_page_check_boot_segs(void) +{ + unsigned int i; + int expect_loaded; + + if (vm_page_segs_size == 0) + panic("vm_page: no physical memory loaded"); + + for (i = 0; i < ARRAY_SIZE(vm_page_boot_segs); i++) { + expect_loaded = (i < vm_page_segs_size); + + if (vm_page_boot_seg_loaded(&vm_page_boot_segs[i]) == expect_loaded) + continue; + + panic("vm_page: invalid boot segment table"); + } +} + +static phys_addr_t __init +vm_page_boot_seg_size(struct vm_page_boot_seg *seg) +{ + return 
seg->end - seg->start; +} + +static phys_addr_t __init +vm_page_boot_seg_avail_size(struct vm_page_boot_seg *seg) +{ + return seg->avail_end - seg->avail_start; +} + +phys_addr_t __init +vm_page_bootalloc(size_t size) +{ + struct vm_page_boot_seg *seg; + phys_addr_t pa; + unsigned int i; + + for (i = vm_page_select_alloc_seg(VM_PAGE_SEL_DIRECTMAP); + i < vm_page_segs_size; + i--) { + seg = &vm_page_boot_segs[i]; + + if (size <= vm_page_boot_seg_avail_size(seg)) { + pa = seg->avail_start; + seg->avail_start += vm_page_round(size); + return pa; + } + } + + panic("vm_page: no physical memory available"); +} + +void __init +vm_page_setup(void) +{ + struct vm_page_boot_seg *boot_seg; + struct vm_page_seg *seg; + struct vm_page *table, *page, *end; + size_t nr_pages, table_size; + unsigned long va; + unsigned int i; + phys_addr_t pa; + + vm_page_check_boot_segs(); + + /* + * Compute the page table size. + */ + nr_pages = 0; + + for (i = 0; i < vm_page_segs_size; i++) + nr_pages += vm_page_atop(vm_page_boot_seg_size(&vm_page_boot_segs[i])); + + table_size = vm_page_round(nr_pages * sizeof(struct vm_page)); + printf("vm_page: page table size: %lu entries (%luk)\n", nr_pages, + table_size >> 10); + table = (struct vm_page *)pmap_steal_memory(table_size); + va = (unsigned long)table; + + /* + * Initialize the segments, associating them to the page table. When + * the segments are initialized, all their pages are set allocated. + * Pages are then released, which populates the free lists. + */ + for (i = 0; i < vm_page_segs_size; i++) { + seg = &vm_page_segs[i]; + boot_seg = &vm_page_boot_segs[i]; + vm_page_seg_init(seg, boot_seg->start, boot_seg->end, table); + page = seg->pages + vm_page_atop(boot_seg->avail_start + - boot_seg->start); + end = seg->pages + vm_page_atop(boot_seg->avail_end + - boot_seg->start); + + while (page < end) { + page->type = VM_PT_FREE; + vm_page_seg_free_to_buddy(seg, page, 0); + page++; + } + + table += vm_page_atop(vm_page_seg_size(seg)); + } + + while (va < (unsigned long)table) { + pa = pmap_extract(kernel_pmap, va); + page = vm_page_lookup_pa(pa); + assert((page != NULL) && (page->type == VM_PT_RESERVED)); + page->type = VM_PT_TABLE; + va += PAGE_SIZE; + } + + vm_page_is_ready = 1; +} + +void __init +vm_page_manage(struct vm_page *page) +{ + assert(page->seg_index < ARRAY_SIZE(vm_page_segs)); + assert(page->type == VM_PT_RESERVED); + + vm_page_set_type(page, 0, VM_PT_FREE); + vm_page_seg_free_to_buddy(&vm_page_segs[page->seg_index], page, 0); +} + +struct vm_page * +vm_page_lookup_pa(phys_addr_t pa) +{ + struct vm_page_seg *seg; + unsigned int i; + + for (i = 0; i < vm_page_segs_size; i++) { + seg = &vm_page_segs[i]; + + if ((pa >= seg->start) && (pa < seg->end)) + return &seg->pages[vm_page_atop(pa - seg->start)]; + } + + return NULL; +} + +static struct vm_page_seg * +vm_page_lookup_seg(const struct vm_page *page) +{ + struct vm_page_seg *seg; + unsigned int i; + + for (i = 0; i < vm_page_segs_size; i++) { + seg = &vm_page_segs[i]; + + if ((page->phys_addr >= seg->start) && (page->phys_addr < seg->end)) { + return seg; + } + } + + return NULL; +} + +void vm_page_check(const struct vm_page *page) +{ + if (page->fictitious) { + if (page->private) { + panic("vm_page: page both fictitious and private"); + } + + if (page->phys_addr != vm_page_fictitious_addr) { + panic("vm_page: invalid fictitious page"); + } + } else { + struct vm_page_seg *seg; + + if (page->phys_addr == vm_page_fictitious_addr) { + panic("vm_page: real page has fictitious address"); + } + + seg = 
vm_page_lookup_seg(page); + + if (seg == NULL) { + if (!page->private) { + panic("vm_page: page claims it's managed but not in any segment"); + } + } else { + if (page->private) { + struct vm_page *real_page; + + if (vm_page_pageable(page)) { + panic("vm_page: private page is pageable"); + } + + real_page = vm_page_lookup_pa(page->phys_addr); + + if (vm_page_pageable(real_page)) { + panic("vm_page: page underlying private page is pageable"); + } + + if ((real_page->type == VM_PT_FREE) + || (real_page->order != VM_PAGE_ORDER_UNLISTED)) { + panic("vm_page: page underlying private pagei is free"); + } + } else { + unsigned int index; + + index = vm_page_seg_index(seg); + + if (index != page->seg_index) { + panic("vm_page: page segment mismatch"); + } + } + } + } +} + +struct vm_page * +vm_page_alloc_pa(unsigned int order, unsigned int selector, unsigned short type) +{ + struct vm_page *page; + unsigned int i; + + for (i = vm_page_select_alloc_seg(selector); i < vm_page_segs_size; i--) { + page = vm_page_seg_alloc(&vm_page_segs[i], order, type); + + if (page != NULL) + return page; + } + + if (!current_thread() || current_thread()->vm_privilege) + panic("vm_page: privileged thread unable to allocate page"); + + return NULL; +} + +void +vm_page_free_pa(struct vm_page *page, unsigned int order) +{ + assert(page != NULL); + assert(page->seg_index < ARRAY_SIZE(vm_page_segs)); + + vm_page_seg_free(&vm_page_segs[page->seg_index], page, order); +} + +const char * +vm_page_seg_name(unsigned int seg_index) +{ + /* Don't use a switch statement since segments can be aliased */ + if (seg_index == VM_PAGE_SEG_HIGHMEM) + return "HIGHMEM"; + else if (seg_index == VM_PAGE_SEG_DIRECTMAP) + return "DIRECTMAP"; + else if (seg_index == VM_PAGE_SEG_DMA32) + return "DMA32"; + else if (seg_index == VM_PAGE_SEG_DMA) + return "DMA"; + else + panic("vm_page: invalid segment index"); +} + +void +vm_page_info_all(void) +{ + struct vm_page_seg *seg; + unsigned long pages; + unsigned int i; + + for (i = 0; i < vm_page_segs_size; i++) { + seg = &vm_page_segs[i]; + pages = (unsigned long)(seg->pages_end - seg->pages); + printf("vm_page: %s: pages: %lu (%luM), free: %lu (%luM)\n", + vm_page_seg_name(i), pages, pages >> (20 - PAGE_SHIFT), + seg->nr_free_pages, seg->nr_free_pages >> (20 - PAGE_SHIFT)); + printf("vm_page: %s: min:%lu low:%lu high:%lu\n", + vm_page_seg_name(vm_page_seg_index(seg)), + seg->min_free_pages, seg->low_free_pages, seg->high_free_pages); + } +} + +phys_addr_t +vm_page_seg_end(unsigned int selector) +{ + return vm_page_segs[vm_page_select_alloc_seg(selector)].end; +} + +static unsigned long +vm_page_boot_table_size(void) +{ + unsigned long nr_pages; + unsigned int i; + + nr_pages = 0; + + for (i = 0; i < vm_page_segs_size; i++) { + nr_pages += vm_page_atop(vm_page_boot_seg_size(&vm_page_boot_segs[i])); + } + + return nr_pages; +} + +unsigned long +vm_page_table_size(void) +{ + unsigned long nr_pages; + unsigned int i; + + if (!vm_page_is_ready) { + return vm_page_boot_table_size(); + } + + nr_pages = 0; + + for (i = 0; i < vm_page_segs_size; i++) { + nr_pages += vm_page_atop(vm_page_seg_size(&vm_page_segs[i])); + } + + return nr_pages; +} + +unsigned long +vm_page_table_index(phys_addr_t pa) +{ + struct vm_page_seg *seg; + unsigned long index; + unsigned int i; + + index = 0; + + for (i = 0; i < vm_page_segs_size; i++) { + seg = &vm_page_segs[i]; + + if ((pa >= seg->start) && (pa < seg->end)) { + return index + vm_page_atop(pa - seg->start); + } + + index += vm_page_atop(vm_page_seg_size(seg)); + } + + 
panic("vm_page: invalid physical address"); +} + +phys_addr_t +vm_page_mem_size(void) +{ + phys_addr_t total; + unsigned int i; + + total = 0; + + for (i = 0; i < vm_page_segs_size; i++) { + total += vm_page_seg_size(&vm_page_segs[i]); + } + + return total; +} + +unsigned long +vm_page_mem_free(void) +{ + unsigned long total; + unsigned int i; + + total = 0; + + for (i = 0; i < vm_page_segs_size; i++) { + total += vm_page_segs[i].nr_free_pages; + } + + return total; +} + +/* + * Mark this page as wired down by yet another map, removing it + * from paging queues as necessary. + * + * The page's object and the page queues must be locked. + */ +void +vm_page_wire(struct vm_page *page) +{ + VM_PAGE_CHECK(page); + + if (page->wire_count == 0) { + vm_page_queues_remove(page); + + if (!page->private && !page->fictitious) { + vm_page_wire_count++; + } + } + + page->wire_count++; +} + +/* + * Release one wiring of this page, potentially enabling it to be paged again. + * + * The page's object and the page queues must be locked. + */ +void +vm_page_unwire(struct vm_page *page) +{ + struct vm_page_seg *seg; + + VM_PAGE_CHECK(page); + + assert(page->wire_count != 0); + page->wire_count--; + + if ((page->wire_count != 0) + || page->fictitious + || page->private) { + return; + } + + seg = vm_page_seg_get(page->seg_index); + + simple_lock(&seg->lock); + vm_page_seg_add_active_page(seg, page); + simple_unlock(&seg->lock); + + vm_page_wire_count--; +} + +/* + * Returns the given page to the inactive list, indicating that + * no physical maps have access to this page. + * [Used by the physical mapping system.] + * + * The page queues must be locked. + */ +void +vm_page_deactivate(struct vm_page *page) +{ + struct vm_page_seg *seg; + + VM_PAGE_CHECK(page); + + /* + * This page is no longer very interesting. If it was + * interesting (active or inactive/referenced), then we + * clear the reference bit and (re)enter it in the + * inactive queue. Note wired pages should not have + * their reference bit cleared. + */ + + if (page->active || (page->inactive && page->reference)) { + if (!page->fictitious && !page->private && !page->absent) { + pmap_clear_reference(page->phys_addr); + } + + page->reference = FALSE; + vm_page_queues_remove(page); + } + + if ((page->wire_count == 0) && !page->fictitious + && !page->private && !page->inactive) { + seg = vm_page_seg_get(page->seg_index); + + simple_lock(&seg->lock); + vm_page_seg_add_inactive_page(seg, page); + simple_unlock(&seg->lock); + } +} + +/* + * Put the specified page on the active list (if appropriate). + * + * The page queues must be locked. + */ +void +vm_page_activate(struct vm_page *page) +{ + struct vm_page_seg *seg; + + VM_PAGE_CHECK(page); + + /* + * Unconditionally remove so that, even if the page was already + * active, it gets back to the end of the active queue. 
+ */ + vm_page_queues_remove(page); + + if ((page->wire_count == 0) && !page->fictitious && !page->private) { + seg = vm_page_seg_get(page->seg_index); + + if (page->active) + panic("vm_page_activate: already active"); + + simple_lock(&seg->lock); + vm_page_seg_add_active_page(seg, page); + simple_unlock(&seg->lock); + } +} + +void +vm_page_queues_remove(struct vm_page *page) +{ + struct vm_page_seg *seg; + + assert(!page->active || !page->inactive); + + if (!page->active && !page->inactive) { + return; + } + + seg = vm_page_seg_get(page->seg_index); + + simple_lock(&seg->lock); + + if (page->active) { + vm_page_seg_remove_active_page(seg, page); + } else { + vm_page_seg_remove_inactive_page(seg, page); + } + + simple_unlock(&seg->lock); +} + +/* + * Check whether segments are all usable for unprivileged allocations. + * + * If all segments are usable, resume pending unprivileged allocations + * and return TRUE. + * + * This function acquires vm_page_queue_free_lock, which is held on return. + */ +static boolean_t +vm_page_check_usable(void) +{ + struct vm_page_seg *seg; + boolean_t usable; + unsigned int i; + + simple_lock(&vm_page_queue_free_lock); + + for (i = 0; i < vm_page_segs_size; i++) { + seg = vm_page_seg_get(i); + + simple_lock(&seg->lock); + usable = vm_page_seg_usable(seg); + simple_unlock(&seg->lock); + + if (!usable) { + return FALSE; + } + } + + vm_page_external_laundry_count = -1; + vm_page_alloc_paused = FALSE; + thread_wakeup(&vm_page_alloc_paused); + return TRUE; +} + +static boolean_t +vm_page_may_balance(void) +{ + struct vm_page_seg *seg; + boolean_t page_available; + unsigned int i; + + for (i = 0; i < vm_page_segs_size; i++) { + seg = vm_page_seg_get(i); + + simple_lock(&seg->lock); + page_available = vm_page_seg_page_available(seg); + simple_unlock(&seg->lock); + + if (page_available) { + return TRUE; + } + } + + return FALSE; +} + +static boolean_t +vm_page_balance_once(void) +{ + boolean_t balanced; + unsigned int i; + + /* + * It's important here that pages are moved from higher priority + * segments first. + */ + + for (i = 0; i < vm_page_segs_size; i++) { + balanced = vm_page_seg_balance(vm_page_seg_get(i)); + + if (balanced) { + return TRUE; + } + } + + return FALSE; +} + +boolean_t +vm_page_balance(void) +{ + boolean_t balanced; + + while (vm_page_may_balance()) { + balanced = vm_page_balance_once(); + + if (!balanced) { + break; + } + } + + return vm_page_check_usable(); +} + +static boolean_t +vm_page_evict_once(boolean_t external_only, boolean_t alloc_paused) +{ + boolean_t evicted; + unsigned int i; + + /* + * It's important here that pages are evicted from lower priority + * segments first. 
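+ *
+ * The segment index is unsigned, so the backward loop below stops
+ * once the decrement wraps around past zero.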
+ */ + + for (i = vm_page_segs_size - 1; i < vm_page_segs_size; i--) { + evicted = vm_page_seg_evict(vm_page_seg_get(i), + external_only, alloc_paused); + + if (evicted) { + return TRUE; + } + } + + return FALSE; +} + +#define VM_PAGE_MAX_LAUNDRY 5 +#define VM_PAGE_MAX_EVICTIONS 5 + +boolean_t +vm_page_evict(boolean_t *should_wait) +{ + boolean_t pause, evicted, external_only, alloc_paused; + unsigned int i; + + *should_wait = TRUE; + external_only = TRUE; + + simple_lock(&vm_page_queue_free_lock); + vm_page_external_laundry_count = 0; + alloc_paused = vm_page_alloc_paused; + simple_unlock(&vm_page_queue_free_lock); + +again: + vm_page_lock_queues(); + pause = (vm_page_laundry_count >= VM_PAGE_MAX_LAUNDRY); + vm_page_unlock_queues(); + + if (pause) { + simple_lock(&vm_page_queue_free_lock); + return FALSE; + } + + for (i = 0; i < VM_PAGE_MAX_EVICTIONS; i++) { + evicted = vm_page_evict_once(external_only, alloc_paused); + + if (!evicted) { + break; + } + } + + simple_lock(&vm_page_queue_free_lock); + + /* + * Keep in mind eviction may not cause pageouts, since non-precious + * clean pages are simply released. + */ + if ((vm_page_laundry_count == 0) && (vm_page_external_laundry_count == 0)) { + /* + * No pageout, but some clean pages were freed. Start a complete + * scan again without waiting. + */ + if (evicted) { + *should_wait = FALSE; + return FALSE; + } + + /* + * Eviction failed, consider pages from internal objects on the + * next attempt. + */ + if (external_only) { + simple_unlock(&vm_page_queue_free_lock); + external_only = FALSE; + goto again; + } + + /* + * TODO Find out what could cause this and how to deal with it. + * This will likely require an out-of-memory killer. + */ + + { + static boolean_t warned = FALSE; + + if (!warned) { + printf("vm_page warning: unable to recycle any page\n"); + warned = 1; + } + } + } + + simple_unlock(&vm_page_queue_free_lock); + + return vm_page_check_usable(); +} + +void +vm_page_refill_inactive(void) +{ + unsigned int i; + + vm_page_lock_queues(); + + for (i = 0; i < vm_page_segs_size; i++) { + vm_page_seg_refill_inactive(vm_page_seg_get(i)); + } + + vm_page_unlock_queues(); +} + +void +vm_page_wait(void (*continuation)(void)) +{ + assert(!current_thread()->vm_privilege); + + simple_lock(&vm_page_queue_free_lock); + + if (!vm_page_alloc_paused) { + simple_unlock(&vm_page_queue_free_lock); + return; + } + + assert_wait(&vm_page_alloc_paused, FALSE); + + simple_unlock(&vm_page_queue_free_lock); + + if (continuation != 0) { + counter(c_vm_page_wait_block_user++); + thread_block(continuation); + } else { + counter(c_vm_page_wait_block_kernel++); + thread_block((void (*)(void)) 0); + } +} + +#if MACH_KDB +#include <ddb/db_output.h> +#define PAGES_PER_MB ((1<<20) / PAGE_SIZE) +void db_show_vmstat(void) +{ + integer_t free_count = vm_page_mem_free(); + unsigned i; + + db_printf("%-20s %10uM\n", "size:", + (free_count + vm_page_active_count + + vm_page_inactive_count + vm_page_wire_count) + / PAGES_PER_MB); + + db_printf("%-20s %10uM\n", "free:", + free_count / PAGES_PER_MB); + db_printf("%-20s %10uM\n", "active:", + vm_page_active_count / PAGES_PER_MB); + db_printf("%-20s %10uM\n", "inactive:", + vm_page_inactive_count / PAGES_PER_MB); + db_printf("%-20s %10uM\n", "wired:", + vm_page_wire_count / PAGES_PER_MB); + + db_printf("%-20s %10uM\n", "zero filled:", + vm_stat.zero_fill_count / PAGES_PER_MB); + db_printf("%-20s %10uM\n", "reactivated:", + vm_stat.reactivations / PAGES_PER_MB); + db_printf("%-20s %10uM\n", "pageins:", + vm_stat.pageins / 
PAGES_PER_MB); + db_printf("%-20s %10uM\n", "pageouts:", + vm_stat.pageouts / PAGES_PER_MB); + db_printf("%-20s %10uM\n", "page faults:", + vm_stat.faults / PAGES_PER_MB); + db_printf("%-20s %10uM\n", "cow faults:", + vm_stat.cow_faults / PAGES_PER_MB); + db_printf("%-20s %10u%\n", "memobj hit ratio:", + (vm_stat.hits * 100) / vm_stat.lookups); + + db_printf("%-20s %10u%\n", "cached_memobjs", + vm_object_external_count); + db_printf("%-20s %10uM\n", "cache", + vm_object_external_pages / PAGES_PER_MB); + + for (i = 0; i < vm_page_segs_size; i++) + { + db_printf("\nSegment %s:\n", vm_page_seg_name(i)); + db_printf("%-20s %10uM\n", "size:", + vm_page_seg_size(&vm_page_segs[i]) >> 20); + db_printf("%-20s %10uM\n", "free:", + vm_page_segs[i].nr_free_pages / PAGES_PER_MB); + db_printf("%-20s %10uM\n", "min_free:", + vm_page_segs[i].min_free_pages / PAGES_PER_MB); + db_printf("%-20s %10uM\n", "low_free:", + vm_page_segs[i].low_free_pages / PAGES_PER_MB); + db_printf("%-20s %10uM\n", "high_free:", + vm_page_segs[i].high_free_pages / PAGES_PER_MB); + db_printf("%-20s %10uM\n", "active:", + vm_page_segs[i].nr_active_pages / PAGES_PER_MB); + db_printf("%-20s %10uM\n", "high active:", + vm_page_segs[i].high_active_pages / PAGES_PER_MB); + db_printf("%-20s %10uM\n", "inactive:", + vm_page_segs[i].nr_inactive_pages / PAGES_PER_MB); + } +} +#endif /* MACH_KDB */ diff --git a/vm/vm_page.h b/vm/vm_page.h new file mode 100644 index 0000000..3be75f1 --- /dev/null +++ b/vm/vm_page.h @@ -0,0 +1,567 @@ +/* + * Mach Operating System + * Copyright (c) 1993-1988 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_page.h + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * Date: 1985 + * + * Resident memory system definitions. + */ + +#ifndef _VM_VM_PAGE_H_ +#define _VM_VM_PAGE_H_ + +#include <mach/boolean.h> +#include <mach/vm_prot.h> +#include <machine/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_types.h> +#include <kern/queue.h> +#include <kern/list.h> +#include <kern/lock.h> +#include <kern/log2.h> + +#include <kern/macros.h> +#include <kern/sched_prim.h> /* definitions of wait/wakeup */ + +#if MACH_VM_DEBUG +#include <mach_debug/hash_info.h> +#endif + +/* + * Management of resident (logical) pages. + * + * A small structure is kept for each resident + * page, indexed by page number. Each structure + * is an element of several lists: + * + * A hash table bucket used to quickly + * perform object/offset lookups + * + * A list of all pages for a given object, + * so they can be quickly deactivated at + * time of deallocation. 
+ * + * An ordered list of pages due for pageout. + * + * In addition, the structure contains the object + * and offset to which this page belongs (for pageout), + * and sundry status bits. + * + * Fields in this structure are locked either by the lock on the + * object that the page belongs to (O) or by the lock on the page + * queues (P). [Some fields require that both locks be held to + * change that field; holding either lock is sufficient to read.] + */ + +struct vm_page { + struct list node; /* page queues or free list (P) */ + void *priv; + + /* + * This member is used throughout the code and may only change for + * fictitious pages. + */ + phys_addr_t phys_addr; + + queue_chain_t listq; /* all pages in same object (O) */ + struct vm_page *next; /* VP bucket link (O) */ + + /* We use an empty struct as the delimiter. */ + struct {} vm_page_header; + + vm_object_t object; /* which object am I in (O,P) */ + vm_offset_t offset; /* offset into that object (O,P) */ + + unsigned int wire_count:15, /* how many wired down maps use me? + (O&P) */ + /* boolean_t */ inactive:1, /* page is in inactive list (P) */ + active:1, /* page is in active list (P) */ + laundry:1, /* page is being cleaned now (P)*/ + external_laundry:1, /* same as laundry for external pagers (P)*/ + free:1, /* page is on free list (P) */ + reference:1, /* page has been used (P) */ + external:1, /* page in external object (P) */ + busy:1, /* page is in transit (O) */ + wanted:1, /* someone is waiting for page (O) */ + tabled:1, /* page is in VP table (O) */ + fictitious:1, /* Physical page doesn't exist (O) */ + private:1, /* Page should not be returned to + * the free list (O) */ + absent:1, /* Data has been requested, but is + * not yet available (O) */ + error:1, /* Data manager was unable to provide + * data due to error (O) */ + dirty:1, /* Page must be cleaned (O) */ + precious:1, /* Page is precious; data must be + * returned even if clean (O) */ + overwriting:1; /* Request to unlock has been made + * without having data. (O) + * [See vm_object_overwrite] */ + + vm_prot_t page_lock:3; /* Uses prohibited by data manager (O) */ + vm_prot_t unlock_request:3; /* Outstanding unlock request (O) */ + + struct {} vm_page_footer; + + unsigned short type:2; + unsigned short seg_index:2; + unsigned short order:4; +}; + +#define VM_PAGE_BODY_SIZE \ + (offsetof(struct vm_page, vm_page_footer) \ + - offsetof(struct vm_page, vm_page_header)) + +/* + * For debugging, this macro can be defined to perform + * some useful check on a page structure. + */ + +#define VM_PAGE_CHECK(mem) vm_page_check(mem) + +void vm_page_check(const struct vm_page *page); + +/* + * Each pageable resident page falls into one of three lists: + * + * free + * Available for allocation now. + * inactive + * Not referenced in any map, but still has an + * object/offset-page mapping, and may be dirty. + * This is the list of pages that should be + * paged out next. + * active + * A list of pages which have been placed in + * at least one physical map. This list is + * ordered, in LRU-like fashion. + */ + +#define VM_PAGE_DMA 0x01 +#if defined(VM_PAGE_DMA32_LIMIT) && VM_PAGE_DMA32_LIMIT > VM_PAGE_DIRECTMAP_LIMIT +#define VM_PAGE_DIRECTMAP 0x02 +#define VM_PAGE_DMA32 0x04 +#else +#define VM_PAGE_DMA32 0x02 +#define VM_PAGE_DIRECTMAP 0x04 +#endif +#define VM_PAGE_HIGHMEM 0x08 + +extern +int vm_page_fictitious_count;/* How many fictitious pages are free? */ +extern +int vm_page_active_count; /* How many pages are active? 
*/ +extern +int vm_page_inactive_count; /* How many pages are inactive? */ +extern +int vm_page_wire_count; /* How many pages are wired? */ +extern +int vm_page_laundry_count; /* How many pages being laundered? */ +extern +int vm_page_external_laundry_count; /* How many external pages being paged out? */ + +decl_simple_lock_data(extern,vm_page_queue_lock)/* lock on active and inactive + page queues */ +decl_simple_lock_data(extern,vm_page_queue_free_lock) + /* lock on free page queue */ + +extern phys_addr_t vm_page_fictitious_addr; + /* (fake) phys_addr of fictitious pages */ + +extern void vm_page_bootstrap( + vm_offset_t *startp, + vm_offset_t *endp); +extern void vm_page_module_init(void); + +extern vm_page_t vm_page_lookup( + vm_object_t object, + vm_offset_t offset); +extern vm_page_t vm_page_grab_fictitious(void); +extern boolean_t vm_page_convert(vm_page_t *); +extern void vm_page_more_fictitious(void); +extern vm_page_t vm_page_grab(unsigned flags); +extern void vm_page_release(vm_page_t, boolean_t, boolean_t); +extern phys_addr_t vm_page_grab_phys_addr(void); +extern vm_page_t vm_page_grab_contig(vm_size_t, unsigned int); +extern void vm_page_free_contig(vm_page_t, vm_size_t); +extern void vm_page_wait(void (*)(void)); +extern vm_page_t vm_page_alloc( + vm_object_t object, + vm_offset_t offset); +extern void vm_page_init( + vm_page_t mem); +extern void vm_page_free(vm_page_t); +extern void vm_page_activate(vm_page_t); +extern void vm_page_deactivate(vm_page_t); +extern void vm_page_rename( + vm_page_t mem, + vm_object_t new_object, + vm_offset_t new_offset); +extern void vm_page_insert( + vm_page_t mem, + vm_object_t object, + vm_offset_t offset); +extern void vm_page_remove( + vm_page_t mem); + +extern void vm_page_zero_fill(vm_page_t); +extern void vm_page_copy(vm_page_t src_m, vm_page_t dest_m); + +extern void vm_page_wire(vm_page_t); +extern void vm_page_unwire(vm_page_t); + +#if MACH_VM_DEBUG +extern unsigned int vm_page_info( + hash_info_bucket_t *info, + unsigned int count); +#endif + +/* + * Functions implemented as macros + */ + +#define PAGE_ASSERT_WAIT(m, interruptible) \ + MACRO_BEGIN \ + (m)->wanted = TRUE; \ + assert_wait((event_t) (m), (interruptible)); \ + MACRO_END + +#define PAGE_WAKEUP_DONE(m) \ + MACRO_BEGIN \ + (m)->busy = FALSE; \ + if ((m)->wanted) { \ + (m)->wanted = FALSE; \ + thread_wakeup(((event_t) m)); \ + } \ + MACRO_END + +#define PAGE_WAKEUP(m) \ + MACRO_BEGIN \ + if ((m)->wanted) { \ + (m)->wanted = FALSE; \ + thread_wakeup((event_t) (m)); \ + } \ + MACRO_END + +#define VM_PAGE_FREE(p) \ + MACRO_BEGIN \ + vm_page_lock_queues(); \ + vm_page_free(p); \ + vm_page_unlock_queues(); \ + MACRO_END + +/* + * Macro to be used in place of pmap_enter() + */ + +#define PMAP_ENTER(pmap, virtual_address, page, protection, wired) \ + MACRO_BEGIN \ + pmap_enter( \ + (pmap), \ + (virtual_address), \ + (page)->phys_addr, \ + (protection) & ~(page)->page_lock, \ + (wired) \ + ); \ + MACRO_END + +#define VM_PAGE_WAIT(continuation) vm_page_wait(continuation) + +#define vm_page_lock_queues() simple_lock(&vm_page_queue_lock) +#define vm_page_unlock_queues() simple_unlock(&vm_page_queue_lock) + +#define VM_PAGE_QUEUES_REMOVE(mem) vm_page_queues_remove(mem) + +/* + * Copyright (c) 2010-2014 Richard Braun. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * Physical page management. + */ + +/* + * Address/page conversion and rounding macros (not inline functions to + * be easily usable on both virtual and physical addresses, which may not + * have the same type size). + */ +#define vm_page_atop(addr) ((addr) >> PAGE_SHIFT) +#define vm_page_ptoa(page) ((page) << PAGE_SHIFT) +#define vm_page_trunc(addr) P2ALIGN(addr, PAGE_SIZE) +#define vm_page_round(addr) P2ROUND(addr, PAGE_SIZE) +#define vm_page_aligned(addr) P2ALIGNED(addr, PAGE_SIZE) + +/* + * Segment selectors. + * + * Selector-to-segment-list translation table : + * DMA DMA + * if 32bit PAE + * DIRECTMAP DMA32 DMA + * DMA32 DMA32 DIRECTMAP DMA + * HIGHMEM HIGHMEM DMA32 DIRECTMAP DMA + * else + * DMA32 DMA32 DMA + * DIRECTMAP DIRECTMAP DMA32 DMA + * HIGHMEM HIGHMEM DIRECTMAP DMA32 DMA + * endif + */ +#define VM_PAGE_SEL_DMA 0 +#if defined(VM_PAGE_DMA32_LIMIT) && VM_PAGE_DMA32_LIMIT > VM_PAGE_DIRECTMAP_LIMIT +#define VM_PAGE_SEL_DIRECTMAP 1 +#define VM_PAGE_SEL_DMA32 2 +#else +#define VM_PAGE_SEL_DMA32 1 +#define VM_PAGE_SEL_DIRECTMAP 2 +#endif +#define VM_PAGE_SEL_HIGHMEM 3 + +/* + * Page usage types. + */ +#define VM_PT_FREE 0 /* Page unused */ +#define VM_PT_RESERVED 1 /* Page reserved at boot time */ +#define VM_PT_TABLE 2 /* Page is part of the page table */ +#define VM_PT_KERNEL 3 /* Type for generic kernel allocations */ + +static inline unsigned short +vm_page_type(const struct vm_page *page) +{ + return page->type; +} + +void vm_page_set_type(struct vm_page *page, unsigned int order, + unsigned short type); + +static inline unsigned int +vm_page_order(size_t size) +{ + return iorder2(vm_page_atop(vm_page_round(size))); +} + +static inline phys_addr_t +vm_page_to_pa(const struct vm_page *page) +{ + return page->phys_addr; +} + +/* + * Associate private data with a page. + */ +static inline void +vm_page_set_priv(struct vm_page *page, void *priv) +{ + page->priv = priv; +} + +static inline void * +vm_page_get_priv(const struct vm_page *page) +{ + return page->priv; +} + +/* + * Load physical memory into the vm_page module at boot time. + * + * All addresses must be page-aligned. Segments can be loaded in any order. + */ +void vm_page_load(unsigned int seg_index, phys_addr_t start, phys_addr_t end); + +/* + * Load available physical memory into the vm_page module at boot time. + * + * The segment referred to must have been loaded with vm_page_load + * before loading its heap. + */ +void vm_page_load_heap(unsigned int seg_index, phys_addr_t start, + phys_addr_t end); + +/* + * Return true if the vm_page module is completely initialized, false + * otherwise, in which case only vm_page_bootalloc() can be used for + * allocations. + */ +int vm_page_ready(void); + +/* + * Early allocation function. + * + * This function is used by the vm_resident module to implement + * pmap_steal_memory. It can be used after physical segments have been loaded + * and before the vm_page module is initialized. + */ +phys_addr_t vm_page_bootalloc(size_t size); + +/* + * Set up the vm_page module. + * + * Architecture-specific code must have loaded segments before calling this + * function. 
Segments must comply with the selector-to-segment-list table, + * e.g. HIGHMEM is loaded if and only if DIRECTMAP, DMA32 and DMA are loaded, + * notwithstanding segment aliasing. + * + * Once this function returns, the vm_page module is ready, and normal + * allocation functions can be used. + */ +void vm_page_setup(void); + +/* + * Make the given page managed by the vm_page module. + * + * If additional memory can be made usable after the VM system is initialized, + * it should be reported through this function. + */ +void vm_page_manage(struct vm_page *page); + +/* + * Return the page descriptor for the given physical address. + */ +struct vm_page * vm_page_lookup_pa(phys_addr_t pa); + +/* + * Allocate a block of 2^order physical pages. + * + * The selector is used to determine the segments from which allocation can + * be attempted. + * + * This function should only be used by the vm_resident module. + */ +struct vm_page * vm_page_alloc_pa(unsigned int order, unsigned int selector, + unsigned short type); + +/* + * Release a block of 2^order physical pages. + * + * This function should only be used by the vm_resident module. + */ +void vm_page_free_pa(struct vm_page *page, unsigned int order); + +/* + * Return the name of the given segment. + */ +const char * vm_page_seg_name(unsigned int seg_index); + +/* + * Display internal information about the module. + */ +void vm_page_info_all(void); + +/* + * Return the maximum physical address for a given segment selector. + */ +phys_addr_t vm_page_seg_end(unsigned int selector); + +/* + * Return the total number of physical pages. + */ +unsigned long vm_page_table_size(void); + +/* + * Return the index of a page in the page table. + */ +unsigned long vm_page_table_index(phys_addr_t pa); + +/* + * Return the total amount of physical memory. + */ +phys_addr_t vm_page_mem_size(void); + +/* + * Return the amount of free (unused) pages. + * + * XXX This currently relies on the kernel being non preemptible and + * uniprocessor. + */ +unsigned long vm_page_mem_free(void); + +/* + * Remove the given page from any page queue it might be in. + */ +void vm_page_queues_remove(struct vm_page *page); + +/* + * Balance physical pages among segments. + * + * This function should be called first by the pageout daemon + * on memory pressure, since it may be unnecessary to perform any + * other operation, let alone shrink caches, if balancing is + * enough to make enough free pages. + * + * Return TRUE if balancing made enough free pages for unprivileged + * allocations to succeed, in which case pending allocations are resumed. + * + * This function acquires vm_page_queue_free_lock, which is held on return. + */ +boolean_t vm_page_balance(void); + +/* + * Evict physical pages. + * + * This function should be called by the pageout daemon after balancing + * the segments and shrinking kernel caches. + * + * Return TRUE if eviction made enough free pages for unprivileged + * allocations to succeed, in which case pending allocations are resumed. + * + * Otherwise, report whether the pageout daemon should wait (some pages + * have been paged out) or not (only clean pages have been released). + * + * This function acquires vm_page_queue_free_lock, which is held on return. + */ +boolean_t vm_page_evict(boolean_t *should_wait); + +/* + * Turn active pages into inactive ones for second-chance LRU + * approximation. + * + * This function should be called by the pageout daemon on memory pressure, + * i.e. right before evicting pages. 
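+ *
+ * Only the surplus above each segment's high_active_pages threshold is
+ * moved, and the reference bits are cleared so that a later access can
+ * bring the page back to the active queue.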
+ * + * XXX This is probably not the best strategy, compared to keeping the + * active/inactive ratio in check at all times, but this means less + * frequent refills. + */ +void vm_page_refill_inactive(void); + +/* + * Print vmstat information + */ +void db_show_vmstat(void); + +#endif /* _VM_VM_PAGE_H_ */ diff --git a/vm/vm_pageout.c b/vm/vm_pageout.c new file mode 100644 index 0000000..e2f4cf2 --- /dev/null +++ b/vm/vm_pageout.c @@ -0,0 +1,515 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_pageout.c + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * Date: 1985 + * + * The proverbial page-out daemon. + */ + +#include <device/net_io.h> +#include <mach/mach_types.h> +#include <mach/memory_object.h> +#include <vm/memory_object_default.user.h> +#include <vm/memory_object_user.user.h> +#include <mach/vm_param.h> +#include <mach/vm_statistics.h> +#include <kern/counters.h> +#include <kern/debug.h> +#include <kern/slab.h> +#include <kern/task.h> +#include <kern/thread.h> +#include <kern/printf.h> +#include <vm/memory_object.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> +#include <machine/locore.h> + +#define DEBUG 0 + +/* + * Maximum delay, in milliseconds, between two pageout scans. + */ +#define VM_PAGEOUT_TIMEOUT 50 + +/* + * Event placeholder for pageout requests, synchronized with + * the free page queue lock. + */ +static int vm_pageout_requested; + +/* + * Event placeholder for pageout throttling, synchronized with + * the free page queue lock. + */ +static int vm_pageout_continue; + +/* + * Routine: vm_pageout_setup + * Purpose: + * Set up a page for pageout. + * + * Move or copy the page to a new object, as part + * of which it will be sent to its memory manager + * in a memory_object_data_return or memory_object_initialize + * message. + * + * The "paging_offset" argument specifies the offset + * of the page within its external memory object. + * + * The "new_object" and "new_offset" arguments + * indicate where the page should be moved. + * + * The "flush" argument specifies whether the page + * should be flushed from its object. If not, a + * copy of the page is moved to the new object. + * + * In/Out conditions: + * The page in question must not be on any pageout queues, + * and must be busy. 
The object to which it belongs + * must be unlocked, and the caller must hold a paging + * reference to it. The new_object must not be locked. + * + * If the page is flushed from its original object, + * this routine returns a pointer to a place-holder page, + * inserted at the same offset, to block out-of-order + * requests for the page. The place-holder page must + * be freed after the data_return or initialize message + * has been sent. If the page is copied, + * the holding page is VM_PAGE_NULL. + * + * The original page is put on a paging queue and marked + * not busy on exit. + */ +vm_page_t +vm_pageout_setup( + vm_page_t m, + vm_offset_t paging_offset, + vm_object_t new_object, + vm_offset_t new_offset, + boolean_t flush) +{ + vm_object_t old_object = m->object; + vm_page_t holding_page = 0; /*'=0'to quiet gcc warnings*/ + vm_page_t new_m; + + assert(m->busy && !m->absent && !m->fictitious); + + /* + * If we are not flushing the page, allocate a + * page in the object. + */ + if (!flush) { + for (;;) { + vm_object_lock(new_object); + new_m = vm_page_alloc(new_object, new_offset); + vm_object_unlock(new_object); + + if (new_m != VM_PAGE_NULL) { + break; + } + + VM_PAGE_WAIT(NULL); + } + } + + if (flush) { + /* + * Create a place-holder page where the old one was, + * to prevent anyone from attempting to page in this + * page while we`re unlocked. + */ + while ((holding_page = vm_page_grab_fictitious()) + == VM_PAGE_NULL) + vm_page_more_fictitious(); + + vm_object_lock(old_object); + vm_page_lock_queues(); + vm_page_remove(m); + vm_page_unlock_queues(); + PAGE_WAKEUP_DONE(m); + + vm_page_lock_queues(); + vm_page_insert(holding_page, old_object, m->offset); + vm_page_unlock_queues(); + + /* + * Record that this page has been written out + */ +#if MACH_PAGEMAP + vm_external_state_set(old_object->existence_info, + paging_offset, + VM_EXTERNAL_STATE_EXISTS); +#endif /* MACH_PAGEMAP */ + + vm_object_unlock(old_object); + + vm_object_lock(new_object); + + /* + * Move this page into the new object + */ + + vm_page_lock_queues(); + vm_page_insert(m, new_object, new_offset); + vm_page_unlock_queues(); + + m->dirty = TRUE; + m->precious = FALSE; + m->page_lock = VM_PROT_NONE; + m->unlock_request = VM_PROT_NONE; + } + else { + /* + * Copy the data into the new page, + * and mark the new page as clean. + */ + vm_page_copy(m, new_m); + + vm_object_lock(old_object); + m->dirty = FALSE; + pmap_clear_modify(m->phys_addr); + + /* + * Deactivate old page. + */ + vm_page_lock_queues(); + vm_page_deactivate(m); + vm_page_unlock_queues(); + + PAGE_WAKEUP_DONE(m); + + /* + * Record that this page has been written out + */ + +#if MACH_PAGEMAP + vm_external_state_set(old_object->existence_info, + paging_offset, + VM_EXTERNAL_STATE_EXISTS); +#endif /* MACH_PAGEMAP */ + + vm_object_unlock(old_object); + + vm_object_lock(new_object); + + /* + * Use the new page below. + */ + m = new_m; + m->dirty = TRUE; + assert(!m->precious); + PAGE_WAKEUP_DONE(m); + } + + /* + * Make the old page eligible for replacement again; if a + * user-supplied memory manager fails to release the page, + * it will be paged out again to the default memory manager. + * + * Note that pages written to the default memory manager + * must be wired down -- in return, it guarantees to free + * this page, rather than reusing it. + */ + + vm_page_lock_queues(); + vm_stat.pageouts++; + if (m->laundry) { + + /* + * The caller is telling us that it is going to + * immediately double page this page to the default + * pager. 
+ */ + + assert(!old_object->internal); + m->laundry = FALSE; + } else if (old_object->internal || + memory_manager_default_port(old_object->pager)) { + m->laundry = TRUE; + vm_page_laundry_count++; + + vm_page_wire(m); + } else { + m->external_laundry = TRUE; + + /* + * If vm_page_external_laundry_count is negative, + * the pageout daemon isn't expecting to be + * notified. + */ + + if (vm_page_external_laundry_count >= 0) { + vm_page_external_laundry_count++; + } + + vm_page_activate(m); + } + vm_page_unlock_queues(); + + /* + * Since IPC operations may block, we drop locks now. + * [The placeholder page is busy, and we still have + * paging_in_progress incremented.] + */ + + vm_object_unlock(new_object); + + /* + * Return the placeholder page to simplify cleanup. + */ + return (flush ? holding_page : VM_PAGE_NULL); +} + +/* + * Routine: vm_pageout_page + * Purpose: + * Causes the specified page to be written back to + * the appropriate memory object. + * + * The "initial" argument specifies whether this + * data is an initialization only, and should use + * memory_object_data_initialize instead of + * memory_object_data_return. + * + * The "flush" argument specifies whether the page + * should be flushed from the object. If not, a + * copy of the data is sent to the memory object. + * + * In/out conditions: + * The page in question must not be on any pageout queues. + * The object to which it belongs must be locked. + * Implementation: + * Move this page to a completely new object, if flushing; + * copy to a new page in a new object, if not. + */ +void +vm_pageout_page( + vm_page_t m, + boolean_t initial, + boolean_t flush) +{ + vm_map_copy_t copy; + vm_object_t old_object; + vm_object_t new_object; + vm_page_t holding_page; + vm_offset_t paging_offset; + kern_return_t rc; + boolean_t precious_clean; + + assert(m->busy); + + /* + * Cleaning but not flushing a clean precious page is a + * no-op. Remember whether page is clean and precious now + * because vm_pageout_setup will mark it dirty and not precious. + * + * XXX Check if precious_clean && !flush can really happen. + */ + precious_clean = (!m->dirty) && m->precious; + if (precious_clean && !flush) { + PAGE_WAKEUP_DONE(m); + return; + } + + /* + * Verify that we really want to clean this page. + */ + if (m->absent || m->error || (!m->dirty && !m->precious)) { + VM_PAGE_FREE(m); + return; + } + + /* + * Create a paging reference to let us play with the object. + */ + old_object = m->object; + paging_offset = m->offset + old_object->paging_offset; + vm_object_paging_begin(old_object); + vm_object_unlock(old_object); + + /* + * Allocate a new object into which we can put the page. + */ + new_object = vm_object_allocate(PAGE_SIZE); + new_object->used_for_pageout = TRUE; + + /* + * Move the page into the new object. + */ + holding_page = vm_pageout_setup(m, + paging_offset, + new_object, + 0, /* new offset */ + flush); /* flush */ + + rc = vm_map_copyin_object(new_object, 0, PAGE_SIZE, ©); + assert(rc == KERN_SUCCESS); + + if (initial) { + rc = memory_object_data_initialize( + old_object->pager, + old_object->pager_request, + paging_offset, (pointer_t) copy, PAGE_SIZE); + } + else { + rc = memory_object_data_return( + old_object->pager, + old_object->pager_request, + paging_offset, (pointer_t) copy, PAGE_SIZE, + !precious_clean, !flush); + } + + if (rc != KERN_SUCCESS) + vm_map_copy_discard(copy); + + /* + * Clean up. 
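+ * Free the holding page, if any, and drop the paging reference taken
+ * on the old object earlier in this routine.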
+ */ + vm_object_lock(old_object); + if (holding_page != VM_PAGE_NULL) + VM_PAGE_FREE(holding_page); + vm_object_paging_end(old_object); +} + +/* + * vm_pageout_scan does the dirty work for the pageout daemon. + * + * Return TRUE if the pageout daemon is done for now, FALSE otherwise, + * in which case should_wait indicates whether the pageout daemon + * should wait to allow pagers to keep up. + * + * It returns with vm_page_queue_free_lock held. + */ + +static boolean_t vm_pageout_scan(boolean_t *should_wait) +{ + boolean_t done; + + /* + * Try balancing pages among segments first, since this + * may be enough to resume unprivileged allocations. + */ + + /* This function returns with vm_page_queue_free_lock held */ + done = vm_page_balance(); + + if (done) { + return TRUE; + } + + simple_unlock(&vm_page_queue_free_lock); + + /* + * Balancing is not enough. Shrink caches and scan pages + * for eviction. + */ + + stack_collect(); + net_kmsg_collect(); + consider_task_collect(); + if (0) /* XXX: pcb_collect doesn't do anything yet, so it is + pointless to call consider_thread_collect. */ + consider_thread_collect(); + + /* + * slab_collect should be last, because the other operations + * might return memory to caches. + */ + slab_collect(); + + vm_page_refill_inactive(); + + /* This function returns with vm_page_queue_free_lock held */ + return vm_page_evict(should_wait); +} + +void vm_pageout(void) +{ + boolean_t done, should_wait; + + current_thread()->vm_privilege = 1; + stack_privilege(current_thread()); + thread_set_own_priority(0); + + for (;;) { + done = vm_pageout_scan(&should_wait); + /* we hold vm_page_queue_free_lock now */ + + if (done) { + thread_sleep(&vm_pageout_requested, + simple_lock_addr(vm_page_queue_free_lock), + FALSE); + } else if (should_wait) { + assert_wait(&vm_pageout_continue, FALSE); + thread_set_timeout(VM_PAGEOUT_TIMEOUT * hz / 1000); + simple_unlock(&vm_page_queue_free_lock); + thread_block(NULL); + +#if DEBUG + if (current_thread()->wait_result != THREAD_AWAKENED) { + printf("vm_pageout: timeout," + " vm_page_laundry_count:%d" + " vm_page_external_laundry_count:%d\n", + vm_page_laundry_count, + vm_page_external_laundry_count); + } +#endif + } else { + simple_unlock(&vm_page_queue_free_lock); + } + } +} + +/* + * Start pageout + * + * The free page queue lock must be held before calling this function. + */ +void vm_pageout_start(void) +{ + if (!current_thread()) + return; + + thread_wakeup_one(&vm_pageout_requested); +} + +/* + * Resume pageout + * + * The free page queue lock must be held before calling this function. + */ +void vm_pageout_resume(void) +{ + thread_wakeup_one(&vm_pageout_continue); +} diff --git a/vm/vm_pageout.h b/vm/vm_pageout.h new file mode 100644 index 0000000..6ddd821 --- /dev/null +++ b/vm/vm_pageout.h @@ -0,0 +1,53 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_pageout.h + * Author: Avadis Tevanian, Jr. + * Date: 1986 + * + * Declarations for the pageout daemon interface. + */ + +#ifndef _VM_VM_PAGEOUT_H_ +#define _VM_VM_PAGEOUT_H_ + +#include <vm/vm_page.h> + +/* + * Exported routines. + */ + +extern vm_page_t vm_pageout_setup(vm_page_t, vm_offset_t, vm_object_t, + vm_offset_t, boolean_t); +extern void vm_pageout_page(vm_page_t, boolean_t, boolean_t); + +extern void vm_pageout(void) __attribute__((noreturn)); + +extern void vm_pageout_start(void); + +extern void vm_pageout_resume(void); + +#endif /* _VM_VM_PAGEOUT_H_ */ diff --git a/vm/vm_print.h b/vm/vm_print.h new file mode 100644 index 0000000..8a36d75 --- /dev/null +++ b/vm/vm_print.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2013 Free Software Foundation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef VM_PRINT_H +#define VM_PRINT_H + +#include <vm/vm_map.h> +#include <machine/db_machdep.h> + +/* Debugging: print a map */ +extern void vm_map_print(db_expr_t addr, boolean_t have_addr, + db_expr_t count, const char *modif); + +/* Pretty-print a copy object for ddb. */ +extern void vm_map_copy_print(const vm_map_copy_t); + +#include <vm/vm_object.h> + +extern void vm_object_print(vm_object_t); + +#include <vm/vm_page.h> + +extern void vm_page_print(const vm_page_t); + +#endif /* VM_PRINT_H */ + diff --git a/vm/vm_resident.c b/vm/vm_resident.c new file mode 100644 index 0000000..3f0cc90 --- /dev/null +++ b/vm/vm_resident.c @@ -0,0 +1,1116 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_resident.c + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * + * Resident memory management module. + */ + +#include <kern/printf.h> +#include <string.h> + +#include <mach/vm_prot.h> +#include <kern/counters.h> +#include <kern/debug.h> +#include <kern/list.h> +#include <kern/sched_prim.h> +#include <kern/task.h> +#include <kern/thread.h> +#include <mach/vm_statistics.h> +#include <machine/vm_param.h> +#include <kern/xpr.h> +#include <kern/slab.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> +#include <vm/vm_kern.h> +#include <vm/vm_resident.h> + +#if MACH_VM_DEBUG +#include <mach/kern_return.h> +#include <mach_debug/hash_info.h> +#include <vm/vm_user.h> +#endif + +#if MACH_KDB +#include <ddb/db_output.h> +#include <vm/vm_print.h> +#endif /* MACH_KDB */ + + +/* + * Associated with each page of user-allocatable memory is a + * page structure. + */ + +/* + * These variables record the values returned by vm_page_bootstrap, + * for debugging purposes. The implementation of pmap_steal_memory + * here also uses them internally. + */ + +vm_offset_t virtual_space_start; +vm_offset_t virtual_space_end; + +/* + * The vm_page_lookup() routine, which provides for fast + * (virtual memory object, offset) to page lookup, employs + * the following hash table. The vm_page_{insert,remove} + * routines install and remove associations in the table. + * [This table is often called the virtual-to-physical, + * or VP, table.] + */ +typedef struct { + decl_simple_lock_data(,lock) + vm_page_t pages; +} vm_page_bucket_t; + +vm_page_bucket_t *vm_page_buckets; /* Array of buckets */ +unsigned long vm_page_bucket_count = 0; /* How big is array? */ +unsigned long vm_page_hash_mask; /* Mask for hash function */ + +static struct list vm_page_queue_fictitious; +def_simple_lock_data(,vm_page_queue_free_lock) +int vm_page_fictitious_count; +int vm_object_external_count; +int vm_object_external_pages; + +/* + * Occasionally, the virtual memory system uses + * resident page structures that do not refer to + * real pages, for example to leave a page with + * important state information in the VP table. + * + * These page structures are allocated the way + * most other kernel structures are. + */ +struct kmem_cache vm_page_cache; + +/* + * Fictitious pages don't have a physical address, + * but we must initialize phys_addr to something. + * For debugging, this should be a strange value + * that the pmap module can recognize in assertions. + */ +phys_addr_t vm_page_fictitious_addr = (phys_addr_t) -1; + +/* + * Resident page structures are also chained on + * queues that are used by the page replacement + * system (pageout daemon). These queues are + * defined here, but are shared by the pageout + * module. + */ +def_simple_lock_data(,vm_page_queue_lock) +int vm_page_active_count; +int vm_page_inactive_count; +int vm_page_wire_count; + +/* + * Several page replacement parameters are also + * shared with this module, so that page allocation + * (done here in vm_page_alloc) can trigger the + * pageout daemon. 
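+ *
+ * The laundry counters declared below track pages currently being
+ * written back.  vm_page_release() decrements them and, when either
+ * count drains to zero, resumes the pageout daemon through
+ * vm_pageout_resume().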
+ */ +int vm_page_laundry_count = 0; +int vm_page_external_laundry_count = 0; + + +/* + * The VM system has a couple of heuristics for deciding + * that pages are "uninteresting" and should be placed + * on the inactive queue as likely candidates for replacement. + * These variables let the heuristics be controlled at run-time + * to make experimentation easier. + */ + +boolean_t vm_page_deactivate_behind = TRUE; +boolean_t vm_page_deactivate_hint = TRUE; + +/* + * vm_page_bootstrap: + * + * Initializes the resident memory module. + * + * Allocates memory for the page cells, and + * for the object/offset-to-page hash table headers. + * Each page cell is initialized and placed on the free list. + * Returns the range of available kernel virtual memory. + */ + +void vm_page_bootstrap( + vm_offset_t *startp, + vm_offset_t *endp) +{ + int i; + + /* + * Initialize the page queues. + */ + + simple_lock_init(&vm_page_queue_free_lock); + simple_lock_init(&vm_page_queue_lock); + + list_init(&vm_page_queue_fictitious); + + /* + * Allocate (and initialize) the virtual-to-physical + * table hash buckets. + * + * The number of buckets should be a power of two to + * get a good hash function. The following computation + * chooses the first power of two that is greater + * than the number of physical pages in the system. + */ + + if (vm_page_bucket_count == 0) { + unsigned long npages = vm_page_table_size(); + + vm_page_bucket_count = 1; + while (vm_page_bucket_count < npages) + vm_page_bucket_count <<= 1; + } + + vm_page_hash_mask = vm_page_bucket_count - 1; + + if (vm_page_hash_mask & vm_page_bucket_count) + printf("vm_page_bootstrap: WARNING -- strange page hash\n"); + + vm_page_buckets = (vm_page_bucket_t *) + pmap_steal_memory(vm_page_bucket_count * + sizeof(vm_page_bucket_t)); + + for (i = 0; i < vm_page_bucket_count; i++) { + vm_page_bucket_t *bucket = &vm_page_buckets[i]; + + bucket->pages = VM_PAGE_NULL; + simple_lock_init(&bucket->lock); + } + + vm_page_setup(); + + virtual_space_start = round_page(virtual_space_start); + virtual_space_end = trunc_page(virtual_space_end); + + *startp = virtual_space_start; + *endp = virtual_space_end; +} + +#ifndef MACHINE_PAGES +/* + * We implement pmap_steal_memory with the help + * of two simpler functions, pmap_virtual_space and vm_page_bootalloc. + */ + +vm_offset_t pmap_steal_memory( + vm_size_t size) +{ + vm_offset_t addr, vaddr; + phys_addr_t paddr; + + size = round_page(size); + + /* + * If this is the first call to pmap_steal_memory, + * we have to initialize ourself. + */ + + if (virtual_space_start == virtual_space_end) { + pmap_virtual_space(&virtual_space_start, &virtual_space_end); + + /* + * The initial values must be aligned properly, and + * we don't trust the pmap module to do it right. + */ + + virtual_space_start = round_page(virtual_space_start); + virtual_space_end = trunc_page(virtual_space_end); + } + + /* + * Allocate virtual memory for this request. + */ + + addr = virtual_space_start; + virtual_space_start += size; + + /* + * Allocate and map physical pages to back new virtual pages. + */ + + for (vaddr = round_page(addr); + vaddr < addr + size; + vaddr += PAGE_SIZE) { + paddr = vm_page_bootalloc(PAGE_SIZE); + + /* + * XXX Logically, these mappings should be wired, + * but some pmap modules barf if they are. 
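+		 * (Hence the FALSE passed as the wired argument to
+		 * pmap_enter() below.)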
+ */ + + pmap_enter(kernel_pmap, vaddr, paddr, + VM_PROT_READ|VM_PROT_WRITE, FALSE); + } + + return addr; +} +#endif /* MACHINE_PAGES */ + +/* + * Routine: vm_page_module_init + * Purpose: + * Second initialization pass, to be done after + * the basic VM system is ready. + */ +void vm_page_module_init(void) +{ + kmem_cache_init(&vm_page_cache, "vm_page", sizeof(struct vm_page), 0, + NULL, 0); +} + +/* + * vm_page_hash: + * + * Distributes the object/offset key pair among hash buckets. + * + * NOTE: To get a good hash function, the bucket count should + * be a power of two. + */ +#define vm_page_hash(object, offset) \ + (((unsigned int)(vm_offset_t)object + (unsigned int)atop(offset)) \ + & vm_page_hash_mask) + +/* + * vm_page_insert: [ internal use only ] + * + * Inserts the given mem entry into the object/object-page + * table and object list. + * + * The object and page must be locked. + * The free page queue must not be locked. + */ + +void vm_page_insert( + vm_page_t mem, + vm_object_t object, + vm_offset_t offset) +{ + vm_page_bucket_t *bucket; + + VM_PAGE_CHECK(mem); + + assert(!mem->active && !mem->inactive); + assert(!mem->external); + + if (!object->internal) { + mem->external = TRUE; + vm_object_external_pages++; + } + + if (mem->tabled) + panic("vm_page_insert"); + + /* + * Record the object/offset pair in this page + */ + + mem->object = object; + mem->offset = offset; + + /* + * Insert it into the object_object/offset hash table + */ + + bucket = &vm_page_buckets[vm_page_hash(object, offset)]; + simple_lock(&bucket->lock); + mem->next = bucket->pages; + bucket->pages = mem; + simple_unlock(&bucket->lock); + + /* + * Now link into the object's list of backed pages. + */ + + queue_enter(&object->memq, mem, vm_page_t, listq); + mem->tabled = TRUE; + + /* + * Show that the object has one more resident page. + */ + + object->resident_page_count++; + assert(object->resident_page_count != 0); + + /* + * Detect sequential access and inactivate previous page. + * We ignore busy pages. + */ + + if (vm_page_deactivate_behind && + (offset == object->last_alloc + PAGE_SIZE)) { + vm_page_t last_mem; + + last_mem = vm_page_lookup(object, object->last_alloc); + if ((last_mem != VM_PAGE_NULL) && !last_mem->busy) + vm_page_deactivate(last_mem); + } + object->last_alloc = offset; +} + +/* + * vm_page_replace: + * + * Exactly like vm_page_insert, except that we first + * remove any existing page at the given offset in object + * and we don't do deactivate-behind. + * + * The object and page must be locked. + * The free page queue must not be locked. + */ + +void vm_page_replace( + vm_page_t mem, + vm_object_t object, + vm_offset_t offset) +{ + vm_page_bucket_t *bucket; + + VM_PAGE_CHECK(mem); + + assert(!mem->active && !mem->inactive); + assert(!mem->external); + + if (!object->internal) { + mem->external = TRUE; + vm_object_external_pages++; + } + + if (mem->tabled) + panic("vm_page_replace"); + + /* + * Record the object/offset pair in this page + */ + + mem->object = object; + mem->offset = offset; + + /* + * Insert it into the object_object/offset hash table, + * replacing any page that might have been there. + */ + + bucket = &vm_page_buckets[vm_page_hash(object, offset)]; + simple_lock(&bucket->lock); + if (bucket->pages) { + vm_page_t *mp = &bucket->pages; + vm_page_t m = *mp; + do { + if (m->object == object && m->offset == offset) { + /* + * Remove page from bucket and from object, + * and return it to the free list. 
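+				 * This is the branch taken when a page is
+				 * already resident at the target offset: it
+				 * is unlinked from the hash chain and from
+				 * the object's page list, pulled off the
+				 * paging queues, and then freed.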
+ */ + *mp = m->next; + queue_remove(&object->memq, m, vm_page_t, + listq); + m->tabled = FALSE; + object->resident_page_count--; + VM_PAGE_QUEUES_REMOVE(m); + + if (m->external) { + m->external = FALSE; + vm_object_external_pages--; + } + + /* + * Return page to the free list. + * Note the page is not tabled now, so this + * won't self-deadlock on the bucket lock. + */ + + vm_page_free(m); + break; + } + mp = &m->next; + } while ((m = *mp) != 0); + mem->next = bucket->pages; + } else { + mem->next = VM_PAGE_NULL; + } + bucket->pages = mem; + simple_unlock(&bucket->lock); + + /* + * Now link into the object's list of backed pages. + */ + + queue_enter(&object->memq, mem, vm_page_t, listq); + mem->tabled = TRUE; + + /* + * And show that the object has one more resident + * page. + */ + + object->resident_page_count++; + assert(object->resident_page_count != 0); +} + +/* + * vm_page_remove: [ internal use only ] + * + * Removes the given mem entry from the object/offset-page + * table, the object page list, and the page queues. + * + * The object and page must be locked. + * The free page queue must not be locked. + */ + +void vm_page_remove( + vm_page_t mem) +{ + vm_page_bucket_t *bucket; + vm_page_t this; + + assert(mem->tabled); + VM_PAGE_CHECK(mem); + + /* + * Remove from the object_object/offset hash table + */ + + bucket = &vm_page_buckets[vm_page_hash(mem->object, mem->offset)]; + simple_lock(&bucket->lock); + if ((this = bucket->pages) == mem) { + /* optimize for common case */ + + bucket->pages = mem->next; + } else { + vm_page_t *prev; + + for (prev = &this->next; + (this = *prev) != mem; + prev = &this->next) + continue; + *prev = this->next; + } + simple_unlock(&bucket->lock); + + /* + * Now remove from the object's list of backed pages. + */ + + queue_remove(&mem->object->memq, mem, vm_page_t, listq); + + /* + * And show that the object has one fewer resident + * page. + */ + + mem->object->resident_page_count--; + + mem->tabled = FALSE; + + VM_PAGE_QUEUES_REMOVE(mem); + + if (mem->external) { + mem->external = FALSE; + vm_object_external_pages--; + } +} + +/* + * vm_page_lookup: + * + * Returns the page associated with the object/offset + * pair specified; if none is found, VM_PAGE_NULL is returned. + * + * The object must be locked. No side effects. + */ + +vm_page_t vm_page_lookup( + vm_object_t object, + vm_offset_t offset) +{ + vm_page_t mem; + vm_page_bucket_t *bucket; + + /* + * Search the hash table for this object/offset pair + */ + + bucket = &vm_page_buckets[vm_page_hash(object, offset)]; + + simple_lock(&bucket->lock); + for (mem = bucket->pages; mem != VM_PAGE_NULL; mem = mem->next) { + VM_PAGE_CHECK(mem); + if ((mem->object == object) && (mem->offset == offset)) + break; + } + simple_unlock(&bucket->lock); + return mem; +} + +/* + * vm_page_rename: + * + * Move the given memory entry from its + * current object to the specified target object/offset. + * + * The object must be locked. + */ +void vm_page_rename( + vm_page_t mem, + vm_object_t new_object, + vm_offset_t new_offset) +{ + /* + * Changes to mem->object require the page lock because + * the pageout daemon uses that lock to get the object. 
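+	 * That is why the remove/insert pair below is bracketed by
+	 * vm_page_lock_queues() and vm_page_unlock_queues().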
+ */ + + vm_page_lock_queues(); + vm_page_remove(mem); + vm_page_insert(mem, new_object, new_offset); + vm_page_unlock_queues(); +} + +static void vm_page_init_template(vm_page_t m) +{ + m->object = VM_OBJECT_NULL; /* reset later */ + m->offset = 0; /* reset later */ + m->wire_count = 0; + + m->inactive = FALSE; + m->active = FALSE; + m->laundry = FALSE; + m->external_laundry = FALSE; + m->free = FALSE; + m->external = FALSE; + + m->busy = TRUE; + m->wanted = FALSE; + m->tabled = FALSE; + m->fictitious = FALSE; + m->private = FALSE; + m->absent = FALSE; + m->error = FALSE; + m->dirty = FALSE; + m->precious = FALSE; + m->reference = FALSE; + + m->page_lock = VM_PROT_NONE; + m->unlock_request = VM_PROT_NONE; +} + +/* + * vm_page_init: + * + * Initialize the fields in a new page. + * This takes a structure with random values and initializes it + * so that it can be given to vm_page_release or vm_page_insert. + */ +void vm_page_init( + vm_page_t mem) +{ + vm_page_init_template(mem); +} + +/* + * vm_page_grab_fictitious: + * + * Remove a fictitious page from the free list. + * Returns VM_PAGE_NULL if there are no free pages. + */ + +vm_page_t vm_page_grab_fictitious(void) +{ + vm_page_t m; + + simple_lock(&vm_page_queue_free_lock); + if (list_empty(&vm_page_queue_fictitious)) { + m = VM_PAGE_NULL; + } else { + m = list_first_entry(&vm_page_queue_fictitious, + struct vm_page, node); + assert(m->fictitious); + list_remove(&m->node); + m->free = FALSE; + vm_page_fictitious_count--; + } + simple_unlock(&vm_page_queue_free_lock); + + return m; +} + +/* + * vm_page_release_fictitious: + * + * Release a fictitious page to the free list. + */ + +static void vm_page_release_fictitious( + vm_page_t m) +{ + simple_lock(&vm_page_queue_free_lock); + if (m->free) + panic("vm_page_release_fictitious"); + m->free = TRUE; + list_insert_head(&vm_page_queue_fictitious, &m->node); + vm_page_fictitious_count++; + simple_unlock(&vm_page_queue_free_lock); +} + +/* + * vm_page_more_fictitious: + * + * Add more fictitious pages to the free list. + * Allowed to block. + */ + +int vm_page_fictitious_quantum = 5; + +void vm_page_more_fictitious(void) +{ + vm_page_t m; + int i; + + for (i = 0; i < vm_page_fictitious_quantum; i++) { + m = (vm_page_t) kmem_cache_alloc(&vm_page_cache); + if (m == VM_PAGE_NULL) + panic("vm_page_more_fictitious"); + + vm_page_init(m); + m->phys_addr = vm_page_fictitious_addr; + m->fictitious = TRUE; + vm_page_release_fictitious(m); + } +} + +/* + * vm_page_convert: + * + * Attempt to convert a fictitious page into a real page. + * + * The object referenced by *MP must be locked. + */ + +boolean_t vm_page_convert(struct vm_page **mp) +{ + struct vm_page *real_m, *fict_m; + vm_object_t object; + vm_offset_t offset; + + fict_m = *mp; + + assert(fict_m->fictitious); + assert(fict_m->phys_addr == vm_page_fictitious_addr); + assert(!fict_m->active); + assert(!fict_m->inactive); + + real_m = vm_page_grab(VM_PAGE_HIGHMEM); + if (real_m == VM_PAGE_NULL) + return FALSE; + + object = fict_m->object; + offset = fict_m->offset; + vm_page_remove(fict_m); + + memcpy(&real_m->vm_page_header, + &fict_m->vm_page_header, + VM_PAGE_BODY_SIZE); + real_m->fictitious = FALSE; + + vm_page_insert(real_m, object, offset); + + assert(real_m->phys_addr != vm_page_fictitious_addr); + assert(fict_m->fictitious); + assert(fict_m->phys_addr == vm_page_fictitious_addr); + + vm_page_release_fictitious(fict_m); + *mp = real_m; + return TRUE; +} + +/* + * vm_page_grab: + * + * Remove a page from the free list. 
+ * Returns VM_PAGE_NULL if the free list is too small. + * + * FLAGS specify which constraint should be enforced for the allocated + * addresses. + */ + +vm_page_t vm_page_grab(unsigned flags) +{ + unsigned selector; + vm_page_t mem; + + if (flags & VM_PAGE_HIGHMEM) + selector = VM_PAGE_SEL_HIGHMEM; +#if defined(VM_PAGE_DMA32_LIMIT) && VM_PAGE_DMA32_LIMIT > VM_PAGE_DIRECTMAP_LIMIT + else if (flags & VM_PAGE_DMA32) + selector = VM_PAGE_SEL_DMA32; +#endif + else if (flags & VM_PAGE_DIRECTMAP) + selector = VM_PAGE_SEL_DIRECTMAP; +#if defined(VM_PAGE_DMA32_LIMIT) && VM_PAGE_DMA32_LIMIT <= VM_PAGE_DIRECTMAP_LIMIT + else if (flags & VM_PAGE_DMA32) + selector = VM_PAGE_SEL_DMA32; +#endif + else + selector = VM_PAGE_SEL_DMA; + + simple_lock(&vm_page_queue_free_lock); + + /* + * XXX Mach has many modules that merely assume memory is + * directly mapped in kernel space. Instead of updating all + * users, we assume those which need specific physical memory + * properties will wire down their pages, either because + * they can't be paged (not part of an object), or with + * explicit VM calls. The strategy is then to let memory + * pressure balance the physical segments with pageable pages. + */ + mem = vm_page_alloc_pa(0, selector, VM_PT_KERNEL); + + if (mem == NULL) { + simple_unlock(&vm_page_queue_free_lock); + return NULL; + } + + mem->free = FALSE; + simple_unlock(&vm_page_queue_free_lock); + + return mem; +} + +phys_addr_t vm_page_grab_phys_addr(void) +{ + vm_page_t p = vm_page_grab(VM_PAGE_DIRECTMAP); + if (p == VM_PAGE_NULL) + return -1; + else + return p->phys_addr; +} + +/* + * vm_page_release: + * + * Return a page to the free list. + */ + +void vm_page_release( + vm_page_t mem, + boolean_t laundry, + boolean_t external_laundry) +{ + simple_lock(&vm_page_queue_free_lock); + if (mem->free) + panic("vm_page_release"); + mem->free = TRUE; + vm_page_free_pa(mem, 0); + if (laundry) { + vm_page_laundry_count--; + + if (vm_page_laundry_count == 0) { + vm_pageout_resume(); + } + } + if (external_laundry) { + + /* + * If vm_page_external_laundry_count is negative, + * the pageout daemon isn't expecting to be + * notified. + */ + + if (vm_page_external_laundry_count > 0) { + vm_page_external_laundry_count--; + + if (vm_page_external_laundry_count == 0) { + vm_pageout_resume(); + } + } + } + + simple_unlock(&vm_page_queue_free_lock); +} + +/* + * vm_page_grab_contig: + * + * Remove a block of contiguous pages from the free list. + * Returns VM_PAGE_NULL if the request fails. + */ + +vm_page_t vm_page_grab_contig( + vm_size_t size, + unsigned int selector) +{ + unsigned int i, order, nr_pages; + vm_page_t mem; + + order = vm_page_order(size); + nr_pages = 1 << order; + + simple_lock(&vm_page_queue_free_lock); + + /* TODO Allow caller to pass type */ + mem = vm_page_alloc_pa(order, selector, VM_PT_KERNEL); + + if (mem == NULL) { + simple_unlock(&vm_page_queue_free_lock); + return NULL; + } + + for (i = 0; i < nr_pages; i++) { + mem[i].free = FALSE; + } + + simple_unlock(&vm_page_queue_free_lock); + + return mem; +} + +/* + * vm_page_free_contig: + * + * Return a block of contiguous pages to the free list. 
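+ *
+ * A minimal usage sketch (illustrative only; the size and the
+ * DIRECTMAP selector are arbitrary choices), pairing this call with
+ * vm_page_grab_contig() and accepting the power-of-two rounding
+ * performed by vm_page_order():
+ *
+ *	vm_page_t block;
+ *
+ *	block = vm_page_grab_contig(4 * PAGE_SIZE, VM_PAGE_SEL_DIRECTMAP);
+ *	if (block != VM_PAGE_NULL) {
+ *		... use block[0] .. block[3] ...
+ *		vm_page_free_contig(block, 4 * PAGE_SIZE);
+ *	}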
+ */ + +void vm_page_free_contig(vm_page_t mem, vm_size_t size) +{ + unsigned int i, order, nr_pages; + + order = vm_page_order(size); + nr_pages = 1 << order; + + simple_lock(&vm_page_queue_free_lock); + + for (i = 0; i < nr_pages; i++) { + if (mem[i].free) + panic("vm_page_free_contig"); + + mem[i].free = TRUE; + } + + vm_page_free_pa(mem, order); + + simple_unlock(&vm_page_queue_free_lock); +} + +/* + * vm_page_alloc: + * + * Allocate and return a memory cell associated + * with this VM object/offset pair. + * + * Object must be locked. + */ + +vm_page_t vm_page_alloc( + vm_object_t object, + vm_offset_t offset) +{ + vm_page_t mem; + + mem = vm_page_grab(VM_PAGE_HIGHMEM); + if (mem == VM_PAGE_NULL) + return VM_PAGE_NULL; + + vm_page_lock_queues(); + vm_page_insert(mem, object, offset); + vm_page_unlock_queues(); + + return mem; +} + +/* + * vm_page_free: + * + * Returns the given page to the free list, + * disassociating it with any VM object. + * + * Object and page queues must be locked prior to entry. + */ +void vm_page_free( + vm_page_t mem) +{ + if (mem->free) + panic("vm_page_free"); + + if (mem->tabled) { + vm_page_remove(mem); + } + + assert(!mem->active && !mem->inactive); + + if (mem->wire_count != 0) { + if (!mem->private && !mem->fictitious) + vm_page_wire_count--; + mem->wire_count = 0; + } + + PAGE_WAKEUP_DONE(mem); + + if (mem->absent) + vm_object_absent_release(mem->object); + + /* + * XXX The calls to vm_page_init here are + * really overkill. + */ + + if (mem->private || mem->fictitious) { + vm_page_init(mem); + mem->phys_addr = vm_page_fictitious_addr; + mem->fictitious = TRUE; + vm_page_release_fictitious(mem); + } else { + boolean_t laundry = mem->laundry; + boolean_t external_laundry = mem->external_laundry; + vm_page_init(mem); + vm_page_release(mem, laundry, external_laundry); + } +} + +/* + * vm_page_zero_fill: + * + * Zero-fill the specified page. + */ +void vm_page_zero_fill( + vm_page_t m) +{ + VM_PAGE_CHECK(m); + + pmap_zero_page(m->phys_addr); +} + +/* + * vm_page_copy: + * + * Copy one page to another + */ + +void vm_page_copy( + vm_page_t src_m, + vm_page_t dest_m) +{ + VM_PAGE_CHECK(src_m); + VM_PAGE_CHECK(dest_m); + + pmap_copy_page(src_m->phys_addr, dest_m->phys_addr); +} + +#if MACH_VM_DEBUG +/* + * Routine: vm_page_info + * Purpose: + * Return information about the global VP table. + * Fills the buffer with as much information as possible + * and returns the desired size of the buffer. + * Conditions: + * Nothing locked. The caller should provide + * possibly-pageable memory. + */ + +unsigned int +vm_page_info( + hash_info_bucket_t *info, + unsigned int count) +{ + int i; + + if (vm_page_bucket_count < count) + count = vm_page_bucket_count; + + for (i = 0; i < count; i++) { + vm_page_bucket_t *bucket = &vm_page_buckets[i]; + unsigned int bucket_count = 0; + vm_page_t m; + + simple_lock(&bucket->lock); + for (m = bucket->pages; m != VM_PAGE_NULL; m = m->next) + bucket_count++; + simple_unlock(&bucket->lock); + + /* don't touch pageable memory while holding locks */ + info[i].hib_count = bucket_count; + } + + return vm_page_bucket_count; +} +#endif /* MACH_VM_DEBUG */ + + +#if MACH_KDB +#define printf kdbprintf + +/* + * Routine: vm_page_print [exported] + */ +void vm_page_print(const vm_page_t p) +{ + iprintf("Page 0x%X: object 0x%X,", (vm_offset_t) p, (vm_offset_t) p->object); + printf(" offset 0x%X", p->offset); + printf("wire_count %d,", p->wire_count); + printf(" %s", + (p->active ? "active" : (p->inactive ? 
"inactive" : "loose"))); + printf("%s", + (p->free ? " free" : "")); + printf("%s ", + (p->laundry ? " laundry" : "")); + printf("%s", + (p->dirty ? "dirty" : "clean")); + printf("%s", + (p->busy ? " busy" : "")); + printf("%s", + (p->absent ? " absent" : "")); + printf("%s", + (p->error ? " error" : "")); + printf("%s", + (p->fictitious ? " fictitious" : "")); + printf("%s", + (p->private ? " private" : "")); + printf("%s", + (p->wanted ? " wanted" : "")); + printf("%s,", + (p->tabled ? "" : "not_tabled")); + printf("phys_addr = 0x%X, lock = 0x%X, unlock_request = 0x%X\n", + p->phys_addr, + (vm_offset_t) p->page_lock, + (vm_offset_t) p->unlock_request); +} +#endif /* MACH_KDB */ diff --git a/vm/vm_resident.h b/vm/vm_resident.h new file mode 100644 index 0000000..e8bf681 --- /dev/null +++ b/vm/vm_resident.h @@ -0,0 +1,45 @@ +/* + * Resident memory management module functions. + * Copyright (C) 2008 Free Software Foundation, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Author: Barry deFreese. + */ +/* + * Resident memory management module functions. + * + */ + +#ifndef _VM_RESIDENT_H_ +#define _VM_RESIDENT_H_ + +#include <mach/std_types.h> + +/* + * vm_page_replace: + * + * Exactly like vm_page_insert, except that we first + * remove any existing page at the given offset in object + * and we don't do deactivate-behind. + * + * The object and page must be locked. + */ +extern void vm_page_replace ( + vm_page_t mem, + vm_object_t object, + vm_offset_t offset); + +#endif /* _VM_RESIDENT_H_ */ diff --git a/vm/vm_types.h b/vm/vm_types.h new file mode 100644 index 0000000..f64ebee --- /dev/null +++ b/vm/vm_types.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2007 Free Software Foundation, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Written by Thomas Schwinge. + */ + +#ifndef VM_VM_TYPES_H +#define VM_VM_TYPES_H + +/* + * Types defined: + * + * vm_map_t the high-level address map data structure. + * vm_object_t Virtual memory object. + * vm_page_t See `vm/vm_page.h'. 
+ */ + +typedef struct vm_map *vm_map_t; +#define VM_MAP_NULL ((vm_map_t) 0) + +typedef struct vm_object *vm_object_t; +#define VM_OBJECT_NULL ((vm_object_t) 0) + +typedef struct vm_page *vm_page_t; +#define VM_PAGE_NULL ((vm_page_t) 0) + + +#endif /* VM_VM_TYPES_H */ diff --git a/vm/vm_user.c b/vm/vm_user.c new file mode 100644 index 0000000..868230a --- /dev/null +++ b/vm/vm_user.c @@ -0,0 +1,803 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_user.c + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * + * User-exported virtual memory functions. + */ + +#include <mach/boolean.h> +#include <mach/kern_return.h> +#include <mach/mach_types.h> /* to get vm_address_t */ +#include <mach/memory_object.h> +#include <mach/std_types.h> /* to get pointer_t */ +#include <mach/vm_attributes.h> +#include <mach/vm_param.h> +#include <mach/vm_statistics.h> +#include <mach/vm_cache_statistics.h> +#include <mach/vm_sync.h> +#include <kern/gnumach.server.h> +#include <kern/host.h> +#include <kern/mach.server.h> +#include <kern/mach_host.server.h> +#include <kern/task.h> +#include <vm/vm_fault.h> +#include <vm/vm_kern.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/memory_object_proxy.h> +#include <vm/vm_page.h> + + + +vm_statistics_data_t vm_stat; + +/* + * vm_allocate allocates "zero fill" memory in the specfied + * map. + */ +kern_return_t vm_allocate( + vm_map_t map, + vm_offset_t *addr, + vm_size_t size, + boolean_t anywhere) +{ + kern_return_t result; + + if (map == VM_MAP_NULL) + return(KERN_INVALID_ARGUMENT); + if (size == 0) { + *addr = 0; + return(KERN_SUCCESS); + } + + if (anywhere) + *addr = vm_map_min(map); + else + *addr = trunc_page(*addr); + size = round_page(size); + + result = vm_map_enter( + map, + addr, + size, + (vm_offset_t)0, + anywhere, + VM_OBJECT_NULL, + (vm_offset_t)0, + FALSE, + VM_PROT_DEFAULT, + VM_PROT_ALL, + VM_INHERIT_DEFAULT); + + return(result); +} + +/* + * vm_deallocate deallocates the specified range of addresses in the + * specified address map. + */ +kern_return_t vm_deallocate( + vm_map_t map, + vm_offset_t start, + vm_size_t size) +{ + if (map == VM_MAP_NULL) + return(KERN_INVALID_ARGUMENT); + + if (size == (vm_offset_t) 0) + return(KERN_SUCCESS); + + return(vm_map_remove(map, trunc_page(start), round_page(start+size))); +} + +/* + * vm_inherit sets the inheritance of the specified range in the + * specified map. 
+ */ +kern_return_t vm_inherit( + vm_map_t map, + vm_offset_t start, + vm_size_t size, + vm_inherit_t new_inheritance) +{ + if (map == VM_MAP_NULL) + return(KERN_INVALID_ARGUMENT); + + switch (new_inheritance) { + case VM_INHERIT_NONE: + case VM_INHERIT_COPY: + case VM_INHERIT_SHARE: + break; + default: + return(KERN_INVALID_ARGUMENT); + } + + /*Check if range includes projected buffer; + user is not allowed direct manipulation in that case*/ + if (projected_buffer_in_range(map, start, start+size)) + return(KERN_INVALID_ARGUMENT); + + return(vm_map_inherit(map, + trunc_page(start), + round_page(start+size), + new_inheritance)); +} + +/* + * vm_protect sets the protection of the specified range in the + * specified map. + */ + +kern_return_t vm_protect( + vm_map_t map, + vm_offset_t start, + vm_size_t size, + boolean_t set_maximum, + vm_prot_t new_protection) +{ + if ((map == VM_MAP_NULL) || + (new_protection & ~(VM_PROT_ALL|VM_PROT_NOTIFY))) + return(KERN_INVALID_ARGUMENT); + + /*Check if range includes projected buffer; + user is not allowed direct manipulation in that case*/ + if (projected_buffer_in_range(map, start, start+size)) + return(KERN_INVALID_ARGUMENT); + + return(vm_map_protect(map, + trunc_page(start), + round_page(start+size), + new_protection, + set_maximum)); +} + +kern_return_t vm_statistics( + vm_map_t map, + vm_statistics_data_t *stat) +{ + if (map == VM_MAP_NULL) + return(KERN_INVALID_ARGUMENT); + + *stat = vm_stat; + + stat->pagesize = PAGE_SIZE; + stat->free_count = vm_page_mem_free(); + stat->active_count = vm_page_active_count; + stat->inactive_count = vm_page_inactive_count; + stat->wire_count = vm_page_wire_count; + + return(KERN_SUCCESS); +} + +kern_return_t vm_cache_statistics( + vm_map_t map, + vm_cache_statistics_data_t *stats) +{ + if (map == VM_MAP_NULL) + return KERN_INVALID_ARGUMENT; + + stats->cache_object_count = vm_object_external_count; + stats->cache_count = vm_object_external_pages; + + /* XXX Not implemented yet */ + stats->active_tmp_count = 0; + stats->inactive_tmp_count = 0; + stats->active_perm_count = 0; + stats->inactive_perm_count = 0; + stats->dirty_count = 0; + stats->laundry_count = 0; + stats->writeback_count = 0; + stats->slab_count = 0; + stats->slab_reclaim_count = 0; + return KERN_SUCCESS; +} + +/* + * Handle machine-specific attributes for a mapping, such + * as cachability, migrability, etc. 
+ */ +kern_return_t vm_machine_attribute( + vm_map_t map, + vm_address_t address, + vm_size_t size, + vm_machine_attribute_t attribute, + vm_machine_attribute_val_t* value) /* IN/OUT */ +{ + if (map == VM_MAP_NULL) + return(KERN_INVALID_ARGUMENT); + + /*Check if range includes projected buffer; + user is not allowed direct manipulation in that case*/ + if (projected_buffer_in_range(map, address, address+size)) + return(KERN_INVALID_ARGUMENT); + + return vm_map_machine_attribute(map, address, size, attribute, value); +} + +kern_return_t vm_read( + vm_map_t map, + vm_address_t address, + vm_size_t size, + pointer_t *data, + mach_msg_type_number_t *data_size) +{ + kern_return_t error; + vm_map_copy_t ipc_address; + + if (map == VM_MAP_NULL) + return(KERN_INVALID_ARGUMENT); + + if ((error = vm_map_copyin(map, + address, + size, + FALSE, /* src_destroy */ + &ipc_address)) == KERN_SUCCESS) { + *data = (pointer_t) ipc_address; + *data_size = size; + } + return(error); +} + +kern_return_t vm_write( + vm_map_t map, + vm_address_t address, + pointer_t data, + mach_msg_type_number_t size) +{ + if (map == VM_MAP_NULL) + return KERN_INVALID_ARGUMENT; + + return vm_map_copy_overwrite(map, address, (vm_map_copy_t) data, + FALSE /* interruptible XXX */); +} + +kern_return_t vm_copy( + vm_map_t map, + vm_address_t source_address, + vm_size_t size, + vm_address_t dest_address) +{ + vm_map_copy_t copy; + kern_return_t kr; + + if (map == VM_MAP_NULL) + return KERN_INVALID_ARGUMENT; + + kr = vm_map_copyin(map, source_address, size, + FALSE, ©); + if (kr != KERN_SUCCESS) + return kr; + + kr = vm_map_copy_overwrite(map, dest_address, copy, + FALSE /* interruptible XXX */); + if (kr != KERN_SUCCESS) { + vm_map_copy_discard(copy); + return kr; + } + + return KERN_SUCCESS; +} + + +/* + * Routine: vm_map + */ +kern_return_t vm_map( + vm_map_t target_map, + vm_offset_t *address, + vm_size_t size, + vm_offset_t mask, + boolean_t anywhere, + ipc_port_t memory_object, + vm_offset_t offset, + boolean_t copy, + vm_prot_t cur_protection, + vm_prot_t max_protection, + vm_inherit_t inheritance) +{ + vm_object_t object; + kern_return_t result; + + if ((target_map == VM_MAP_NULL) || + (cur_protection & ~VM_PROT_ALL) || + (max_protection & ~VM_PROT_ALL)) + return(KERN_INVALID_ARGUMENT); + + switch (inheritance) { + case VM_INHERIT_NONE: + case VM_INHERIT_COPY: + case VM_INHERIT_SHARE: + break; + default: + return(KERN_INVALID_ARGUMENT); + } + + if (size == 0) + return KERN_INVALID_ARGUMENT; + +#ifdef USER32 + if (mask & 0x80000000) + mask |= 0xffffffff00000000; +#endif + + *address = trunc_page(*address); + size = round_page(size); + + if (!IP_VALID(memory_object)) { + object = VM_OBJECT_NULL; + offset = 0; + copy = FALSE; + } else if ((object = vm_object_enter(memory_object, size, FALSE)) + == VM_OBJECT_NULL) + { + ipc_port_t real_memobj; + vm_prot_t prot; + vm_offset_t start; + vm_offset_t len; + + result = memory_object_proxy_lookup (memory_object, &real_memobj, + &prot, &start, &len); + if (result != KERN_SUCCESS) + return result; + + if (!copy) + { + /* Reduce the allowed access to the memory object. */ + max_protection &= prot; + cur_protection &= prot; + } + else + { + /* Disallow making a copy unless the proxy allows reading. 
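+	     A copy would otherwise let data be read through a proxy
+	     whose protection mask excludes VM_PROT_READ, so the request
+	     is rejected with KERN_PROTECTION_FAILURE.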
*/ + if (!(prot & VM_PROT_READ)) + return KERN_PROTECTION_FAILURE; + } + + /* Reduce the allowed range */ + if ((start + offset + size) > (start + len)) + return KERN_INVALID_ARGUMENT; + + offset += start; + + if ((object = vm_object_enter(real_memobj, size, FALSE)) + == VM_OBJECT_NULL) + return KERN_INVALID_ARGUMENT; + } + + /* + * Perform the copy if requested + */ + + if (copy) { + vm_object_t new_object; + vm_offset_t new_offset; + + result = vm_object_copy_strategically(object, offset, size, + &new_object, &new_offset, + ©); + + /* + * Throw away the reference to the + * original object, as it won't be mapped. + */ + + vm_object_deallocate(object); + + if (result != KERN_SUCCESS) + return (result); + + object = new_object; + offset = new_offset; + } + + if ((result = vm_map_enter(target_map, + address, size, mask, anywhere, + object, offset, + copy, + cur_protection, max_protection, inheritance + )) != KERN_SUCCESS) + vm_object_deallocate(object); + return(result); +} + +/* + * Specify that the range of the virtual address space + * of the target task must not cause page faults for + * the indicated accesses. + * + * [ To unwire the pages, specify VM_PROT_NONE. ] + */ +kern_return_t vm_wire(const ipc_port_t port, + vm_map_t map, + vm_offset_t start, + vm_size_t size, + vm_prot_t access) +{ + boolean_t priv; + + if (!IP_VALID(port)) + return KERN_INVALID_HOST; + + ip_lock(port); + if (!ip_active(port) || + (ip_kotype(port) != IKOT_HOST_PRIV + && ip_kotype(port) != IKOT_HOST)) + { + ip_unlock(port); + return KERN_INVALID_HOST; + } + + priv = ip_kotype(port) == IKOT_HOST_PRIV; + ip_unlock(port); + + if (map == VM_MAP_NULL) + return KERN_INVALID_TASK; + + if (access & ~VM_PROT_ALL) + return KERN_INVALID_ARGUMENT; + + /*Check if range includes projected buffer; + user is not allowed direct manipulation in that case*/ + if (projected_buffer_in_range(map, start, start+size)) + return(KERN_INVALID_ARGUMENT); + + /* TODO: make it tunable */ + if (!priv && access != VM_PROT_NONE && map->size_wired + size > (8<<20)) + return KERN_NO_ACCESS; + + return vm_map_pageable(map, trunc_page(start), round_page(start+size), + access, TRUE, TRUE); +} + +kern_return_t vm_wire_all(const ipc_port_t port, vm_map_t map, vm_wire_t flags) +{ + if (!IP_VALID(port)) + return KERN_INVALID_HOST; + + ip_lock(port); + + if (!ip_active(port) + || (ip_kotype(port) != IKOT_HOST_PRIV)) { + ip_unlock(port); + return KERN_INVALID_HOST; + } + + ip_unlock(port); + + if (map == VM_MAP_NULL) { + return KERN_INVALID_TASK; + } + + if (flags & ~VM_WIRE_ALL) { + return KERN_INVALID_ARGUMENT; + } + + /*Check if range includes projected buffer; + user is not allowed direct manipulation in that case*/ + if (projected_buffer_in_range(map, map->min_offset, map->max_offset)) { + return KERN_INVALID_ARGUMENT; + } + + return vm_map_pageable_all(map, flags); +} + +/* + * vm_object_sync synchronizes out pages from the memory object to its + * memory manager, if any. + */ +kern_return_t vm_object_sync( + vm_object_t object, + vm_offset_t offset, + vm_size_t size, + boolean_t should_flush, + boolean_t should_return, + boolean_t should_iosync) +{ + if (object == VM_OBJECT_NULL) + return KERN_INVALID_ARGUMENT; + + /* FIXME: we should rather introduce an internal function, e.g. + vm_object_update, rather than calling memory_object_lock_request. */ + vm_object_reference(object); + + /* This is already always synchronous for now. 
*/ + (void) should_iosync; + + size = round_page(offset + size) - trunc_page(offset); + offset = trunc_page(offset); + + return memory_object_lock_request(object, offset, size, + should_return ? + MEMORY_OBJECT_RETURN_ALL : + MEMORY_OBJECT_RETURN_NONE, + should_flush, + VM_PROT_NO_CHANGE, + NULL, 0); +} + +/* + * vm_msync synchronizes out pages from the map to their memory manager, + * if any. + */ +kern_return_t vm_msync( + vm_map_t map, + vm_address_t address, + vm_size_t size, + vm_sync_t sync_flags) +{ + if (map == VM_MAP_NULL) + return KERN_INVALID_ARGUMENT; + + return vm_map_msync(map, (vm_offset_t) address, size, sync_flags); +} + +/* + * vm_allocate_contiguous allocates "zero fill" physical memory and maps + * it into in the specfied map. + */ +/* TODO: respect physical alignment (palign) + * and minimum physical address (pmin) + */ +kern_return_t vm_allocate_contiguous( + host_t host_priv, + vm_map_t map, + vm_address_t *result_vaddr, + rpc_phys_addr_t *result_paddr, + vm_size_t size, + rpc_phys_addr_t pmin, + rpc_phys_addr_t pmax, + rpc_phys_addr_t palign) +{ + vm_size_t alloc_size; + unsigned int npages; + unsigned int i; + unsigned int order; + unsigned int selector; + vm_page_t pages; + vm_object_t object; + kern_return_t kr; + vm_address_t vaddr; + + if (host_priv == HOST_NULL) + return KERN_INVALID_HOST; + + if (map == VM_MAP_NULL) + return KERN_INVALID_TASK; + + /* FIXME */ + if (pmin != 0) + return KERN_INVALID_ARGUMENT; + + if (palign == 0) + palign = PAGE_SIZE; + + /* FIXME: Allows some small alignments less than page size */ + if ((palign < PAGE_SIZE) && (PAGE_SIZE % palign == 0)) + palign = PAGE_SIZE; + + /* FIXME */ + if (palign != PAGE_SIZE) + return KERN_INVALID_ARGUMENT; + + selector = VM_PAGE_SEL_DMA; + if (pmax > VM_PAGE_DMA_LIMIT) +#ifdef VM_PAGE_DMA32_LIMIT +#if VM_PAGE_DMA32_LIMIT < VM_PAGE_DIRECTMAP_LIMIT + if (pmax <= VM_PAGE_DMA32_LIMIT) + selector = VM_PAGE_SEL_DMA32; + if (pmax > VM_PAGE_DMA32_LIMIT) +#endif +#endif + if (pmax <= VM_PAGE_DIRECTMAP_LIMIT) + selector = VM_PAGE_SEL_DIRECTMAP; + if (pmax > VM_PAGE_DIRECTMAP_LIMIT) +#ifdef VM_PAGE_DMA32_LIMIT +#if VM_PAGE_DMA32_LIMIT > VM_PAGE_DIRECTMAP_LIMIT + if (pmax <= VM_PAGE_DMA32_LIMIT) + selector = VM_PAGE_SEL_DMA32; + if (pmax > VM_PAGE_DMA32_LIMIT) +#endif +#endif + if (pmax <= VM_PAGE_HIGHMEM_LIMIT) + selector = VM_PAGE_SEL_HIGHMEM; + + size = vm_page_round(size); + + if (size == 0) + return KERN_INVALID_ARGUMENT; + + object = vm_object_allocate(size); + + if (object == NULL) + return KERN_RESOURCE_SHORTAGE; + + /* + * XXX The page allocator returns blocks with a power-of-two size. + * The requested size may not be a power-of-two, requiring some + * work to release back the pages that aren't needed. + */ + order = vm_page_order(size); + alloc_size = (1 << (order + PAGE_SHIFT)); + npages = vm_page_atop(alloc_size); + + pages = vm_page_grab_contig(alloc_size, selector); + + if (pages == NULL) { + vm_object_deallocate(object); + return KERN_RESOURCE_SHORTAGE; + } + + vm_object_lock(object); + vm_page_lock_queues(); + + for (i = 0; i < vm_page_atop(size); i++) { + /* + * XXX We can safely handle contiguous pages as an array, + * but this relies on knowing the implementation of the + * page allocator. 
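+		 * Only the first vm_page_atop(size) pages are inserted
+		 * and wired here; any surplus pages from the power-of-two
+		 * block are released back to the allocator right after
+		 * this loop.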
+ */ + pages[i].busy = FALSE; + vm_page_insert(&pages[i], object, vm_page_ptoa(i)); + vm_page_wire(&pages[i]); + } + + vm_page_unlock_queues(); + vm_object_unlock(object); + + for (i = vm_page_atop(size); i < npages; i++) { + vm_page_release(&pages[i], FALSE, FALSE); + } + + vaddr = 0; + kr = vm_map_enter(map, &vaddr, size, 0, TRUE, object, 0, FALSE, + VM_PROT_READ | VM_PROT_WRITE, + VM_PROT_READ | VM_PROT_WRITE, VM_INHERIT_DEFAULT); + + if (kr != KERN_SUCCESS) { + vm_object_deallocate(object); + return kr; + } + + kr = vm_map_pageable(map, vaddr, vaddr + size, + VM_PROT_READ | VM_PROT_WRITE, + TRUE, TRUE); + + if (kr != KERN_SUCCESS) { + vm_map_remove(map, vaddr, vaddr + size); + return kr; + } + + *result_vaddr = vaddr; + *result_paddr = pages->phys_addr; + + assert(*result_paddr >= pmin); + assert(*result_paddr + size <= pmax); + + return KERN_SUCCESS; +} + +/* + * vm_pages_phys returns information about a region of memory + */ +kern_return_t vm_pages_phys( + host_t host, + vm_map_t map, + vm_address_t address, + vm_size_t size, + rpc_phys_addr_array_t *pagespp, + mach_msg_type_number_t *countp) +{ + if (host == HOST_NULL) + return KERN_INVALID_HOST; + if (map == VM_MAP_NULL) + return KERN_INVALID_TASK; + + if (!page_aligned(address)) + return KERN_INVALID_ARGUMENT; + if (!page_aligned(size)) + return KERN_INVALID_ARGUMENT; + + mach_msg_type_number_t count = atop(size), cur; + rpc_phys_addr_array_t pagesp = *pagespp; + kern_return_t kr; + + if (*countp < count) { + vm_offset_t allocated; + /* Avoid faults while we keep vm locks */ + kr = kmem_alloc(ipc_kernel_map, &allocated, + count * sizeof(pagesp[0])); + if (kr != KERN_SUCCESS) + return KERN_RESOURCE_SHORTAGE; + pagesp = (rpc_phys_addr_array_t) allocated; + } + + for (cur = 0; cur < count; cur++) { + vm_map_t cmap; /* current map in traversal */ + rpc_phys_addr_t paddr; + vm_map_entry_t entry; /* entry in current map */ + + /* find the entry containing (or following) the address */ + vm_map_lock_read(map); + for (cmap = map;;) { + /* cmap is read-locked */ + + if (!vm_map_lookup_entry(cmap, address, &entry)) { + entry = VM_MAP_ENTRY_NULL; + break; + } + + if (entry->is_sub_map) { + /* move down to the sub map */ + + vm_map_t nmap = entry->object.sub_map; + vm_map_lock_read(nmap); + vm_map_unlock_read(cmap); + cmap = nmap; + continue; + } else { + /* Found it */ + break; + } + /*NOTREACHED*/ + } + + paddr = 0; + if (entry) { + vm_offset_t offset = address - entry->vme_start + entry->offset; + vm_object_t object = entry->object.vm_object; + + if (object) { + vm_object_lock(object); + vm_page_t page = vm_page_lookup(object, offset); + if (page) { + if (page->phys_addr != (typeof(pagesp[cur])) page->phys_addr) + printf("warning: physical address overflow in vm_pages_phys!!\n"); + else + paddr = page->phys_addr; + } + vm_object_unlock(object); + } + } + vm_map_unlock_read(cmap); + pagesp[cur] = paddr; + + address += PAGE_SIZE; + } + + if (pagesp != *pagespp) { + vm_map_copy_t copy; + kr = vm_map_copyin(ipc_kernel_map, (vm_offset_t) pagesp, + count * sizeof(pagesp[0]), TRUE, ©); + assert(kr == KERN_SUCCESS); + *pagespp = (rpc_phys_addr_array_t) copy; + } + + *countp = count; + + return KERN_SUCCESS; +} diff --git a/vm/vm_user.h b/vm/vm_user.h new file mode 100644 index 0000000..c6f20a8 --- /dev/null +++ b/vm/vm_user.h @@ -0,0 +1,60 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University + * All Rights Reserved. 
+ * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_user.h + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * Date: 1986 + * + * Declarations of user-visible virtual address space + * management functionality. + */ + +#ifndef _VM_VM_USER_H_ +#define _VM_VM_USER_H_ + +#include <mach/kern_return.h> +#include <mach/std_types.h> +#include <mach/mach_types.h> + +extern kern_return_t vm_allocate(vm_map_t, vm_offset_t *, vm_size_t, + boolean_t); +extern kern_return_t vm_deallocate(vm_map_t, vm_offset_t, vm_size_t); +extern kern_return_t vm_inherit(vm_map_t, vm_offset_t, vm_size_t, + vm_inherit_t); +extern kern_return_t vm_protect(vm_map_t, vm_offset_t, vm_size_t, boolean_t, + vm_prot_t); +extern kern_return_t vm_statistics(vm_map_t, vm_statistics_data_t *); +extern kern_return_t vm_cache_statistics(vm_map_t, vm_cache_statistics_data_t *); +extern kern_return_t vm_read(vm_map_t, vm_address_t, vm_size_t, pointer_t *, + vm_size_t *); +extern kern_return_t vm_write(vm_map_t, vm_address_t, pointer_t, vm_size_t); +extern kern_return_t vm_copy(vm_map_t, vm_address_t, vm_size_t, + vm_address_t); +extern kern_return_t vm_map(vm_map_t, vm_offset_t *, vm_size_t, vm_offset_t, + boolean_t, ipc_port_t, vm_offset_t, boolean_t, + vm_prot_t, vm_prot_t, vm_inherit_t); + +#endif /* _VM_VM_USER_H_ */ |
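The declarations above are the kernel half of the interface; user tasks reach the same calls through the MIG-generated stubs of the same names, identifying their address space by its task port. The fragment below is a minimal user-space sketch, not part of the kernel sources; it assumes a Mach/Hurd user environment in which <mach.h> provides mach_task_self() and these stubs. The kernel rounds the range to whole pages, as the vm_allocate() and vm_deallocate() implementations above show.

#include <mach.h>
#include <stdio.h>

int
main(void)
{
	kern_return_t kr;
	vm_address_t addr = 0;
	vm_size_t size = 4 * 4096;	/* rounded up to whole pages by the kernel */

	/* Allocate zero-filled, pageable memory anywhere in this task. */
	kr = vm_allocate(mach_task_self(), &addr, size, TRUE);
	if (kr != KERN_SUCCESS)
		return 1;

	((char *) addr)[0] = 'x';	/* touch it to fault a page in */
	printf("mapped %lu bytes at 0x%lx\n",
	       (unsigned long) size, (unsigned long) addr);

	/* Drop write permission, then give the range back. */
	kr = vm_protect(mach_task_self(), addr, size, FALSE, VM_PROT_READ);
	if (kr == KERN_SUCCESS)
		kr = vm_deallocate(mach_task_self(), addr, size);

	return kr == KERN_SUCCESS ? 0 : 1;
}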