diff options
author | Oded Gabbay <oded.gabbay@gmail.com> | 2015-09-30 11:42:21 +0300 |
---|---|---|
committer | Oded Gabbay <oded.gabbay@gmail.com> | 2015-09-30 11:43:59 +0300 |
commit | 27675a5f87f0c11ab8a59f119518f627598c4caf (patch) | |
tree | 62dbe253bbd7df7b2e85d84668b89ce7adda6e86 /hsakmt | |
parent | bbdfa9eeb6dd015f22479368d2440d62785a4bb8 (diff) |
Move all source/header files to hsakmt subfolder
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Diffstat (limited to 'hsakmt')
-rw-r--r-- | hsakmt/Makefile | 53 | ||||
-rw-r--r-- | hsakmt/debug.c | 249 | ||||
-rw-r--r-- | hsakmt/events.c | 269 | ||||
-rw-r--r-- | hsakmt/fmm.c | 486 | ||||
-rw-r--r-- | hsakmt/fmm.h | 60 | ||||
-rw-r--r-- | hsakmt/globals.c | 33 | ||||
-rw-r--r-- | hsakmt/hsakmt.h | 577 | ||||
-rw-r--r-- | hsakmt/hsakmttypes.h | 909 | ||||
-rw-r--r-- | hsakmt/kfd_ioctl.h | 292 | ||||
-rw-r--r-- | hsakmt/libhsakmt.c | 18 | ||||
-rw-r--r-- | hsakmt/libhsakmt.h | 76 | ||||
-rw-r--r-- | hsakmt/libhsakmt.ver | 46 | ||||
-rw-r--r-- | hsakmt/memory.c | 204 | ||||
-rw-r--r-- | hsakmt/openclose.c | 112 | ||||
-rw-r--r-- | hsakmt/perfctr.c | 370 | ||||
-rw-r--r-- | hsakmt/pmc_table.c | 134 | ||||
-rw-r--r-- | hsakmt/pmc_table.h | 50 | ||||
-rw-r--r-- | hsakmt/queues.c | 341 | ||||
-rw-r--r-- | hsakmt/time.c | 61 | ||||
-rw-r--r-- | hsakmt/topology.c | 991 | ||||
-rw-r--r-- | hsakmt/version.c | 49 |
21 files changed, 5380 insertions, 0 deletions
diff --git a/hsakmt/Makefile b/hsakmt/Makefile new file mode 100644 index 0000000..5608ab7 --- /dev/null +++ b/hsakmt/Makefile @@ -0,0 +1,53 @@ +# Include directories +INCLUDES += ../include +CFLAGS += $(foreach DIR,$(INCLUDES),-I$(DIR)) + +LIB_NAME = libhsakmt.so +LIB_MAJOR_VER = 1 + +# Compiler options +CFLAGS += -fPIC # Position-independent code required to build shared library +CFLAGS += -W -Wall -Wextra -Werror -Wno-unused-parameter +CFLAGS += -Wformat-security -Wswitch-default -Wundef \ + -Wshadow -Wpointer-arith -Wbad-function-cast -Wcast-qual \ + -Wlogical-op -Wstrict-prototypes -Wmissing-prototypes \ + -Wmissing-declarations -Wredundant-decls \ + -Wunreachable-code +CFLAGS += -std=gnu99 -ggdb -pthread -fvisibility=hidden -O2 + +LDFLAGS += -lrt -pthread -Wl,--version-script=libhsakmt.ver -Wl,-soname=$(LIB_NAME).$(LIB_MAJOR_VER) + +OBJS = debug.o globals.o memory.o perfctr.o time.o version.o \ + events.o openclose.o queues.o topology.o fmm.o pmc_table.o \ + libhsakmt.o + +.PHONY: all lnx lnx64a clean + +# Default target +all: lnx lnx64a + +BUILD_ROOT = ../build +BUILDDIR = $(BUILD_ROOT)/$(MAKECMDGOALS) + +TARGET = $(addprefix $(BUILDDIR)/,$(OBJS)) + +$(BUILDDIR)/$(LIB_NAME).$(LIB_MAJOR_VER): $(TARGET) + gcc -shared $(LDFLAGS) -o $@ $^ + +$(BUILDDIR)/$(LIB_NAME): $(BUILDDIR)/$(LIB_NAME).$(LIB_MAJOR_VER) + @ln -sf $(LIB_NAME).$(LIB_MAJOR_VER) $(BUILDDIR)/$(LIB_NAME) + +lnx: CFLAGS += -m32 +lnx: LDFLAGS += -m32 +lnx: $(BUILDDIR)/$(LIB_NAME) + +lnx64a: $(BUILDDIR)/$(LIB_NAME) + +clean: + rm -rf $(BUILD_ROOT) + +#Rule +$(BUILDDIR)/%.o: %.c ../include/hsakmt.h ../include/hsakmttypes.h ../include/linux/kfd_ioctl.h + @echo Compiling $^ + @mkdir -p $(dir $@) + gcc $(CFLAGS) -c $< -o $@ diff --git a/hsakmt/debug.c b/hsakmt/debug.c new file mode 100644 index 0000000..46f72e7 --- /dev/null +++ b/hsakmt/debug.c @@ -0,0 +1,249 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include "libhsakmt.h" +#include "linux/kfd_ioctl.h" +#include <stdlib.h> +#include <string.h> + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDbgRegister( + HSAuint32 NodeId //IN + ) +{ + HSAKMT_STATUS result; + uint32_t gpu_id; + CHECK_KFD_OPEN(); + + result = validate_nodeid(NodeId, &gpu_id); + if (result != HSAKMT_STATUS_SUCCESS) + return result; + + struct kfd_ioctl_dbg_register_args args; + memset(&args, 0, sizeof(args)); + args.gpu_id = gpu_id; + long err = kmtIoctl(kfd_fd, AMDKFD_IOC_DBG_REGISTER, &args); + + if (err == 0) + result = HSAKMT_STATUS_SUCCESS; + else + result = HSAKMT_STATUS_ERROR; + + return (result); +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDbgUnregister( + HSAuint32 NodeId //IN + ) +{ + HSAKMT_STATUS result; + uint32_t gpu_id; + CHECK_KFD_OPEN(); + + result = validate_nodeid(NodeId, &gpu_id); + if (result != HSAKMT_STATUS_SUCCESS) + return result; + + struct kfd_ioctl_dbg_unregister_args args; + memset(&args, 0, sizeof(args)); + args.gpu_id = gpu_id; + long err = kmtIoctl(kfd_fd, AMDKFD_IOC_DBG_UNREGISTER, &args); + if (err == 0) + result = HSAKMT_STATUS_SUCCESS; + else + result = HSAKMT_STATUS_ERROR; + + return (result); +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDbgWavefrontControl( + HSAuint32 NodeId, //IN + HSA_DBG_WAVEOP Operand, //IN + HSA_DBG_WAVEMODE Mode, //IN + HSAuint32 TrapId, //IN + HsaDbgWaveMessage* DbgWaveMsgRing //IN (? - see thunk API doc!) 
+ ) +{ + HSAKMT_STATUS result; + uint32_t gpu_id; + + struct kfd_ioctl_dbg_wave_control_args *args; + + CHECK_KFD_OPEN(); + + result = validate_nodeid(NodeId, &gpu_id); + if (result != HSAKMT_STATUS_SUCCESS) + return result; + + /* Determine Size of the ioctl buffer */ + uint32_t buff_size = sizeof(Operand) + + sizeof(Mode) + sizeof(TrapId) + + sizeof(DbgWaveMsgRing->DbgWaveMsg) + + sizeof(DbgWaveMsgRing->MemoryVA) + + sizeof(*args); + + args = (struct kfd_ioctl_dbg_wave_control_args*) malloc(buff_size); + if (args == NULL) + return HSAKMT_STATUS_ERROR; + + memset(args, 0, buff_size); + + args->gpu_id = gpu_id; + args->buf_size_in_bytes = buff_size; + + /* increment pointer to the start of the non fixed part */ + unsigned char* run_ptr = (unsigned char*)args + sizeof(*args); + + /* save variable content pointer for kfd */ + args->content_ptr = (uint64_t) run_ptr; + + /* insert items, and increment pointer accordingly */ + *((HSA_DBG_WAVEOP*)run_ptr) = Operand; + run_ptr += sizeof(Operand); + + *((HSA_DBG_WAVEMODE*)run_ptr) = Mode; + run_ptr += sizeof(Mode); + + *((HSAuint32*)run_ptr) = TrapId; + run_ptr += sizeof(TrapId); + + *((HsaDbgWaveMessageAMD*)run_ptr) = DbgWaveMsgRing->DbgWaveMsg; + run_ptr += sizeof(DbgWaveMsgRing->DbgWaveMsg); + + *((void**)run_ptr) = DbgWaveMsgRing->MemoryVA; + run_ptr += sizeof(DbgWaveMsgRing->MemoryVA); + + /* send to kernel */ + long err = kmtIoctl(kfd_fd, AMDKFD_IOC_DBG_WAVE_CONTROL, args); + + free (args); + + if (err == 0) + return HSAKMT_STATUS_SUCCESS; + else + return HSAKMT_STATUS_ERROR; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDbgAddressWatch( + HSAuint32 NodeId, //IN + HSAuint32 NumWatchPoints, //IN + HSA_DBG_WATCH_MODE WatchMode[], //IN + void* WatchAddress[], //IN + HSAuint64 WatchMask[], //IN, optional + HsaEvent* WatchEvent[] //IN, optional + ) +{ + HSAKMT_STATUS result; + uint32_t gpu_id; + struct kfd_ioctl_dbg_address_watch_args *args; + uint32_t buff_size; + uint32_t watch_mask_items, watch_event_items; + HSAuint32 i; + + 
/* + * Determine how many watch-mask and watch-event entries go into the ioctl buffer. + * Both arrays are optional: a NULL WatchEvent contributes no entries, while a + * NULL WatchMask (or one whose first entry is 0) contributes a single placeholder + * entry - the same buffer layout the zero-first-entry case has always produced. + */ + + watch_mask_items = (WatchMask != NULL && WatchMask[0] > 0) ? NumWatchPoints : 1; + watch_event_items = WatchEvent != NULL ? NumWatchPoints : 0; + + CHECK_KFD_OPEN(); + + result = validate_nodeid(NodeId, &gpu_id); + if (result != HSAKMT_STATUS_SUCCESS) + return result; + + if (NumWatchPoints > MAX_ALLOWED_NUM_POINTS) + return HSAKMT_STATUS_INVALID_PARAMETER; + + /* + * Size and structure of the ioctl buffer is dynamic in this case + * Here we calculate the buff size. + */ + + buff_size = sizeof(NumWatchPoints) + + (sizeof(WatchMode[0]) + sizeof(WatchAddress[0])) * + NumWatchPoints + + watch_mask_items * sizeof(HSAuint64) + + watch_event_items * sizeof(HsaEvent*)+ + sizeof(*args); + + args = (struct kfd_ioctl_dbg_address_watch_args*) malloc(buff_size); + if (args == NULL) + return HSAKMT_STATUS_ERROR; + + memset(args, 0, buff_size); + + args->gpu_id = gpu_id; + args->buf_size_in_bytes = buff_size; + + /* increment pointer to the start of the non fixed part */ + unsigned char* run_ptr = (unsigned char*)args + sizeof(*args); + + /* save variable content pointer for kfd */ + args->content_ptr = (uint64_t) run_ptr; + + /* insert items, and increment pointer accordingly */ + *((HSAuint32*)run_ptr) = NumWatchPoints; + run_ptr += sizeof(NumWatchPoints); + + for (i = 0 ; i < NumWatchPoints ; i++) { + *((HSA_DBG_WATCH_MODE*)run_ptr) = WatchMode[i]; + run_ptr += sizeof(WatchMode[i]); + } + + for (i = 0 ; i < NumWatchPoints ; i++) { + *((void**)run_ptr) = WatchAddress[i]; + run_ptr += sizeof(WatchAddress[i]); + } + + for (i = 0 ; i < watch_mask_items ; i++) { + /* args was zeroed above, so an omitted WatchMask leaves a 0 placeholder */ + if (WatchMask != NULL) + *((HSAuint64*)run_ptr) = WatchMask[i]; + run_ptr += sizeof(WatchMask[i]); + } + + for (i = 0 ; i < watch_event_items ; i++) { + *((HsaEvent**)run_ptr) = WatchEvent[i]; + run_ptr += sizeof(WatchEvent[i]); + } + + /* send to kernel */ + long err = kmtIoctl(kfd_fd, AMDKFD_IOC_DBG_ADDRESS_WATCH, args); + 
free (args); + + if (err != 0) + return HSAKMT_STATUS_ERROR; + + return HSAKMT_STATUS_SUCCESS; +} diff --git a/hsakmt/events.c b/hsakmt/events.c new file mode 100644 index 0000000..5d6835e --- /dev/null +++ b/hsakmt/events.c @@ -0,0 +1,269 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "libhsakmt.h" +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <errno.h> +#include <unistd.h> +#include <sys/mman.h> +#include "linux/kfd_ioctl.h" + +static HSAuint64 *events_page = NULL; + +static bool IsSystemEventType(HSA_EVENTTYPE type) +{ + // Debug events behave as signal events. 
return (type != HSA_EVENTTYPE_SIGNAL && type != HSA_EVENTTYPE_DEBUG_EVENT); +} + +/* + * Create a KFD event and return a newly allocated HsaEvent through *Event. + * On the failure paths after the parameter check, *Event is set to NULL and + * HSAKMT_STATUS_ERROR is returned. When IsSignaled is set and the event type + * is not a system event type, the new event is signaled right away. + * The caller releases the event with hsaKmtDestroyEvent(). + */ +HSAKMT_STATUS +HSAKMTAPI +hsaKmtCreateEvent( + HsaEventDescriptor* EventDesc, //IN + bool ManualReset, //IN + bool IsSignaled, //IN + HsaEvent** Event //OUT + ) +{ + CHECK_KFD_OPEN(); + + if (EventDesc->EventType >= HSA_EVENTTYPE_MAXID) + { + return HSAKMT_STATUS_INVALID_PARAMETER; + } + + HsaEvent* e = malloc(sizeof(HsaEvent)); + if (e == NULL) + { + return HSAKMT_STATUS_ERROR; + } + + memset(e, 0, sizeof(*e)); + + struct kfd_ioctl_create_event_args args; + memset(&args, 0, sizeof(args)); + + args.event_type = EventDesc->EventType; + args.auto_reset = !ManualReset; + + if (kmtIoctl(kfd_fd, AMDKFD_IOC_CREATE_EVENT, &args) != 0) { + free(e); + *Event = NULL; + return HSAKMT_STATUS_ERROR; + } + + if (events_page == NULL && args.event_page_offset > 0) { + /* mmap reports failure with MAP_FAILED, never NULL; keep events_page NULL on failure so a later call can retry */ + void* page = mmap(NULL, 4096, PROT_WRITE | PROT_READ, + MAP_SHARED, kfd_fd, args.event_page_offset); + if (page == MAP_FAILED) { + /* destroy the kernel event that was just created; hsaKmtDestroyEvent frees e only on success */ + e->EventId = args.event_id; + if (hsaKmtDestroyEvent(e) != HSAKMT_STATUS_SUCCESS) + free(e); + *Event = NULL; + return HSAKMT_STATUS_ERROR; + } + events_page = page; + } + + if (args.event_page_offset > 0 && args.event_slot_index < KFD_SIGNAL_EVENT_LIMIT) + e->EventData.HWData2 = (HSAuint64)&events_page[args.event_slot_index]; + + e->EventId = args.event_id; + e->EventData.EventType = EventDesc->EventType; + e->EventData.HWData1 = args.event_id; + + e->EventData.HWData3 = args.event_trigger_data; + + if (IsSignaled && !IsSystemEventType(e->EventData.EventType)) { + struct kfd_ioctl_set_event_args set_args; + memset(&set_args, 0, sizeof(set_args)); + set_args.event_id = args.event_id; + + kmtIoctl(kfd_fd, AMDKFD_IOC_SET_EVENT, &set_args); + } + + *Event = e; + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDestroyEvent( + HsaEvent* Event //IN + ) +{ + CHECK_KFD_OPEN(); + + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + struct kfd_ioctl_destroy_event_args args; + memset(&args, 0, sizeof(args)); + + args.event_id = Event->EventId; + + if (kmtIoctl(kfd_fd, AMDKFD_IOC_DESTROY_EVENT, 
&args) != 0) { + return HSAKMT_STATUS_ERROR; + } + + free(Event); + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSetEvent( + HsaEvent* Event //IN + ) +{ + CHECK_KFD_OPEN(); + + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + /* Although the spec is doesn't say, don't allow system-defined events to be signaled. */ + if (IsSystemEventType(Event->EventData.EventType)) + return HSAKMT_STATUS_ERROR; + + struct kfd_ioctl_set_event_args args; + memset(&args, 0, sizeof(args)); + + args.event_id = Event->EventId; + + if (kmtIoctl(kfd_fd, AMDKFD_IOC_SET_EVENT, &args) == -1) + return HSAKMT_STATUS_ERROR; + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtResetEvent( + HsaEvent* Event //IN + ) +{ + CHECK_KFD_OPEN(); + + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + /* Although the spec is doesn't say, don't allow system-defined events to be signaled. */ + if (IsSystemEventType(Event->EventData.EventType)) + return HSAKMT_STATUS_ERROR; + + struct kfd_ioctl_reset_event_args args; + memset(&args, 0, sizeof(args)); + + args.event_id = Event->EventId; + + if (kmtIoctl(kfd_fd, AMDKFD_IOC_RESET_EVENT, &args) == -1) + return HSAKMT_STATUS_ERROR; + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtQueryEventState( + HsaEvent* Event //IN + ) +{ + CHECK_KFD_OPEN(); + + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtWaitOnEvent( + HsaEvent* Event, //IN + HSAuint32 Milliseconds //IN + ) +{ + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + return hsaKmtWaitOnMultipleEvents(&Event, 1, true, Milliseconds); +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtWaitOnMultipleEvents( + HsaEvent* Events[], //IN + HSAuint32 NumEvents, //IN + bool WaitOnAll, //IN + HSAuint32 Milliseconds //IN + ) +{ + CHECK_KFD_OPEN(); + + if (!Events) + return HSAKMT_STATUS_INVALID_HANDLE; + + struct kfd_event_data *event_data = malloc(NumEvents * sizeof(struct 
kfd_event_data)); + /* malloc may legitimately return NULL for a zero-sized request, so only a non-empty request counts as failure */ + if (event_data == NULL && NumEvents != 0) + return HSAKMT_STATUS_ERROR; + + for (HSAuint32 i = 0; i < NumEvents; i++) { + event_data[i].event_id = Events[i]->EventId; + event_data[i].kfd_event_data_ext = (uint64_t)(uintptr_t)NULL; + } + + struct kfd_ioctl_wait_events_args args; + memset(&args, 0, sizeof(args)); + + args.wait_for_all = WaitOnAll; + args.timeout = Milliseconds; + args.num_events = NumEvents; + args.events_ptr = (uint64_t)(uintptr_t)event_data; + + HSAKMT_STATUS result; + + if (kmtIoctl(kfd_fd, AMDKFD_IOC_WAIT_EVENTS, &args) == -1) { + result = HSAKMT_STATUS_ERROR; + } + else if (args.wait_result == KFD_IOC_WAIT_RESULT_TIMEOUT) { + result = HSAKMT_STATUS_WAIT_TIMEOUT; + } + else { + result = HSAKMT_STATUS_SUCCESS; + /* copy the fault details the ioctl wrote into event_data back into each memory event */ + for (HSAuint32 i = 0; i < NumEvents; i++) { + if (Events[i]->EventData.EventType == HSA_EVENTTYPE_MEMORY) { + Events[i]->EventData.EventData.MemoryAccessFault.VirtualAddress = event_data[i].memory_exception_data.va; + result = gpuid_to_nodeid(event_data[i].memory_exception_data.gpu_id, &Events[i]->EventData.EventData.MemoryAccessFault.NodeId); + if (result != HSAKMT_STATUS_SUCCESS) + goto out; + Events[i]->EventData.EventData.MemoryAccessFault.Failure.NotPresent = event_data[i].memory_exception_data.failure.NotPresent; + Events[i]->EventData.EventData.MemoryAccessFault.Failure.ReadOnly = event_data[i].memory_exception_data.failure.ReadOnly; + Events[i]->EventData.EventData.MemoryAccessFault.Failure.NoExecute = event_data[i].memory_exception_data.failure.NoExecute; + Events[i]->EventData.EventData.MemoryAccessFault.Flags = HSA_EVENTID_MEMORY_FATAL_PROCESS; + } + } + } +out: + free(event_data); + + return result; +} diff --git a/hsakmt/fmm.c b/hsakmt/fmm.c new file mode 100644 index 0000000..a90fb95 --- /dev/null +++ b/hsakmt/fmm.c @@ -0,0 +1,486 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "fmm.h" +#include "linux/kfd_ioctl.h" +#include "libhsakmt.h" +#include <stdlib.h> +#include <stdio.h> +#include <inttypes.h> +#include <sys/mman.h> + +#define NON_VALID_GPU_ID 0 +#define ARRAY_LEN(array) (sizeof(array) / sizeof(array[0])) +#define INIT_APERTURE(base_value, limit_value) {.base = (void*)base_value, .limit = (void*)limit_value } +#define INIT_MANAGEBLE_APERTURE(base_value, limit_value) {.base = (void*)base_value,.limit = (void*)limit_value, .vm_ranges = NULL, .vm_objects = NULL, .fmm_mutex = PTHREAD_MUTEX_INITIALIZER} +#define INIT_GPU_MEM \ +{ .gpu_id = NON_VALID_GPU_ID,\ + .lds_aperture = INIT_APERTURE(0, 0), \ + .scratch_aperture = INIT_MANAGEBLE_APERTURE(0, 0),\ + .gpuvm_aperture = INIT_MANAGEBLE_APERTURE(0, 0)\ +} + +#define INIT_GPUs_MEM {[0 ... 
(NUM_OF_SUPPORTED_GPUS-1)] = INIT_GPU_MEM} +struct vm_object{ + void* start; + HSAuint64 size; + HSAuint64 handle; // opaque + struct vm_object* next; + struct vm_object* prev; +}; +typedef struct vm_object vm_object_t; + +struct vm_area{ + void* start; + void* end; + struct vm_area* next; + struct vm_area* prev; +}; +typedef struct vm_area vm_area_t; + +typedef struct { + void* base; + void* limit; + vm_area_t* vm_ranges; + vm_object_t* vm_objects; + pthread_mutex_t fmm_mutex; +} manageble_aperture_t; + +typedef struct { + void* base; + void* limit; +} aperture_t; + +typedef struct{ + HSAuint32 gpu_id; + aperture_t lds_aperture; + manageble_aperture_t scratch_aperture; + manageble_aperture_t gpuvm_aperture; +}gpu_mem_t; + +static gpu_mem_t gpu_mem[] = INIT_GPUs_MEM; + +static vm_area_t* vm_create_and_init_area(void* start, void* end){ + vm_area_t* area = (vm_area_t*)malloc(sizeof(vm_area_t));// TODO: Memory pool ??? + if (area){ + area->start = start; + area->end = end; + area->next = area->prev = NULL; + } + + return area; +} + +static vm_object_t* vm_create_and_init_object(void* start, uint64_t size, uint64_t handle){ + vm_object_t* object = (vm_object_t*)malloc(sizeof(vm_object_t)); // TODO: Memory pool ??? 
+ if (object){ + object->start = start; + object->size = size; + object->handle = handle; + object->next = object->prev = NULL; + } + + return object; +} + + +static void vm_remove_area(manageble_aperture_t* app, vm_area_t* area){ + vm_area_t* next; + vm_area_t* prev; + + next = area->next; + prev = area->prev; + + if (prev == NULL )// The first element + app->vm_ranges = next; + else + prev->next = next; + + if(next) // If not the last element + next->prev = prev; + + free(area); + +} + +static void vm_remove_object(manageble_aperture_t* app, vm_object_t* object){ + vm_object_t* next; + vm_object_t* prev; + + next = object->next; + prev = object->prev; + + if (prev == NULL )// The first element + app->vm_objects = next; + else + prev->next = next; + + if(next) // If not the last element + next->prev = prev; + + free(object); + +} + + + +static void vm_add_area_after(vm_area_t* after_this, vm_area_t* new_area){ + vm_area_t* next = after_this->next; + after_this->next = new_area; + new_area->next = next; + + new_area->prev = after_this; + if (next) + next->prev = new_area; +} + +static void vm_add_object_before(vm_object_t* before_this, vm_object_t* new_object){ + vm_object_t* prev = before_this->prev; + before_this->prev = new_object; + new_object->next = before_this; + + new_object->prev = prev; + if (prev) + prev->next = new_object; +} + +static void vm_split_area(manageble_aperture_t* app, vm_area_t* area, void* address, uint64_t MemorySizeInBytes){ + + // The existing area is split to: [area->start, address - 1] and [address + MemorySizeInBytes, area->end] + vm_area_t* new_area = vm_create_and_init_area(VOID_PTR_ADD(address,MemorySizeInBytes), area->end); + + // Shrink the existing area + area->end = VOID_PTR_SUB(address,1); + + vm_add_area_after(area, new_area); + +} + +static vm_object_t* vm_find_object_by_address(manageble_aperture_t* app, void* address, uint64_t size){ + vm_object_t* cur = app->vm_objects; + + // Look up the appropriate address range 
containing the given address + while(cur){ + if(cur->start == address && cur->size == size) + break; + cur = cur->next; + }; + + return cur; // NULL if not found +} + +static vm_area_t* vm_find(manageble_aperture_t* app, void* address){ + vm_area_t* cur = app->vm_ranges; + + // Look up the appropriate address range containing the given address + while(cur){ + if(cur->start <= address && cur->end >= address) + break; + cur = cur->next; + }; + + return cur; // NULL if not found +} + +static bool aperture_is_valid(void* app_base, void* app_limit){ + if (app_base && app_limit && app_base < app_limit) + return true; + return false; +} + +/* + * Assumes that fmm_mutex is locked on entry. + */ +static int aperture_release(manageble_aperture_t* app, void* address, uint64_t MemorySizeInBytes){ + int rc = -1; + vm_area_t* area; + + area = vm_find(app, address); + vm_object_t* object = vm_find_object_by_address(app, address, MemorySizeInBytes); + if (object && area){ + vm_remove_object(app, object); + if (VOID_PTRS_SUB(area->end, area->start) + 1 > MemorySizeInBytes){ // the size of the released block is less than the size of area + if (area->start == address){ // shrink from the start + area->start = VOID_PTR_ADD(area->start,MemorySizeInBytes); + } else if (VOID_PTRS_SUB(area->end, address) + 1 == MemorySizeInBytes){ // shrink from the end + area->end = VOID_PTR_SUB(area->end, MemorySizeInBytes); + } else { // split the area + vm_split_area(app, area, address, MemorySizeInBytes); + } + rc = 0; + } else if (VOID_PTRS_SUB(area->end, area->start) + 1 == MemorySizeInBytes){ // the size of the released block is exactly the same as the size of area + vm_remove_area(app, area); + rc = 0; + } else { + //Inconsistent data. Fail it? + rc = -1; + } + } + + return rc; +} + +/* + * returns allocated address or NULL. Assumes, that fmm_mutex is locked on entry. 
+ */ +static void* aperture_allocate(manageble_aperture_t* app, uint64_t MemorySizeInBytes){ + vm_area_t* cur, *next, *new_area, *start; + vm_object_t* new_object; + void* new_address = NULL; + next = NULL; + new_area = NULL; + + cur = app->vm_ranges; + if (cur){ // not empty + + // Look up the appropriate address space "hole" or end of the list + while(cur){ + next = cur->next; + + // End of the list reached + if (!next) + break; + + // address space "hole" + if ((VOID_PTRS_SUB(next->start,cur->end) >= MemorySizeInBytes)) + break; + + cur = next; + }; + + // If the new range is inside the reserved aperture + if (VOID_PTRS_SUB(app->limit, cur->end) + 1 >= MemorySizeInBytes){ + // cur points to the last inspected element: the tail of the list or the found "hole" + // Just extend the existing region + new_address = VOID_PTR_ADD(cur->end, 1); + cur->end = VOID_PTR_ADD(cur->end, MemorySizeInBytes); + } else + new_address = NULL; + + } else { // empty - create the first area + start = (void*)app->base; + new_area = vm_create_and_init_area(start, VOID_PTR_ADD(start, (MemorySizeInBytes - 1))); + if (new_area){ + app->vm_ranges = new_area; + new_address = new_area->start; + } + } + + // Allocate new object + if (new_address){ + new_object = vm_create_and_init_object(new_address, MemorySizeInBytes, 0); + if (new_object){ + if (app->vm_objects == NULL){ // empty list + // Update head + app->vm_objects = new_object; + } else { + // Add it before the first element + vm_add_object_before(app->vm_objects, new_object); + // Update head + app->vm_objects = new_object; + } + } else{ + // Failed to allocate object: remove just allocated range and return NULL + aperture_release(app, new_address, MemorySizeInBytes); + new_address = NULL; + } + } + + return new_address; + +} + + + +static int32_t gpu_mem_find_by_gpu_id(uint32_t gpu_id){ + int32_t i; + + for(i = 0; i < NUM_OF_SUPPORTED_GPUS; i++){ + if(gpu_mem[i].gpu_id == gpu_id) + return i; + } + + return -1; +} + +bool 
fmm_is_inside_some_aperture(void* address){ + + int32_t i; + + for(i = 0; i < NUM_OF_SUPPORTED_GPUS; i++){ + if(gpu_mem[i].gpu_id != NON_VALID_GPU_ID){ + if ((address>= gpu_mem[i].lds_aperture.base) && (address<= gpu_mem[i].lds_aperture.limit)) + return true; + if ((address>= gpu_mem[i].gpuvm_aperture.base) && (address<= gpu_mem[i].gpuvm_aperture.limit)) + return true; + if ((address>= gpu_mem[i].scratch_aperture.base) && (address<= gpu_mem[i].scratch_aperture.limit)) + return true; + } + } + + return false; +} + +#ifdef DEBUG_PRINT_APERTURE +static void aperture_print(aperture_t* app){ + printf("\t Base: %p\n", app->base); + printf("\t Limit: %p\n", app->limit); +} + +static void manageble_aperture_print(manageble_aperture_t* app){ + vm_area_t* cur = app->vm_ranges; + vm_object_t *object = app->vm_objects; + + printf("\t Base: %p\n", app->base); + printf("\t Limit: %p\n", app->limit); + printf("\t Ranges: \n"); + while(cur){ + printf("\t\t Range [%p - %p] \n", cur->start, cur->end); + cur = cur->next; + }; + printf("\t Objects: \n"); + while(object){ + printf("\t\t Object [%p - %" PRIu64 "] \n", object->start, object->size); + object = object->next; + }; +} + +void fmm_print(uint32_t gpu_id){ + int32_t i = gpu_mem_find_by_gpu_id(gpu_id); + if(i >= 0){ // Found + printf("LDS aperture: \n"); + aperture_print(&gpu_mem[i].lds_aperture); + printf("GPUVM aperture: \n"); + manageble_aperture_print(&gpu_mem[i].gpuvm_aperture); + printf("Scratch aperture: \n"); + manageble_aperture_print(&gpu_mem[i].scratch_aperture); + + } +} +#else +void fmm_print(uint32_t gpu_id){ + +} +#endif + + +void* fmm_allocate_scratch(uint32_t gpu_id, uint64_t MemorySizeInBytes){ + + void* mem = NULL; + int32_t i = gpu_mem_find_by_gpu_id(gpu_id); + + // If not found or aperture isn't properly initialized/supported + if(i < 0 || !aperture_is_valid(gpu_mem[i].scratch_aperture.base, gpu_mem[i].scratch_aperture.limit)) + return NULL; + + pthread_mutex_lock(&gpu_mem[i].scratch_aperture.fmm_mutex); + 
mem = aperture_allocate(&gpu_mem[i].scratch_aperture, MemorySizeInBytes); + pthread_mutex_unlock(&gpu_mem[i].scratch_aperture.fmm_mutex); + + return mem; +} + +void* fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes){ + + void* mem = NULL; + int32_t i = gpu_mem_find_by_gpu_id(gpu_id); + + // If not found or aperture isn't properly initialized/supported + if(i < 0 || !aperture_is_valid(gpu_mem[i].gpuvm_aperture.base, gpu_mem[i].gpuvm_aperture.limit)) + return NULL; + + pthread_mutex_lock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); + mem = aperture_allocate(&gpu_mem[i].gpuvm_aperture, MemorySizeInBytes); + pthread_mutex_unlock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); + + return mem; +} + + +/* + * Release the block [address, address + MemorySizeInBytes) from whichever + * GPUVM or scratch aperture contains the address. + * Returns the aperture_release() result (0 on success), or -1 when the + * address lies in no aperture of any registered GPU. + */ +int fmm_release(void* address, uint64_t MemorySizeInBytes){ + + uint32_t i; + int32_t rc = -1; + + for(i = 0; i < NUM_OF_SUPPORTED_GPUS; i++){ + if(gpu_mem[i].gpu_id == NON_VALID_GPU_ID) + continue; + + if (address >= gpu_mem[i].gpuvm_aperture.base && address <= gpu_mem[i].gpuvm_aperture.limit){ + pthread_mutex_lock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); + rc = aperture_release(&gpu_mem[i].gpuvm_aperture, address, MemorySizeInBytes); + pthread_mutex_unlock(&gpu_mem[i].gpuvm_aperture.fmm_mutex); + fmm_print(gpu_mem[i].gpu_id); + } else if (address >= gpu_mem[i].scratch_aperture.base && address <= gpu_mem[i].scratch_aperture.limit){ + /* braces are required: lock, release and unlock must all be conditional on the address test */ + pthread_mutex_lock(&gpu_mem[i].scratch_aperture.fmm_mutex); + rc = aperture_release(&gpu_mem[i].scratch_aperture, address, MemorySizeInBytes); + pthread_mutex_unlock(&gpu_mem[i].scratch_aperture.fmm_mutex); + } + } + + return rc; +} + +HSAKMT_STATUS fmm_init_process_apertures(){ + struct kfd_ioctl_get_process_apertures_args args; + uint8_t node_id; + + if (0 == kmtIoctl(kfd_fd, AMDKFD_IOC_GET_PROCESS_APERTURES, (void*)&args)){ + /* never index past gpu_mem[], whatever node count the ioctl reports */ + for(node_id = 0; node_id < args.num_of_nodes && node_id < NUM_OF_SUPPORTED_GPUS; node_id++){ + gpu_mem[node_id].gpu_id = args.process_apertures[node_id].gpu_id; + gpu_mem[node_id].lds_aperture.base = 
PORT_UINT64_TO_VPTR(args.process_apertures[node_id].lds_base); + gpu_mem[node_id].lds_aperture.limit = PORT_UINT64_TO_VPTR(args.process_apertures[node_id].lds_limit); + gpu_mem[node_id].gpuvm_aperture.base = PORT_UINT64_TO_VPTR(args.process_apertures[node_id].gpuvm_base); + gpu_mem[node_id].gpuvm_aperture.limit = PORT_UINT64_TO_VPTR(args.process_apertures[node_id].gpuvm_limit); + gpu_mem[node_id].scratch_aperture.base = PORT_UINT64_TO_VPTR(args.process_apertures[node_id].scratch_base); + gpu_mem[node_id].scratch_aperture.limit = PORT_UINT64_TO_VPTR(args.process_apertures[node_id].scratch_limit); + } + + return HSAKMT_STATUS_SUCCESS; + } + + return HSAKMT_STATUS_ERROR; + +} + +/* + * Return the base address of the requested aperture of the given GPU. + * Returns 0 when the GPU id is unknown, the aperture type is not recognised, + * or the aperture's base/limit pair is not valid. + */ +HSAuint64 fmm_get_aperture_base(aperture_type_e aperture_type, HSAuint32 gpu_id){ + int32_t slot = gpu_mem_find_by_gpu_id(gpu_id); + if (slot<0) + return 0; /* the return value is an address, not an HSAKMT_STATUS code */ + + switch(aperture_type){ + case FMM_GPUVM: + return aperture_is_valid(gpu_mem[slot].gpuvm_aperture.base, gpu_mem[slot].gpuvm_aperture.limit) ? PORT_VPTR_TO_UINT64(gpu_mem[slot].gpuvm_aperture.base) : 0; + case FMM_SCRATCH: + return aperture_is_valid(gpu_mem[slot].scratch_aperture.base, gpu_mem[slot].scratch_aperture.limit) ? PORT_VPTR_TO_UINT64(gpu_mem[slot].scratch_aperture.base) : 0; + case FMM_LDS: + return aperture_is_valid(gpu_mem[slot].lds_aperture.base, gpu_mem[slot].lds_aperture.limit) ? PORT_VPTR_TO_UINT64(gpu_mem[slot].lds_aperture.base) : 0; + default: + return 0; + } + +} diff --git a/hsakmt/fmm.h b/hsakmt/fmm.h new file mode 100644 index 0000000..5924247 --- /dev/null +++ b/hsakmt/fmm.h @@ -0,0 +1,60 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef FMM_H_ +#define FMM_H_ + +#include "hsakmttypes.h" +#include <stddef.h> + +typedef enum { + FMM_FIRST_APERTURE_TYPE = 0, + FMM_GPUVM = FMM_FIRST_APERTURE_TYPE, + FMM_LDS, + FMM_SCRATCH, + FMM_LAST_APERTURE_TYPE +} aperture_type_e; + +typedef struct { + aperture_type_e app_type; + uint64_t size; + void* start_address; +} aperture_properties_t; + +HSAKMT_STATUS fmm_init_process_apertures(void); +/* + * Memory interface + */ +void* fmm_allocate_scratch(uint32_t gpu_id, uint64_t MemorySizeInBytes); +void* fmm_allocate_device(uint32_t gpu_id, uint64_t MemorySizeInBytes); +void fmm_print(uint32_t node); +bool fmm_is_inside_some_aperture(void* address); +int fmm_release(void* address, HSAuint64 MemorySizeInBytes); + +/* Topology interface*/ +HSAKMT_STATUS fmm_node_added(HSAuint32 gpu_id); +HSAKMT_STATUS fmm_node_removed(HSAuint32 gpu_id); +HSAuint64 fmm_get_aperture_base(aperture_type_e aperture_type, HSAuint32 gpu_id); +#endif /* FMM_H_ */ diff --git a/hsakmt/globals.c b/hsakmt/globals.c new file mode 100644 index 0000000..cad6b1f --- /dev/null +++ b/hsakmt/globals.c @@ -0,0 +1,33 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "libhsakmt.h" + +// HSAKMT global data + +int kfd_fd; +unsigned long kfd_open_count; +unsigned long system_properties_count; +pthread_mutex_t hsakmt_mutex = PTHREAD_MUTEX_INITIALIZER; diff --git a/hsakmt/hsakmt.h b/hsakmt/hsakmt.h new file mode 100644 index 0000000..c87b3f8 --- /dev/null +++ b/hsakmt/hsakmt.h @@ -0,0 +1,577 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef _HSAKMT_H_ +#define _HSAKMT_H_ + +#include "hsakmttypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + "Opens" the HSA kernel driver for user-kernel mode communication. + + On Windows, this function gets a handle to the KFD's AMDKFDIO device object that + is responsible for user-kernel communication, this handle is used internally by + the thunk library to send device I/O control to the HSA kernel driver. + No other thunk library function may be called unless the user-kernel communication + channel is opened first. + + On Linux this call opens the "/dev/kfd" device file to establish a communication + path to the kernel. +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtOpenKFD( void ); + +/** + "Closes" the user-kernel communication path. + + On Windows, the handle obtained by the hsaKmtOpenKFD() function is closed; + no other communication with the kernel driver is possible after the successful + execution of the hsaKmtCloseKFD() function. Depending on the failure reason, + the user-kernel communication path may or may not be still active. + + On Linux the function closes the "/dev/kfd" device file. + No further communication to the kernel driver is allowed until hsaKmtOpenKFD() + function is called again. +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtCloseKFD( void ); + + +/** + Returns the user-kernel interface version supported by KFD. + Higher major numbers usually add new features to KFD and may break user-kernel + compatibility; higher minor numbers define additional functionality associated + within a major number. + The calling software should validate that it meets the minimum interface version + as described in the API specification. 
+*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetVersion( + HsaVersionInfo* VersionInfo //OUT + ); + +/** + The function takes a "snapshot" of the topology information within the KFD + to avoid any changes during the enumeration process. +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtAcquireSystemProperties( + HsaSystemProperties* SystemProperties //OUT + ); + +/** + Releases the topology "snapshot" taken by hsaKmtAcquireSystemProperties() +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtReleaseSystemProperties( void ) ; + +/** + Retrieves the discoverable sub-properties for a given HSA + node. The parameters returned allow the application or runtime to size the + management structures necessary to store the information. +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetNodeProperties( + HSAuint32 NodeId, //IN + HsaNodeProperties* NodeProperties //OUT + ); + +/** + Retrieves the memory properties of a specific HSA node. + the memory pointer passed as MemoryProperties is sized as + NumBanks * sizeof(HsaMemoryProperties). NumBanks is retrieved with the + hsaKmtGetNodeProperties() call. + + Some of the data returned is optional. Not all implementations may return all + parameters in the hsaMemoryProperties. +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetNodeMemoryProperties( + HSAuint32 NodeId, //IN + HSAuint32 NumBanks, //IN + HsaMemoryProperties* MemoryProperties //OUT + ); + +/** + Retrieves the cache properties of a specific HSA node and processor ID. + ProcessorID refers to either a CPU core or a SIMD unit as enumerated earlier + via the hsaKmtGetNodeProperties() call. + The memory pointer passed as CacheProperties is sized as + NumCaches * sizeof(HsaCacheProperties). NumCaches is retrieved with the + hsaKmtGetNodeProperties() call. + + The data returned is optional. Not all implementations may return all + parameters in the CacheProperties. 
+*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetNodeCacheProperties( + HSAuint32 NodeId, //IN + HSAuint32 ProcessorId, //IN + HSAuint32 NumCaches, //IN + HsaCacheProperties* CacheProperties //OUT + ); + +/** + Retrieves the HSA IO affinity properties of a specific HSA node. + the memory pointer passed as Properties is sized as + NumIoLinks * sizeof(HsaIoLinkProperties). NumIoLinks is retrieved with the + hsaKmtGetNodeProperties() call. + + The data returned is optional. Not all implementations may return all + parameters in the IoLinkProperties. +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetNodeIoLinkProperties( + HSAuint32 NodeId, //IN + HSAuint32 NumIoLinks, //IN + HsaIoLinkProperties* IoLinkProperties //OUT + ); + + + +/** + Creates an operating system event associated with a HSA event ID +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtCreateEvent( + HsaEventDescriptor* EventDesc, //IN + bool ManualReset, //IN + bool IsSignaled, //IN + HsaEvent** Event //OUT + ); + +/** + Destroys an operating system event associated with a HSA event ID +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDestroyEvent( + HsaEvent* Event //IN + ); + +/** + Sets the specified event object to the signaled state +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSetEvent( + HsaEvent* Event //IN + ); + +/** + Sets the specified event object to the non-signaled state +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtResetEvent( + HsaEvent* Event //IN + ); + +/** + Queries the state of the specified event object +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtQueryEventState( + HsaEvent* Event //IN + ); + +/** + Checks the current state of the event object. If the object's state is + nonsignaled, the calling thread enters the wait state. + + The function returns when one of the following occurs: +- The specified event object is in the signaled state. +- The time-out interval elapses. 
+*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtWaitOnEvent( + HsaEvent* Event, //IN + HSAuint32 Milliseconds //IN + ); + +/** + Checks the current state of multiple event objects. + + The function returns when one of the following occurs: +- Either any one or all of the specified objects are in the signaled state + - if "WaitOnAll" is "true" the function returns when the state of all + objects in array is signaled + - if "WaitOnAll" is "false" the function returns when the state of any + one of the objects is set to signaled +- The time-out interval elapses. +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtWaitOnMultipleEvents( + HsaEvent* Events[], //IN + HSAuint32 NumEvents, //IN + bool WaitOnAll, //IN + HSAuint32 Milliseconds //IN + ); + +/** + new TEMPORARY function definition - to be used only on "Triniti + Southern Islands" platform + If used on other platforms the function will return HSAKMT_STATUS_ERROR +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtReportQueue( + HSA_QUEUEID QueueId, //IN + HsaQueueReport* QueueReport //OUT + ); + +/** + Creates a GPU queue with user-mode access rights +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtCreateQueue( + HSAuint32 NodeId, //IN + HSA_QUEUE_TYPE Type, //IN + HSAuint32 QueuePercentage, //IN + HSA_QUEUE_PRIORITY Priority, //IN + void* QueueAddress, //IN + HSAuint64 QueueSizeInBytes, //IN + HsaEvent* Event, //IN + HsaQueueResource* QueueResource //OUT + ); + +/** + Updates a queue +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtUpdateQueue( + HSA_QUEUEID QueueId, //IN + HSAuint32 QueuePercentage,//IN + HSA_QUEUE_PRIORITY Priority, //IN + void* QueueAddress, //IN + HSAuint64 QueueSize, //IN + HsaEvent* Event //IN + ); + +/** + Destroys a queue +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDestroyQueue( + HSA_QUEUEID QueueId //IN + ); + +/** + Allows an HSA process to set/change the default and alternate memory coherency, before starting to dispatch. 
+*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSetMemoryPolicy( + HSAuint32 Node, //IN + HSAuint32 DefaultPolicy, //IN + HSAuint32 AlternatePolicy, //IN + void* MemoryAddressAlternate, //IN (page-aligned) + HSAuint64 MemorySizeInBytes //IN (page-aligned) + ); +/** + Allocates a memory buffer that may be accessed by the GPU +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtAllocMemory( + HSAuint32 PreferredNode, //IN + HSAuint64 SizeInBytes, //IN (multiple of page size) + HsaMemFlags MemFlags, //IN + void** MemoryAddress //OUT (page-aligned) + ); + +/** + Frees a memory buffer +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtFreeMemory( + void* MemoryAddress, //IN (page-aligned) + HSAuint64 SizeInBytes //IN + ); + +/** + Registers with KFD a memory buffer that may be accessed by the GPU + This function will never be required for Linux +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtRegisterMemory( + void* MemoryAddress, //IN (page-aligned) + HSAuint64 MemorySizeInBytes //IN (page-aligned) + ); + + +/** + Unregisters with KFD a memory buffer + This function will never be required for Linux +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDeregisterMemory( + void* MemoryAddress //IN + ); + + +/** + Ensures that the memory is resident and can be accessed by GPU +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtMapMemoryToGPU( + void* MemoryAddress, //IN (page-aligned) + HSAuint64 MemorySizeInBytes, //IN (page-aligned) + HSAuint64* AlternateVAGPU //OUT (page-aligned) + ); + +/** + Releases the residency of the memory +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtUnmapMemoryToGPU( + void* MemoryAddress //IN (page-aligned) + ); + + +/** + Notifies the kernel driver that a process wants to use GPU debugging facilities +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDbgRegister( + HSAuint32 NodeId //IN + ); + +/** + Detaches the debugger process from the HW debug established by hsaKmtDbgRegister() API +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDbgUnregister( + HSAuint32 NodeId //IN + ); + +/** + Controls a wavefront +*/ + +HSAKMT_STATUS +HSAKMTAPI 
+hsaKmtDbgWavefrontControl( + HSAuint32 NodeId, //IN + HSA_DBG_WAVEOP Operand, //IN + HSA_DBG_WAVEMODE Mode, //IN + HSAuint32 TrapId, //IN + HsaDbgWaveMessage* DbgWaveMsgRing //IN + ); + +/** + Sets watch points on memory address ranges to generate exception events when the + watched addresses are accessed +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDbgAddressWatch( + HSAuint32 NodeId, //IN + HSAuint32 NumWatchPoints, //IN + HSA_DBG_WATCH_MODE WatchMode[], //IN + void* WatchAddress[], //IN + HSAuint64 WatchMask[], //IN, optional + HsaEvent* WatchEvent[] //IN, optional + ); + +/** + Gets GPU and CPU clock counters for particular Node +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetClockCounters( + HSAuint32 NodeId, //IN + HsaClockCounters* Counters //OUT + ); + +/** + Retrieves information on the available HSA counters +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcGetCounterProperties( + HSAuint32 NodeId, //IN + HsaCounterProperties** CounterProperties //OUT + ); + +/** + Registers a set of (HW) counters to be used for tracing/profiling +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcRegisterTrace( + HSAuint32 NodeId, //IN + HSAuint32 NumberOfCounters, //IN + HsaCounter* Counters, //IN + HsaPmcTraceRoot* TraceRoot //OUT + ); + +/** + Unregisters a set of (HW) counters used for tracing/profiling +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcUnregisterTrace( + HSAuint32 NodeId, //IN + HSATraceId TraceId //IN + ); + +/** + Allows a user mode process to get exclusive access to the defined set of (HW) counters + used for tracing/profiling +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcAcquireTraceAccess( + HSAuint32 NodeId, //IN + HSATraceId TraceId //IN + ); + +/** + Allows a user mode process to release exclusive access to the defined set of (HW) counters + used for tracing/profiling +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcReleaseTraceAccess( + HSAuint32 NodeId, //IN + HSATraceId TraceId //IN + ); + +/** + Starts tracing operation on a previously established set of performance counters +*/ + 
+HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcStartTrace( + HSATraceId TraceId, //IN + void* TraceBuffer, //IN (page aligned) + HSAuint64 TraceBufferSizeBytes //IN (page aligned) + ); + +/** + Forces an update of all the counters that a previously started trace operation has registered +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcQueryTrace( + HSATraceId TraceId //IN + ); + +/** + Stops tracing operation on a previously established set of performance counters +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcStopTrace( + HSATraceId TraceId //IN + ); + +/** + Sets trap handler and trap buffer to be used for all queues associated with the specified NodeId within this process context +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSetTrapHandler( + HSAuint32 NodeId, //IN + void* TrapHandlerBaseAddress, //IN + HSAuint64 TrapHandlerSizeInBytes, //IN + void* TrapBufferBaseAddress, //IN + HSAuint64 TrapBufferSizeInBytes //IN + ); + +#ifdef __cplusplus +} //extern "C" +#endif + +#endif //_HSAKMT_H_ + diff --git a/hsakmt/hsakmttypes.h b/hsakmt/hsakmttypes.h new file mode 100644 index 0000000..a7e0a81 --- /dev/null +++ b/hsakmt/hsakmttypes.h @@ -0,0 +1,909 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef _HSAKMTTYPES_H_ +#define _HSAKMTTYPES_H_ + +//the definitions and THUNK API are version specific - define the version numbers here +#define HSAKMT_VERSION_MAJOR 0 +#define HSAKMT_VERSION_MINOR 99 + + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN64) || defined(_WINDOWS) || defined(_WIN32) + + #if defined(_WIN32) + #define HSAKMTAPI __stdcall + #else + #define HSAKMTAPI + #endif + + typedef unsigned char HSAuint8; + typedef char HSAint8; + typedef unsigned short HSAuint16; + typedef signed short HSAint16; + typedef unsigned __int32 HSAuint32; + typedef signed __int64 HSAint64; + typedef unsigned __int64 HSAuint64; + +#elif defined(__linux__) + +#include <stdbool.h> +#include <stdint.h> + + #define HSAKMTAPI + + typedef uint8_t HSAuint8; + typedef int8_t HSAint8; + typedef uint16_t HSAuint16; + typedef int16_t HSAint16; + typedef uint32_t HSAuint32; + typedef int64_t HSAint64; + typedef uint64_t HSAuint64; + +#endif + +typedef void* HSA_HANDLE; +typedef HSAuint64 HSA_QUEUEID; + +// This is included in order to force the alignments to be 4 bytes so that +// it avoids extra padding added by the compiler when a 64-bit binary is generated. 
+#pragma pack(push, hsakmttypes_h, 4) + +// +// HSA STATUS codes returned by the KFD Interfaces +// + +typedef enum _HSAKMT_STATUS +{ + HSAKMT_STATUS_SUCCESS = 0, // Operation successful + HSAKMT_STATUS_ERROR = 1, // General error return if not otherwise specified + HSAKMT_STATUS_DRIVER_MISMATCH = 2, // User mode component is not compatible with kernel HSA driver + + HSAKMT_STATUS_INVALID_PARAMETER = 3, // KFD identifies input parameters invalid + HSAKMT_STATUS_INVALID_HANDLE = 4, // KFD identifies handle parameter invalid + HSAKMT_STATUS_INVALID_NODE_UNIT = 5, // KFD identifies node or unit parameter invalid + + HSAKMT_STATUS_NO_MEMORY = 6, // No memory available (when allocating queues or memory) + HSAKMT_STATUS_BUFFER_TOO_SMALL = 7, // A buffer needed to handle a request is too small + + HSAKMT_STATUS_NOT_IMPLEMENTED = 10, // KFD function is not implemented for this set of parameters + HSAKMT_STATUS_NOT_SUPPORTED = 11, // KFD function is not supported on this node + HSAKMT_STATUS_UNAVAILABLE = 12, // KFD function is not available currently on this node (but + // may be at a later time) + + HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED = 20, // KFD driver path not opened + HSAKMT_STATUS_KERNEL_COMMUNICATION_ERROR = 21, // user-kernel mode communication failure + HSAKMT_STATUS_KERNEL_ALREADY_OPENED = 22, // KFD driver path already opened + HSAKMT_STATUS_HSAMMU_UNAVAILABLE = 23, // ATS/PRI 1.1 (Address Translation Services) not available + // (IOMMU driver not installed or not-available) + + HSAKMT_STATUS_WAIT_FAILURE = 30, // The wait operation failed + HSAKMT_STATUS_WAIT_TIMEOUT = 31, // The wait operation timed out + + HSAKMT_STATUS_MEMORY_ALREADY_REGISTERED = 35, // Memory buffer already registered + HSAKMT_STATUS_MEMORY_NOT_REGISTERED = 36, // Memory buffer not registered + HSAKMT_STATUS_MEMORY_ALIGNMENT = 37, // Memory parameter not aligned + +} HSAKMT_STATUS; + +// +// HSA KFD interface version information. 
Calling software has to validate that it meets +// the minimum interface version as described in the API specification. +// All future structures will be extended in a backward compatible fashion. +// + +typedef struct _HsaVersionInfo +{ + HSAuint32 KernelInterfaceMajorVersion; // supported kernel interface major version + HSAuint32 KernelInterfaceMinorVersion; // supported kernel interface minor version +} HsaVersionInfo; + +// +// HSA Topology Discovery Infrastructure structure definitions. +// The infrastructure implementation is based on design specified in the Kernel HSA Driver ADD +// The discoverable data is retrieved from ACPI structures in the platform infrastructure, as defined +// in the "Heterogeneous System Architecture Detail Topology" specification. +// +// The following structure is returned on a call to hsaKmtAcquireSystemProperties() as output. +// When the call is made within a process context, a "snapshot" of the topology information +// is taken within the KFD to avoid any changes during the enumeration process. +// The Snapshot is released when hsaKmtReleaseSystemProperties() is called +// or when the process exits or is terminated. +// + +typedef struct _HsaSystemProperties +{ + HSAuint32 NumNodes; // the number of "H-NUMA" memory nodes. + // each node represents a discoverable node of the system + // All other enumeration is done on a per-node basis + + HSAuint32 PlatformOem; // identifies HSA platform, reflects the OEMID in the CRAT + HSAuint32 PlatformId; // HSA platform ID, reflects OEM TableID in the CRAT + HSAuint32 PlatformRev; // HSA platform revision, reflects Platform Table Revision ID +} HsaSystemProperties; + + +typedef union +{ + HSAuint32 Value; + struct + { + unsigned int HotPluggable : 1; // the node may be removed by some system action + // (event will be sent) + unsigned int HSAMMUPresent : 1; // This node has an ATS/PRI 1.1 compatible + // translation agent in the system (e.g. 
IOMMUv2) + unsigned int SharedWithGraphics : 1; // this HSA node's GPU function is also used for OS primary + // graphics render (= UI) + unsigned int QueueSizePowerOfTwo : 1; // This node GPU requires the queue size to be a power of 2 value + unsigned int QueueSize32bit : 1; // This node GPU requires the queue size to be less than 4GB + unsigned int QueueIdleEvent : 1; // This node GPU supports notification on Queue Idle + unsigned int VALimit : 1; // This node GPU has limited VA range for platform + // (typical 40bit). Affects shared VM use for 64bit apps + unsigned int WatchPointsSupported: 1; // Indicates if Watchpoints are available on the node. + unsigned int WatchPointsTotalBits: 4; // ld(Watchpoints) available. To determine the number use 2^value + + unsigned int DoorbellType : 2; // 0: This node has pre-1.0 doorbell characteristic + // 1: This node has 1.0 doorbell characteristic + // 2,3: reserved for future use + unsigned int Reserved : 18; + } ui32; +} HSA_CAPABILITY; + + +// +// HSA node properties. This structure is an output parameter of hsaKmtGetNodeProperties() +// The application or runtime can use the information herein to size the topology management structures +// Unless there is some very weird setup, there is at most one "GPU" device (with a certain number +// of throughput compute units (= SIMDs)) associated with a H-NUMA node. +// + +#define HSA_PUBLIC_NAME_SIZE 128 + +typedef struct _HsaNodeProperties +{ + HSAuint32 NumCPUCores; // # of latency (= CPU) cores present on this HSA node. + // This value is 0 for a HSA node with no such cores, + // e.g a "discrete HSA GPU" + HSAuint32 NumFComputeCores; // # of HSA throughput (= GPU) FCompute cores ("SIMD") present in a node. + // This value is 0 if no FCompute cores are present (e.g. pure "CPU node"). + HSAuint32 NumMemoryBanks; // # of discoverable memory bank affinity properties on this "H-NUMA" node. + HSAuint32 NumCaches; // # of discoverable cache affinity properties on this "H-NUMA" node. 
+ + HSAuint32 NumIOLinks; // # of discoverable IO link affinity properties of this node + // connecting to other nodes. + + HSAuint32 CComputeIdLo; // low value of the logical processor ID of the latency (= CPU) + // cores available on this node + HSAuint32 FComputeIdLo; // low value of the logical processor ID of the throughput (= GPU) + // units available on this node + + HSA_CAPABILITY Capability; // see above + + HSAuint32 MaxWavesPerSIMD; // This identifies the max. number of launched waves per SIMD. + // If NumFComputeCores is 0, this value is ignored. + HSAuint32 LDSSizeInKB; // Size of Local Data Store in Kilobytes per SIMD Wavefront + HSAuint32 GDSSizeInKB; // Size of Global Data Store in Kilobytes shared across SIMD Wavefronts + + HSAuint32 WaveFrontSize; // Number of SIMD cores per wavefront executed, typically 64, + // may be 32 or a different value for some HSA based architectures + + HSAuint32 NumShaderBanks; // Number of Shader Banks or Shader Engines, typical values are 1 or 2 + + + HSAuint32 NumArrays; // Number of SIMD arrays per engine + HSAuint32 NumCUPerArray; // Number of Compute Units (CU) per SIMD array + HSAuint32 NumSIMDPerCU; // Number of SIMD representing a Compute Unit (CU) + + HSAuint32 MaxSlotsScratchCU; // Number of temp. 
memory ("scratch") wave slots available to access, + // may be 0 if HW has no restrictions + + HSAuint32 EngineId; // Identifier (rev) of the GPU uEngine or Firmware, may be 0 + + HSAuint16 VendorId; // GPU vendor id; 0 on latency (= CPU)-only nodes + HSAuint16 DeviceId; // GPU device id; 0 on latency (= CPU)-only nodes + + HSAuint32 LocationId; // GPU BDF (Bus/Device/function number) - identifies the device + // location in the overall system + HSAuint64 LocalMemSize; // Local memory size + HSAuint32 MaxEngineClockMhzFCompute; // maximum engine clocks for CPU and + HSAuint32 MaxEngineClockMhzCCompute; // GPU function, including any boost capabilities + + HSAuint16 MarketingName[HSA_PUBLIC_NAME_SIZE]; // Public name of the "device" on the node (board or APU name). + // Unicode string +} HsaNodeProperties; + + +typedef enum _HSA_HEAPTYPE +{ + HSA_HEAPTYPE_SYSTEM = 0, + HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC = 1, // CPU "visible" part of GPU device local memory (for discrete GPU) + HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE = 2, // CPU "invisible" part of GPU device local memory (for discrete GPU) + // All HSA accessible memory is per definition "CPU visible" + // "Private memory" is relevant for graphics interop only. + HSA_HEAPTYPE_GPU_GDS = 3, // GPU internal memory (GDS) + HSA_HEAPTYPE_GPU_LDS = 4, // GPU internal memory (LDS) + HSA_HEAPTYPE_GPU_SCRATCH = 5, // GPU special memory (scratch) + + HSA_HEAPTYPE_NUMHEAPTYPES, + HSA_HEAPTYPE_SIZE = 0xFFFFFFFF +} HSA_HEAPTYPE; + +typedef union +{ + HSAuint32 MemoryProperty; + struct + { + unsigned int HotPluggable : 1; // the memory may be removed by some system action, + // memory should be used for temporary data + unsigned int NonVolatile : 1; // memory content is preserved across a power-off cycle. + unsigned int Reserved :30; + } ui32; +} HSA_MEMORYPROPERTY; + + +// +// Discoverable HSA Memory properties. 
+// The structure is the output parameter of the hsaKmtGetNodeMemoryProperties() function +// + +typedef struct _HsaMemoryProperties +{ + HSA_HEAPTYPE HeapType; // system or frame buffer, + union + { + HSAuint64 SizeInBytes; // physical memory size of the memory range in bytes + struct + { + HSAuint32 SizeInBytesLow; // physical memory size of the memory range in bytes (lower 32bit) + HSAuint32 SizeInBytesHigh; // physical memory size of the memory range in bytes (higher 32bit) + } ui32; + }; + HSA_MEMORYPROPERTY Flags; // See definitions above + + HSAuint32 Width; // memory width - the number of parallel bits of the memory interface + HSAuint32 MemoryClockMax; // memory clock for the memory, this allows computing the available bandwidth + // to the memory when needed + HSAuint64 VirtualBaseAddress; // if set to value != 0, indicates the virtual base address of the memory + // in process virtual space +} HsaMemoryProperties; + +// +// Discoverable Cache Properties. (optional). +// The structure is the output parameter of the hsaKmtGetNodeCacheProperties() function +// Any of the parameters may be 0 (= not defined) +// + +#define HSA_CPU_SIBLINGS 256 +#define HSA_PROCESSORID_ALL 0xFFFFFFFF + +typedef union +{ + HSAuint32 Value; + struct + { + unsigned int Data : 1; + unsigned int Instruction : 1; + unsigned int CPU : 1; + unsigned int HSACU : 1; + unsigned int Reserved :28; + } ui32; +} HsaCacheType; + +typedef struct _HaCacheProperties +{ + HSAuint32 ProcessorIdLow; // Identifies the processor number + + HSAuint32 CacheLevel; // Integer representing level: 1, 2, 3, 4, etc + HSAuint32 CacheSize; // Size of the cache + HSAuint32 CacheLineSize; // Cache line size in bytes + HSAuint32 CacheLinesPerTag; // Cache lines per Cache Tag + HSAuint32 CacheAssociativity; // Cache Associativity + HSAuint32 CacheLatency; // Cache latency in ns + HsaCacheType CacheType; + HSAuint32 SiblingMap[HSA_CPU_SIBLINGS]; +} HsaCacheProperties; + + +// +// Discoverable CPU Compute 
Properties. (optional). +// The structure is the output parameter of the hsaKmtGetCComputeProperties() function +// Any of the parameters may be 0 (= not defined) +// + +typedef struct _HsaCComputeProperties +{ + HSAuint32 SiblingMap[HSA_CPU_SIBLINGS]; +} HsaCComputeProperties; + +// +// Discoverable IoLink Properties (optional). +// The structure is the output parameter of the hsaKmtGetIoLinkProperties() function. +// Any of the parameters may be 0 (= not defined) +// + +typedef enum _HSA_IOLINKTYPE { + HSA_IOLINKTYPE_UNDEFINED = 0, + HSA_IOLINKTYPE_HYPERTRANSPORT = 1, + HSA_IOLINKTYPE_PCIEXPRESS = 2, + HSA_IOLINKTYPE_AMBA = 3, + HSA_IOLINKTYPE_MIPI = 4, + HSA_IOLINKTYPE_OTHER = 5, + HSA_IOLINKTYPE_NUMIOLINKTYPES, + HSA_IOLINKTYPE_SIZE = 0xFFFFFFFF +} HSA_IOLINKTYPE; + +typedef union +{ + HSAuint32 LinkProperty; + struct + { + unsigned int Override : 1; // bus link properties are determined by this structure + // not by the HSA_IOLINKTYPE. The other flags are valid + // only if this bit is set to one + unsigned int NonCoherent : 1; // The link doesn't support coherent transactions + // memory accesses across must not be set to "host cacheable"! 
+ unsigned int NoAtomics32bit : 1; // The link doesn't support 32bit-wide atomic transactions + unsigned int NoAtomics64bit : 1; // The link doesn't support 64bit-wide atomic transactions + unsigned int Reserved :28; + } ui32; +} HSA_LINKPROPERTY; + + +typedef struct _HsaIoLinkProperties +{ + HSA_IOLINKTYPE IoLinkType; // see above + HSAuint32 VersionMajor; // Bus interface version (optional) + HSAuint32 VersionMinor; // Bus interface version (optional) + + HSAuint32 NodeFrom; // + HSAuint32 NodeTo; // + + HSAuint32 Weight; // weight factor (derived from CDIT) + + HSAuint32 MinimumLatency; // minimum cost of time to transfer (rounded to ns) + HSAuint32 MaximumLatency; // maximum cost of time to transfer (rounded to ns) + HSAuint32 MinimumBandwidth; // minimum interface Bandwidth in MB/s + HSAuint32 MaximumBandwidth; // maximum interface Bandwidth in MB/s + HSAuint32 RecTransferSize; // recommended transfer size to reach maximum bandwidth in Bytes + HSA_LINKPROPERTY Flags; // override flags (may be active for specific platforms) +} HsaIoLinkProperties; + +// +// Memory allocation definitions for the KFD HSA interface +// + +typedef struct _HsaMemFlags +{ + union + { + struct + { + unsigned int NonPaged : 1; // default = 0: pageable memory + unsigned int CachePolicy : 2; // see HSA_CACHING_TYPE + unsigned int ReadOnly : 1; // default = 0: Read/Write memory + unsigned int PageSize : 2; // see HSA_PAGE_SIZE + unsigned int HostAccess : 1; // default = 0: GPU access only + unsigned int NoSubstitute: 1; // default = 0: if specific memory is not available on node (e.g. on + // discrete GPU local), allocation may fall back to system memory node 0 + // memory (= always available). Otherwise no allocation is possible. + unsigned int GDSMemory : 1; // default = 0: If set, the allocation will occur in GDS heap. + // HostAccess must be 0, all other flags (except NoSubstitute) should + // be 0 when setting this entry to 1. GDS allocation may fail due to + // limited resources. 
Application code is required to work without + // any allocated GDS memory using regular memory. + // Allocation fails on any node without GPU function. + unsigned int Scratch : 1; // default = 0: If set, the allocation will occur in GPU "scratch area". + // HostAccess must be 0, all other flags (except NoSubstitute) should be 0 + // when setting this entry to 1. Scratch allocation may fail due to limited + // resources. Application code is required to work without any allocation. + // Allocation fails on any node without GPU function. + unsigned int AtomicAccessFull: 1; // default = 0: If set, the memory will be allocated and mapped to allow + // atomic ops processing. On AMD APU, this will use the ATC path on system + // memory, irrespective of the NonPaged flag setting (= if NonPaged is set, + // the memory is pagelocked but mapped through IOMMUv2 instead of GPUVM). + // All atomic ops must be supported on this memory. + unsigned int AtomicAccessPartial: 1; // default = 0: See above for AtomicAccessFull description, however + // focused on AMD discrete GPU that support PCIe atomics; the memory + // allocation is mapped to allow for PCIe atomics to operate on system + // memory, irrespective of NonPaged set or the presence of an ATC path + // in the system. The atomic operations supported are limited to SWAP, + // CompareAndSwap (CAS) and FetchAdd (this PCIe op allows both atomic + // increment and decrement via 2-complement arithmetic), which are the + // only atomic ops directly supported in PCI Express. + // On AMD APU, setting this flag will allocate the same type of memory + // as AtomicAccessFull, but it will be considered compatible with + // discrete GPU atomic operations access. + unsigned int ExecuteAccess: 1; // default = 0: Identifies if memory is primarily used for data or accessed + // for executable code (e.g. queue memory) by the host CPU or the device. 
+ // Influences the page attribute setting within the allocation + unsigned int Reserved : 19; + + } ui32; + HSAuint32 Value; + }; +} HsaMemFlags; + +typedef enum _HSA_CACHING_TYPE +{ + HSA_CACHING_CACHED = 0, + HSA_CACHING_NONCACHED = 1, + HSA_CACHING_WRITECOMBINED = 2, + HSA_CACHING_RESERVED = 3, + HSA_CACHING_NUM_CACHING, + HSA_CACHING_SIZE = 0xFFFFFFFF +} HSA_CACHING_TYPE; + +typedef enum _HSA_PAGE_SIZE +{ + HSA_PAGE_SIZE_4KB = 0, + HSA_PAGE_SIZE_64KB = 1, //64KB pages, not generally available in systems + HSA_PAGE_SIZE_2MB = 2, + HSA_PAGE_SIZE_1GB = 3, //1GB pages, not generally available in systems +} HSA_PAGE_SIZE; + + +typedef enum _HSA_DEVICE +{ + HSA_DEVICE_CPU = 0, + HSA_DEVICE_GPU = 1, + MAX_HSA_DEVICE = 2 +} HSA_DEVICE; + + +typedef enum _HSA_QUEUE_PRIORITY +{ + HSA_QUEUE_PRIORITY_MINIMUM = -3, + HSA_QUEUE_PRIORITY_LOW = -2, + HSA_QUEUE_PRIORITY_BELOW_NORMAL = -1, + HSA_QUEUE_PRIORITY_NORMAL = 0, + HSA_QUEUE_PRIORITY_ABOVE_NORMAL = 1, + HSA_QUEUE_PRIORITY_HIGH = 2, + HSA_QUEUE_PRIORITY_MAXIMUM = 3, + HSA_QUEUE_PRIORITY_NUM_PRIORITY, + HSA_QUEUE_PRIORITY_SIZE = 0xFFFFFFFF +} HSA_QUEUE_PRIORITY; + +typedef enum _HSA_QUEUE_TYPE +{ + HSA_QUEUE_COMPUTE = 1, // AMD PM4 compatible Compute Queue + HSA_QUEUE_SDMA = 2, // SDMA Queue, used for data transport and format conversion (e.g. (de-)tiling, etc). + HSA_QUEUE_MULTIMEDIA_DECODE = 3, // reserved, for HSA multimedia decode queue + HSA_QUEUE_MULTIMEDIA_ENCODE = 4, // reserved, for HSA multimedia encode queue + + // the following values indicate a queue type permitted to reference OS graphics + // resources through the interoperation API. See [5] "HSA Graphics Interoperation + // specification" for more details on use of such resources. + + HSA_QUEUE_COMPUTE_OS = 11, // AMD PM4 compatible Compute Queue + HSA_QUEUE_SDMA_OS = 12, // SDMA Queue, used for data transport and format conversion (e.g. (de-)tiling, etc). 
+ HSA_QUEUE_MULTIMEDIA_DECODE_OS = 13, // reserved, for HSA multimedia decode queue + HSA_QUEUE_MULTIMEDIA_ENCODE_OS = 14, // reserved, for HSA multimedia encode queue + + HSA_QUEUE_COMPUTE_AQL = 21, // HSA AQL packet compatible Compute Queue + HSA_QUEUE_DMA_AQL = 22, // HSA AQL packet compatible DMA Queue + + // more types in the future + + HSA_QUEUE_TYPE_SIZE = 0xFFFFFFFF //aligns to 32bit enum +} HSA_QUEUE_TYPE; + +typedef struct _HsaQueueResource +{ + HSA_QUEUEID QueueId; /** queue ID */ + /** Doorbell address to notify HW of a new dispatch */ + union + { + HSAuint32* Queue_DoorBell; + HSAuint64* Queue_DoorBell_aql; + HSAuint64 QueueDoorBell; + }; + + /** virtual address to notify HW of queue write ptr value */ + union + { + HSAuint32* Queue_write_ptr; + HSAuint64* Queue_write_ptr_aql; + HSAuint64 QueueWptrValue; + }; + + /** virtual address updated by HW to indicate current read location */ + union + { + HSAuint32* Queue_read_ptr; + HSAuint64* Queue_read_ptr_aql; + HSAuint64 QueueRptrValue; + }; + +} HsaQueueResource; + + +//TEMPORARY structure definition - to be used only on "Triniti + Southern Islands" platform +typedef struct _HsaQueueReport +{ + HSAuint32 VMID; //Required on SI to dispatch IB in primary ring + void* QueueAddress; //virtual address of UM mapped compute ring + HSAuint64 QueueSize; //size of the UM mapped compute ring +} HsaQueueReport; + + + +typedef enum _HSA_DBG_WAVEOP +{ + HSA_DBG_WAVEOP_HALT = 1, //Halts a wavefront + HSA_DBG_WAVEOP_RESUME = 2, //Resumes a wavefront + HSA_DBG_WAVEOP_KILL = 3, //Kills a wavefront + HSA_DBG_WAVEOP_DEBUG = 4, //Causes wavefront to enter debug mode + HSA_DBG_WAVEOP_TRAP = 5, //Causes wavefront to take a trap + HSA_DBG_NUM_WAVEOP = 5, + HSA_DBG_MAX_WAVEOP = 0xFFFFFFFF +} HSA_DBG_WAVEOP; + +typedef enum _HSA_DBG_WAVEMODE +{ + HSA_DBG_WAVEMODE_SINGLE = 0, //send command to a single wave + //Broadcast to all wavefronts of all processes is not supported for HSA user mode + HSA_DBG_WAVEMODE_BROADCAST_PROCESS = 2, 
//send to waves within current process + HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU = 3, //send to waves within current process on CU + HSA_DBG_NUM_WAVEMODE = 3, + HSA_DBG_MAX_WAVEMODE = 0xFFFFFFFF +} HSA_DBG_WAVEMODE; + + +typedef enum _HSA_DBG_WAVEMSG_TYPE +{ + HSA_DBG_WAVEMSG_AUTO = 0, + HSA_DBG_WAVEMSG_USER = 1, + HSA_DBG_WAVEMSG_ERROR = 2, + HSA_DBG_NUM_WAVEMSG, + HSA_DBG_MAX_WAVEMSG = 0xFFFFFFFF +} HSA_DBG_WAVEMSG_TYPE; + +typedef enum _HSA_DBG_WATCH_MODE +{ + HSA_DBG_WATCH_READ = 0, //Read operations only + HSA_DBG_WATCH_NONREAD = 1, //Write or Atomic operations only + HSA_DBG_WATCH_ATOMIC = 2, //Atomic Operations only + HSA_DBG_WATCH_ALL = 3, //Read, Write or Atomic operations + HSA_DBG_WATCH_NUM, + HSA_DBG_WATCH_SIZE = 0xFFFFFFFF +} HSA_DBG_WATCH_MODE; + + +//This structure is hardware specific and may change in the future +typedef struct _HsaDbgWaveMsgAMDGen2 +{ + HSAuint32 Value; + HSAuint32 Reserved2; + +} HsaDbgWaveMsgAMDGen2; + +typedef union _HsaDbgWaveMessageAMD +{ + HsaDbgWaveMsgAMDGen2 WaveMsgInfoGen2; + //for future HsaDbgWaveMsgAMDGen3; +} HsaDbgWaveMessageAMD; + +typedef struct _HsaDbgWaveMessage +{ + void* MemoryVA; // ptr to associated host-accessible data + HsaDbgWaveMessageAMD DbgWaveMsg; +} HsaDbgWaveMessage; + + +// +// HSA sync primitive, Event and HW Exception notification API definitions +// The API functions allow the runtime to define a so-called sync-primitive, a SW object +// combining a user-mode provided "syncvar" and a scheduler event that can be signaled +// through a defined GPU interrupt. A syncvar is a process virtual memory location of +// a certain size that can be accessed by CPU and GPU shader code within the process to set +// and query the content within that memory. The definition of the content is determined by +// the HSA runtime and potentially GPU shader code interfacing with the HSA runtime. +// The syncvar values may be commonly written through an PM4 WRITE_DATA packet in the +// user mode instruction stream. 
+// The OS scheduler event is typically associated and signaled by an interrupt issued by +// the GPU, but other HSA system interrupt conditions from other HW (e.g. IOMMUv2) may be +// surfaced by the KFD by this mechanism, too. +// + +// these are the new definitions for events +typedef enum _HSA_EVENTTYPE +{ + HSA_EVENTTYPE_SIGNAL = 0, //user-mode generated GPU signal + HSA_EVENTTYPE_NODECHANGE = 1, //HSA node change (attach/detach) + HSA_EVENTTYPE_DEVICESTATECHANGE = 2, //HSA device state change( start/stop ) + HSA_EVENTTYPE_HW_EXCEPTION = 3, //GPU shader exception event + HSA_EVENTTYPE_SYSTEM_EVENT = 4, //GPU SYSCALL with parameter info + HSA_EVENTTYPE_DEBUG_EVENT = 5, //GPU signal for debugging + HSA_EVENTTYPE_PROFILE_EVENT = 6, //GPU signal for profiling + HSA_EVENTTYPE_QUEUE_EVENT = 7, //GPU signal queue idle state (EOP pm4) + HSA_EVENTTYPE_MEMORY = 8, //GPU signal for signaling memory access faults and memory subsystem issues + //... + HSA_EVENTTYPE_MAXID, + HSA_EVENTTYPE_TYPE_SIZE = 0xFFFFFFFF +} HSA_EVENTTYPE; + +typedef HSAuint32 HSA_EVENTID; + +// +// Subdefinitions for various event types: Syncvar +// + +typedef struct _HsaSyncVar +{ + union + { + void* UserData; //pointer to user mode data + HSAuint64 UserDataPtrValue; //64bit compatibility of value + } SyncVar; + HSAuint64 SyncVarSize; +} HsaSyncVar; + +// +// Subdefinitions for various event types: NodeChange +// + +typedef enum _HSA_EVENTTYPE_NODECHANGE_FLAGS +{ + HSA_EVENTTYPE_NODECHANGE_ADD = 0, + HSA_EVENTTYPE_NODECHANGE_REMOVE = 1, + HSA_EVENTTYPE_NODECHANGE_SIZE = 0xFFFFFFFF +} HSA_EVENTTYPE_NODECHANGE_FLAGS; + +typedef struct _HsaNodeChange +{ + HSA_EVENTTYPE_NODECHANGE_FLAGS Flags; // HSA node added/removed on the platform +} HsaNodeChange; + +// +// Sub-definitions for various event types: DeviceStateChange +// + +typedef enum _HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS +{ + HSA_EVENTTYPE_DEVICESTATUSCHANGE_START = 0, //device started (and available) + HSA_EVENTTYPE_DEVICESTATUSCHANGE_STOP = 1, 
//device stopped (i.e. unavailable) + HSA_EVENTTYPE_DEVICESTATUSCHANGE_SIZE = 0xFFFFFFFF +} HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS; + +typedef struct _HsaDeviceStateChange +{ + HSAuint32 NodeId; // F-NUMA node that contains the device + HSA_DEVICE Device; // device type: GPU or CPU + HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS Flags; // event flags +} HsaDeviceStateChange; + +// +// Sub-definitions for various event types: Memory exception +// + +typedef enum _HSA_EVENTID_MEMORYFLAGS +{ + HSA_EVENTID_MEMORY_RECOVERABLE = 0, //access fault, recoverable after page adjustment + HSA_EVENTID_MEMORY_FATAL_PROCESS = 1, //memory access requires process context destruction, unrecoverable + HSA_EVENTID_MEMORY_FATAL_VM = 2, //memory access requires all GPU VA context destruction, unrecoverable +} HSA_EVENTID_MEMORYFLAGS; + +typedef struct _HsaAccessAttributeFailure +{ + unsigned int NotPresent : 1; // Page not present or supervisor privilege + unsigned int ReadOnly : 1; // Write access to a read-only page + unsigned int NoExecute : 1; // Execute access to a page marked NX + unsigned int GpuAccess : 1; // Host access only + unsigned int ECC : 1; // ECC failure (if supported by HW) + unsigned int Reserved : 27; // must be 0 +} HsaAccessAttributeFailure; + +// data associated with HSA_EVENTID_MEMORY +typedef struct _HsaMemoryAccessFault +{ + HSAuint32 NodeId; // H-NUMA node that contains the device where the memory access occurred + HSAuint64 VirtualAddress; // virtual address this occurred on + HsaAccessAttributeFailure Failure; // failure attribute + HSA_EVENTID_MEMORYFLAGS Flags; // event flags +} HsaMemoryAccessFault; + +typedef struct _HsaEventData +{ + HSA_EVENTTYPE EventType; //event type + + union + { + // return data associated with HSA_EVENTTYPE_SIGNAL and other events + HsaSyncVar SyncVar; + + // data associated with HSA_EVENTTYPE_NODE_CHANGE + HsaNodeChange NodeChangeState; + + // data associated with HSA_EVENTTYPE_DEVICE_STATE_CHANGE + HsaDeviceStateChange DeviceState; + + 
// data associated with HSA_EVENTTYPE_MEMORY + HsaMemoryAccessFault MemoryAccessFault; + + } EventData; + + // the following data entries are internal to the KFD & thunk itself. + + HSAuint64 HWData1; // internal thunk store for Event data (OsEventHandle) + HSAuint64 HWData2; // internal thunk store for Event data (HWAddress) + HSAuint32 HWData3; // internal thunk store for Event data (HWData) +} HsaEventData; + + +typedef struct _HsaEventDescriptor +{ + HSA_EVENTTYPE EventType; // event type to allocate + HSAuint32 NodeId; // H-NUMA node containing GPU device that is event source + HsaSyncVar SyncVar; // pointer to user mode syncvar data, syncvar->UserDataPtrValue may be NULL +} HsaEventDescriptor; + + +typedef struct _HsaEvent +{ + HSA_EVENTID EventId; + HsaEventData EventData; +} HsaEvent; + +typedef enum _HsaEventTimeout +{ + HSA_EVENTTIMEOUT_IMMEDIATE = 0, + HSA_EVENTTIMEOUT_INFINITE = 0xFFFFFFFF +} HsaEventTimeOut; + +typedef struct _HsaClockCounters +{ + HSAuint64 GPUClockCounter; + HSAuint64 CPUClockCounter; + HSAuint64 SystemClockCounter; + HSAuint64 SystemClockFrequencyHz; +} HsaClockCounters; + +#ifndef DEFINE_GUID +typedef struct _HSA_UUID +{ + HSAuint32 Data1; + HSAuint16 Data2; + HSAuint16 Data3; + HSAuint8 Data4[8]; +} HSA_UUID; + +#define HSA_DEFINE_UUID(name, dw, w1, w2, b1, b2, b3, b4, b5, b6, b7, b8) \ + static const HSA_UUID name = {dw, w1, w2, {b1, b2, b3, b4, b5, b6, b7, b8}} +#else +#define HSA_UUID GUID +#define HSA_DEFINE_UUID DEFINE_GUID +#endif + + +// GUID that identifies the GPU Shader Sequencer (SQ) block +// {B5C396B6-D310-47E4-86FC-5CC3043AF508} +HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_SQ, +0xb5c396b6, 0xd310, 0x47e4, 0x86, 0xfc, 0x5c, 0xc3, 0x4, 0x3a, 0xf5, 0x8); + +// GUID that identifies the GPU Memory Controller (MC) block +// {13900B57-4956-4D98-81D0-68521937F59C} +HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_MC, +0x13900b57, 0x4956, 0x4d98, 0x81, 0xd0, 0x68, 0x52, 0x19, 0x37, 0xf5, 0x9c); + +// GUID that identifies the IOMMUv2 HW device 
+// {80969879-B0F6-4BE6-97F6-6A6300F5101D} +HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_IOMMUV2, +0x80969879, 0xb0f6, 0x4be6, 0x97, 0xf6, 0x6a, 0x63, 0x0, 0xf5, 0x10, 0x1d); + +// GUID that identifies the KFD +// {EA9B5AE1-6C3F-44B3-8954-DAF07565A90A} +HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_KERNEL_DRIVER, +0xea9b5ae1, 0x6c3f, 0x44b3, 0x89, 0x54, 0xda, 0xf0, 0x75, 0x65, 0xa9, 0xa); + +typedef enum _HSA_PROFILE_TYPE +{ + HSA_PROFILE_TYPE_PRIVILEGED_IMMEDIATE = 0, //immediate access counter (KFD access only) + HSA_PROFILE_TYPE_PRIVILEGED_STREAMING = 1, //streaming counter, HW continuously + //writes to memory on updates (KFD access only) + HSA_PROFILE_TYPE_NONPRIV_IMMEDIATE = 2, //user-queue accessible counter + HSA_PROFILE_TYPE_NONPRIV_STREAMING = 3, //user-queue accessible counter + //... + HSA_PROFILE_TYPE_NUM, + + HSA_PROFILE_TYPE_SIZE = 0xFFFFFFFF // In order to align to 32-bit value +} HSA_PROFILE_TYPE; + + +typedef struct _HsaCounterFlags +{ + union + { + struct + { + unsigned int Global : 1; // counter is global + // (not tied to VMID/WAVE/CU, ...) + unsigned int Resettable : 1; // counter can be reset by SW + // (always to 0?) 
+ unsigned int ReadOnly : 1; // counter is read-only + // (but may be reset, if indicated) + unsigned int Stream : 1; // counter has streaming capability + // (after trigger, updates buffer) + unsigned int Reserved : 28; + } ui32; + HSAuint32 Value; + }; +} HsaCounterFlags; + + +typedef struct _HsaCounter +{ + HSA_PROFILE_TYPE Type; // specifies the counter type + HSAuint64 CounterId; // indicates counter register offset + HSAuint32 CounterSizeInBits; // indicates relevant counter bits + HSAuint64 CounterMask; // bitmask for counter value (if applicable) + HsaCounterFlags Flags; // Property flags (see above) + HSAuint32 BlockIndex; // identifies block the counter belongs to, + // value may be 0 to NumBlocks +} HsaCounter; + + +typedef struct _HsaCounterBlockProperties +{ + HSA_UUID BlockId; // specifies the block location + HSAuint32 NumCounters; // How many counters are available? + // (sizes Counters[] array below) + HSAuint32 NumConcurrent; // How many counter slots are available + // in block? + HsaCounter Counters[1]; // Start of counter array + // (NumCounters elements total) +} HsaCounterBlockProperties; + + +typedef struct _HsaCounterProperties +{ + HSAuint32 NumBlocks; // How many profilable block are available? + // (sizes Blocks[] array below) + HSAuint32 NumConcurrent; // How many blocks slots can be queried + // concurrently by HW? + HsaCounterBlockProperties Blocks[1]; // Start of block array + // (NumBlocks elements total) +} HsaCounterProperties; + +typedef HSAuint64 HSATraceId; + +typedef struct _HsaPmcTraceRoot +{ + HSAuint64 TraceBufferMinSizeBytes;// (page aligned) + HSAuint32 NumberOfPasses; + HSATraceId TraceId; +} HsaPmcTraceRoot; + +#pragma pack(pop, hsakmttypes_h) + + +#ifdef __cplusplus +} //extern "C" +#endif + +#endif //_HSAKMTTYPES_H_ diff --git a/hsakmt/kfd_ioctl.h b/hsakmt/kfd_ioctl.h new file mode 100644 index 0000000..d683342 --- /dev/null +++ b/hsakmt/kfd_ioctl.h @@ -0,0 +1,292 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef KFD_IOCTL_H_INCLUDED +#define KFD_IOCTL_H_INCLUDED + +#include <linux/types.h> +#include <linux/ioctl.h> + +#define KFD_IOCTL_MAJOR_VERSION 1 +#define KFD_IOCTL_MINOR_VERSION 1 + +struct kfd_ioctl_get_version_args { + uint32_t major_version; /* from KFD */ + uint32_t minor_version; /* from KFD */ +}; + +/* For kfd_ioctl_create_queue_args.queue_type. 
*/ +#define KFD_IOC_QUEUE_TYPE_COMPUTE 0 +#define KFD_IOC_QUEUE_TYPE_SDMA 1 +#define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL 2 + +#define KFD_MAX_QUEUE_PERCENTAGE 100 +#define KFD_MAX_QUEUE_PRIORITY 15 + +struct kfd_ioctl_create_queue_args { + uint64_t ring_base_address; /* to KFD */ + uint64_t write_pointer_address; /* from KFD */ + uint64_t read_pointer_address; /* from KFD */ + uint64_t doorbell_offset; /* from KFD */ + + uint32_t ring_size; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t queue_type; /* to KFD */ + uint32_t queue_percentage; /* to KFD */ + uint32_t queue_priority; /* to KFD */ + uint32_t queue_id; /* from KFD */ + + uint64_t eop_buffer_address; /* to KFD */ + uint64_t eop_buffer_size; /* to KFD */ + uint64_t ctx_save_restore_address; /* to KFD */ + uint64_t ctx_save_restore_size; /* to KFD */ +}; + +struct kfd_ioctl_destroy_queue_args { + uint32_t queue_id; /* to KFD */ + uint32_t pad; +}; + +struct kfd_ioctl_update_queue_args { + uint64_t ring_base_address; /* to KFD */ + + uint32_t queue_id; /* to KFD */ + uint32_t ring_size; /* to KFD */ + uint32_t queue_percentage; /* to KFD */ + uint32_t queue_priority; /* to KFD */ +}; + +/* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ +#define KFD_IOC_CACHE_POLICY_COHERENT 0 +#define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 + +struct kfd_ioctl_set_memory_policy_args { + uint64_t alternate_aperture_base; /* to KFD */ + uint64_t alternate_aperture_size; /* to KFD */ + + uint32_t gpu_id; /* to KFD */ + uint32_t default_policy; /* to KFD */ + uint32_t alternate_policy; /* to KFD */ + uint32_t pad; +}; + +/* + * All counters are monotonic. They are used for profiling of compute jobs. + * The profiling is done by userspace. + * + * In case of GPU reset, the counter should not be affected. 
+ */ + +struct kfd_ioctl_get_clock_counters_args { + uint64_t gpu_clock_counter; /* from KFD */ + uint64_t cpu_clock_counter; /* from KFD */ + uint64_t system_clock_counter; /* from KFD */ + uint64_t system_clock_freq; /* from KFD */ + + uint32_t gpu_id; /* to KFD */ + uint32_t pad; +}; + +#define NUM_OF_SUPPORTED_GPUS 7 + +struct kfd_process_device_apertures { + uint64_t lds_base; /* from KFD */ + uint64_t lds_limit; /* from KFD */ + uint64_t scratch_base; /* from KFD */ + uint64_t scratch_limit; /* from KFD */ + uint64_t gpuvm_base; /* from KFD */ + uint64_t gpuvm_limit; /* from KFD */ + uint32_t gpu_id; /* from KFD */ + uint32_t pad; +}; + +struct kfd_ioctl_get_process_apertures_args { + struct kfd_process_device_apertures + process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */ + + /* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */ + uint32_t num_of_nodes; + uint32_t pad; +}; + +#define MAX_ALLOWED_NUM_POINTS 100 +#define MAX_ALLOWED_AW_BUFF_SIZE 4096 +#define MAX_ALLOWED_WAC_BUFF_SIZE 128 + +struct kfd_ioctl_dbg_register_args { + uint32_t gpu_id; /* to KFD */ + uint32_t pad; +}; + +struct kfd_ioctl_dbg_unregister_args { + uint32_t gpu_id; /* to KFD */ + uint32_t pad; +}; + +struct kfd_ioctl_dbg_address_watch_args { + uint64_t content_ptr; /* a pointer to the actual content */ + uint32_t gpu_id; /* to KFD */ + uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ +}; + +struct kfd_ioctl_dbg_wave_control_args { + uint64_t content_ptr; /* a pointer to the actual content */ + uint32_t gpu_id; /* to KFD */ + uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ +}; + +/* Matching HSA_EVENTTYPE */ +#define KFD_IOC_EVENT_SIGNAL 0 +#define KFD_IOC_EVENT_NODECHANGE 1 +#define KFD_IOC_EVENT_DEVICESTATECHANGE 2 +#define KFD_IOC_EVENT_HW_EXCEPTION 3 +#define KFD_IOC_EVENT_SYSTEM_EVENT 4 +#define KFD_IOC_EVENT_DEBUG_EVENT 5 +#define KFD_IOC_EVENT_PROFILE_EVENT 6 +#define KFD_IOC_EVENT_QUEUE_EVENT 7 +#define KFD_IOC_EVENT_MEMORY 8 + 
+#define KFD_IOC_WAIT_RESULT_COMPLETE 0 +#define KFD_IOC_WAIT_RESULT_TIMEOUT 1 +#define KFD_IOC_WAIT_RESULT_FAIL 2 + +#define KFD_SIGNAL_EVENT_LIMIT 256 + +struct kfd_ioctl_create_event_args { + uint64_t event_page_offset; /* from KFD */ + uint32_t event_trigger_data; /* from KFD - signal events only */ + uint32_t event_type; /* to KFD */ + uint32_t auto_reset; /* to KFD */ + uint32_t node_id; /* to KFD - only valid for certain + event types */ + uint32_t event_id; /* from KFD */ + uint32_t event_slot_index; /* from KFD */ +}; + +struct kfd_ioctl_destroy_event_args { + uint32_t event_id; /* to KFD */ + uint32_t pad; +}; + +struct kfd_ioctl_set_event_args { + uint32_t event_id; /* to KFD */ + uint32_t pad; +}; + +struct kfd_ioctl_reset_event_args { + uint32_t event_id; /* to KFD */ + uint32_t pad; +}; + +struct kfd_memory_exception_failure { + uint32_t NotPresent; /* Page not present or supervisor privilege */ + uint32_t ReadOnly; /* Write access to a read-only page */ + uint32_t NoExecute; /* Execute access to a page marked NX */ + uint32_t pad; +}; + +/* memory exception data*/ +struct kfd_hsa_memory_exception_data { + struct kfd_memory_exception_failure failure; + uint64_t va; + uint32_t gpu_id; + uint32_t pad; +}; + +/* Event data*/ +struct kfd_event_data { + union { + struct kfd_hsa_memory_exception_data memory_exception_data; + }; /* From KFD */ + uint64_t kfd_event_data_ext; /* pointer to an extension structure + for future exception types */ + uint32_t event_id; /* to KFD */ + uint32_t pad; +}; + +struct kfd_ioctl_wait_events_args { + uint64_t events_ptr; /* pointed to struct + kfd_event_data array, to KFD */ + uint32_t num_events; /* to KFD */ + uint32_t wait_for_all; /* to KFD */ + uint32_t timeout; /* to KFD */ + uint32_t wait_result; /* from KFD */ +}; + +#define AMDKFD_IOCTL_BASE 'K' +#define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr) +#define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type) +#define AMDKFD_IOW(nr, type) _IOW(AMDKFD_IOCTL_BASE, nr, 
type) +#define AMDKFD_IOWR(nr, type) _IOWR(AMDKFD_IOCTL_BASE, nr, type) + +#define AMDKFD_IOC_GET_VERSION \ + AMDKFD_IOR(0x01, struct kfd_ioctl_get_version_args) + +#define AMDKFD_IOC_CREATE_QUEUE \ + AMDKFD_IOWR(0x02, struct kfd_ioctl_create_queue_args) + +#define AMDKFD_IOC_DESTROY_QUEUE \ + AMDKFD_IOWR(0x03, struct kfd_ioctl_destroy_queue_args) + +#define AMDKFD_IOC_SET_MEMORY_POLICY \ + AMDKFD_IOW(0x04, struct kfd_ioctl_set_memory_policy_args) + +#define AMDKFD_IOC_GET_CLOCK_COUNTERS \ + AMDKFD_IOWR(0x05, struct kfd_ioctl_get_clock_counters_args) + +#define AMDKFD_IOC_GET_PROCESS_APERTURES \ + AMDKFD_IOR(0x06, struct kfd_ioctl_get_process_apertures_args) + +#define AMDKFD_IOC_UPDATE_QUEUE \ + AMDKFD_IOW(0x07, struct kfd_ioctl_update_queue_args) + +#define AMDKFD_IOC_CREATE_EVENT \ + AMDKFD_IOWR(0x08, struct kfd_ioctl_create_event_args) + +#define AMDKFD_IOC_DESTROY_EVENT \ + AMDKFD_IOW(0x09, struct kfd_ioctl_destroy_event_args) + +#define AMDKFD_IOC_SET_EVENT \ + AMDKFD_IOW(0x0A, struct kfd_ioctl_set_event_args) + +#define AMDKFD_IOC_RESET_EVENT \ + AMDKFD_IOW(0x0B, struct kfd_ioctl_reset_event_args) + +#define AMDKFD_IOC_WAIT_EVENTS \ + AMDKFD_IOWR(0x0C, struct kfd_ioctl_wait_events_args) + +#define AMDKFD_IOC_DBG_REGISTER \ + AMDKFD_IOW(0x0D, struct kfd_ioctl_dbg_register_args) + +#define AMDKFD_IOC_DBG_UNREGISTER \ + AMDKFD_IOW(0x0E, struct kfd_ioctl_dbg_unregister_args) + +#define AMDKFD_IOC_DBG_ADDRESS_WATCH \ + AMDKFD_IOW(0x0F, struct kfd_ioctl_dbg_address_watch_args) + +#define AMDKFD_IOC_DBG_WAVE_CONTROL \ + AMDKFD_IOW(0x10, struct kfd_ioctl_dbg_wave_control_args) + +#define AMDKFD_COMMAND_START 0x01 +#define AMDKFD_COMMAND_END 0x11 + +#endif diff --git a/hsakmt/libhsakmt.c b/hsakmt/libhsakmt.c new file mode 100644 index 0000000..d7f79d3 --- /dev/null +++ b/hsakmt/libhsakmt.c @@ -0,0 +1,18 @@ +#include <errno.h> +#include <sys/ioctl.h> + +#include "libhsakmt.h" + +/** + * Call ioctl, restarting if it is interrupted + */ +int +kmtIoctl(int fd, unsigned 
long request, void *arg) +{ + int ret; + + do { + ret = ioctl(fd, request, arg); + } while (ret == -1 && (errno == EINTR || errno == EAGAIN)); + return ret; +} diff --git a/hsakmt/libhsakmt.h b/hsakmt/libhsakmt.h new file mode 100644 index 0000000..0d73c8f --- /dev/null +++ b/hsakmt/libhsakmt.h @@ -0,0 +1,76 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef LIBHSAKMT_H_INCLUDED +#define LIBHSAKMT_H_INCLUDED + +#include "hsakmt.h" +#include <pthread.h> +#include <stdint.h> +#include <limits.h> + +extern int kfd_fd; +extern unsigned long kfd_open_count; +extern pthread_mutex_t hsakmt_mutex; + +#undef HSAKMTAPI +#define HSAKMTAPI __attribute__((visibility ("default"))) + +/*Avoid pointer-to-int-cast warning*/ +#define PORT_VPTR_TO_UINT64(vptr) ((uint64_t)(unsigned long)(vptr)) + +/*Avoid int-to-pointer-cast warning*/ +#define PORT_UINT64_TO_VPTR(v) ((void*)(unsigned long)(v)) + +#define CHECK_KFD_OPEN() \ + do { if (kfd_open_count == 0) return HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; } while (0) + +#define PAGE_SIZE 4096 + +#define CHECK_PAGE_MULTIPLE(x) \ + do { if ((uint64_t)PORT_VPTR_TO_UINT64(x) % PAGE_SIZE) return HSAKMT_STATUS_INVALID_PARAMETER; } while(0) + +#define PAGE_ALIGN_UP(x) (((uint64_t)(x) + PAGE_SIZE - 1) & ~(uint64_t)(PAGE_SIZE-1)) +#define BITMASK(n) (((n) < sizeof(1ULL) * CHAR_BIT ? (1ULL << (n)) : 0) - 1ULL) + +/* + * Even though the topology code doesn't limit us to maximum number of nodes, + * the current HSA spec says the maximum is 8 nodes + */ +#define MAX_NODES 8 + +HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id); +HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id); +uint16_t get_device_id_by_node(HSAuint32 node_id); + +extern int kmtIoctl(int fd, unsigned long request, void *arg); + +/* Void pointer arithmetic (or remove -Wpointer-arith to allow void pointers arithmetic) */ +#define VOID_PTR_ADD32(ptr,n) (void*)((uint32_t*)(ptr) + n)/*ptr + offset*/ +#define VOID_PTR_ADD(ptr,n) (void*)((uint8_t*)(ptr) + n)/*ptr + offset*/ +#define VOID_PTR_SUB(ptr,n) (void*)((uint8_t*)(ptr) - n)/*ptr - offset*/ +#define VOID_PTRS_SUB(ptr1,ptr2) (uint64_t)((uint8_t*)(ptr1) - (uint8_t*)(ptr2)) /*ptr1 - ptr2*/ + +#endif diff --git a/hsakmt/libhsakmt.ver b/hsakmt/libhsakmt.ver new file mode 100644 index 0000000..9c6e6cb --- /dev/null +++ b/hsakmt/libhsakmt.ver @@ 
-0,0 +1,46 @@ +HSAKMT_1 +{ +global: +hsaKmtOpenKFD; +hsaKmtCloseKFD; +hsaKmtGetVersion; +hsaKmtAcquireSystemProperties; +hsaKmtReleaseSystemProperties; +hsaKmtGetNodeProperties; +hsaKmtGetNodeMemoryProperties; +hsaKmtGetNodeCacheProperties; +hsaKmtGetNodeIoLinkProperties; +hsaKmtCreateEvent; +hsaKmtDestroyEvent; +hsaKmtSetEvent; +hsaKmtResetEvent; +hsaKmtQueryEventState; +hsaKmtWaitOnEvent; +hsaKmtWaitOnMultipleEvents; +hsaKmtCreateQueue; +hsaKmtUpdateQueue; +hsaKmtDestroyQueue; +hsaKmtSetMemoryPolicy; +hsaKmtAllocMemory; +hsaKmtFreeMemory; +hsaKmtRegisterMemory; +hsaKmtDeregisterMemory; +hsaKmtMapMemoryToGPU; +hsaKmtUnmapMemoryToGPU; +hsaKmtDbgRegister; +hsaKmtDbgUnregister; +hsaKmtDbgWavefrontControl; +hsaKmtDbgAddressWatch; +hsaKmtGetClockCounters; +hsaKmtPmcGetCounterProperties; +hsaKmtPmcRegisterTrace; +hsaKmtPmcUnregisterTrace; +hsaKmtPmcAcquireTraceAccess; +hsaKmtPmcReleaseTraceAccess; +hsaKmtPmcStartTrace; +hsaKmtPmcQueryTrace; +hsaKmtPmcStopTrace; + +local: *; +}; + diff --git a/hsakmt/memory.c b/hsakmt/memory.c new file mode 100644 index 0000000..718dd97 --- /dev/null +++ b/hsakmt/memory.c @@ -0,0 +1,204 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "libhsakmt.h" +#include "linux/kfd_ioctl.h" +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include <sys/types.h> +#include <sys/mman.h> +#include <fcntl.h> +#include "fmm.h" + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtSetMemoryPolicy( + HSAuint32 Node, + HSAuint32 DefaultPolicy, + HSAuint32 AlternatePolicy, + void* MemoryAddressAlternate, + HSAuint64 MemorySizeInBytes + ) +{ + HSAKMT_STATUS result; + uint32_t gpu_id; + + CHECK_KFD_OPEN(); + + result = validate_nodeid(Node, &gpu_id); + if (result != HSAKMT_STATUS_SUCCESS) + return result; + + // We accept any legal policy and alternate address location. You get CC everywhere anyway. + if ((DefaultPolicy != HSA_CACHING_CACHED && DefaultPolicy != HSA_CACHING_NONCACHED) + || (AlternatePolicy != HSA_CACHING_CACHED && AlternatePolicy != HSA_CACHING_NONCACHED)) + { + return HSAKMT_STATUS_INVALID_PARAMETER; + } + + CHECK_PAGE_MULTIPLE(MemoryAddressAlternate); + CHECK_PAGE_MULTIPLE(MemorySizeInBytes); + + struct kfd_ioctl_set_memory_policy_args args; + memset(&args, 0, sizeof(args)); + + args.gpu_id = gpu_id; + args.default_policy = (DefaultPolicy == HSA_CACHING_CACHED) ? KFD_IOC_CACHE_POLICY_COHERENT : KFD_IOC_CACHE_POLICY_NONCOHERENT; + args.alternate_policy = (AlternatePolicy == HSA_CACHING_CACHED) ? 
KFD_IOC_CACHE_POLICY_COHERENT : KFD_IOC_CACHE_POLICY_NONCOHERENT; + args.alternate_aperture_base = (uintptr_t)MemoryAddressAlternate; + args.alternate_aperture_size = MemorySizeInBytes; + + int err = kmtIoctl(kfd_fd, AMDKFD_IOC_SET_MEMORY_POLICY, &args); + + return (err == -1) ? HSAKMT_STATUS_ERROR : HSAKMT_STATUS_SUCCESS; +} + +static HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags) +{ + switch (pageSizeFlags) + { + case HSA_PAGE_SIZE_4KB: return 4*1024; + case HSA_PAGE_SIZE_64KB: return 64*1024; + case HSA_PAGE_SIZE_2MB: return 2*1024*1024; + case HSA_PAGE_SIZE_1GB: return 1024*1024*1024; + default: assert(false); return 4*1024; + } +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtAllocMemory( + HSAuint32 PreferredNode, //IN + HSAuint64 SizeInBytes, //IN (multiple of page size) + HsaMemFlags MemFlags, //IN + void** MemoryAddress //OUT (page-aligned) + ) +{ + CHECK_KFD_OPEN(); + HSAKMT_STATUS result; + uint32_t gpu_id; + int err; + + result = validate_nodeid(PreferredNode, &gpu_id); + if (result != HSAKMT_STATUS_SUCCESS) + return result; + + // The required size should be page aligned (GDS?) 
+ HSAuint64 page_size = PageSizeFromFlags(MemFlags.ui32.PageSize); + if ((SizeInBytes & (page_size-1)) && !MemFlags.ui32.GDSMemory){ + return HSAKMT_STATUS_INVALID_PARAMETER; + } + + if (MemFlags.ui32.HostAccess && !MemFlags.ui32.NonPaged) { + err = posix_memalign(MemoryAddress, page_size, SizeInBytes); + if (err != 0) + return HSAKMT_STATUS_NO_MEMORY; + if (MemFlags.ui32.ExecuteAccess) { + err = mprotect(*MemoryAddress, SizeInBytes, PROT_READ | PROT_WRITE | PROT_EXEC); + if (err != 0) { + free(*MemoryAddress); + return err; + } + } + return HSAKMT_STATUS_SUCCESS; + } + + return HSAKMT_STATUS_INVALID_PARAMETER; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtFreeMemory( + void* MemoryAddress, //IN (page-aligned) + HSAuint64 SizeInBytes //IN + ) +{ + HSAKMT_STATUS hsa_status = HSAKMT_STATUS_SUCCESS; + CHECK_KFD_OPEN(); + + if (fmm_is_inside_some_aperture(MemoryAddress)){ + if (fmm_release( MemoryAddress, SizeInBytes)) + hsa_status = HSAKMT_STATUS_INVALID_PARAMETER; + } + else + free(MemoryAddress); + + return hsa_status; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtRegisterMemory( + void* MemoryAddress, //IN (page-aligned) + HSAuint64 MemorySizeInBytes //IN (page-aligned) + ) +{ + CHECK_KFD_OPEN(); + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDeregisterMemory( + void* MemoryAddress //IN + ) +{ + CHECK_KFD_OPEN(); + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtMapMemoryToGPU( + void* MemoryAddress, //IN (page-aligned) + HSAuint64 MemorySizeInBytes, //IN (page-aligned) + HSAuint64* AlternateVAGPU //OUT (page-aligned) + ) +{ + CHECK_KFD_OPEN(); + + // We don't support GPUVM in the stub, there should never be a request for a GPUVA. 
+ if (AlternateVAGPU) + { + *AlternateVAGPU = 0; + } + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtUnmapMemoryToGPU( + void* MemoryAddress //IN (page-aligned) + ) +{ + CHECK_KFD_OPEN(); + + return HSAKMT_STATUS_SUCCESS; +} diff --git a/hsakmt/openclose.c b/hsakmt/openclose.c new file mode 100644 index 0000000..d5b91e2 --- /dev/null +++ b/hsakmt/openclose.c @@ -0,0 +1,112 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include "libhsakmt.h" + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <fcntl.h> +#include <unistd.h> +#include "fmm.h" + +static const char kfd_device_name[] = "/dev/kfd"; +static const char tmp_file[] = "/var/lock/.amd_hsa_thunk_lock"; +int amd_hsa_thunk_lock_fd = 0; + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtOpenKFD(void) +{ + HSAKMT_STATUS result; + + pthread_mutex_lock(&hsakmt_mutex); + + if (kfd_open_count == 0) + { + int fd = open(kfd_device_name, O_RDWR | O_CLOEXEC); + + if (fd != -1) + { + kfd_fd = fd; + kfd_open_count = 1; + + result = fmm_init_process_apertures(); + if (result != HSAKMT_STATUS_SUCCESS) + close(fd); + } + else + { + result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; + } + + amd_hsa_thunk_lock_fd = open(tmp_file, + O_CREAT | //create the file if it's not present. + O_RDWR, //only need write access for the internal locking semantics. + S_IRUSR | S_IWUSR); //permissions on the file, 600 here. + } + else + { + kfd_open_count++; + result = HSAKMT_STATUS_SUCCESS; + } + + pthread_mutex_unlock(&hsakmt_mutex); + + return result; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtCloseKFD(void) +{ + HSAKMT_STATUS result; + + pthread_mutex_lock(&hsakmt_mutex); + + if (kfd_open_count > 0) + { + if (--kfd_open_count == 0) + { + close(kfd_fd); + + if (amd_hsa_thunk_lock_fd > 0) { + close(amd_hsa_thunk_lock_fd); + unlink(tmp_file); + } + + } + + result = HSAKMT_STATUS_SUCCESS; + } + else + { + result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; + } + + pthread_mutex_unlock(&hsakmt_mutex); + + return result; +} diff --git a/hsakmt/perfctr.c b/hsakmt/perfctr.c new file mode 100644 index 0000000..64ab168 --- /dev/null +++ b/hsakmt/perfctr.c @@ -0,0 +1,370 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include <stdlib.h> +#include "libhsakmt.h" +#include "pmc_table.h" +#include "linux/kfd_ioctl.h" +#include <unistd.h> + +#define BITS_PER_BYTE CHAR_BIT + +#define HSA_PERF_MAGIC4CC 0x54415348 + +enum perf_trace_state { + PERF_TRACE_STATE__STOPPED = 0, + PERF_TRACE_STATE__STARTED +}; + +struct perf_trace { + uint32_t magic4cc; + uint32_t gpu_id; + enum perf_trace_state state; +}; + +extern int amd_hsa_thunk_lock_fd; + +static HsaCounterProperties *counter_props[MAX_NODES] = {NULL}; + +static int blockid2uuid(enum perf_block_id block_id, HSA_UUID *uuid) +{ + int rc = 0; + switch (block_id) { + case PERFCOUNTER_BLOCKID__SQ: + *uuid = HSA_PROFILEBLOCK_AMD_SQ; + break; + default: + /* If we reach this point, it's a bug */ + rc = -1; + } + + return rc; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcGetCounterProperties( + HSAuint32 NodeId, //IN + HsaCounterProperties** CounterProperties //OUT + ) +{ + HSAKMT_STATUS rc = HSAKMT_STATUS_SUCCESS; + uint32_t gpu_id, i, block_id; + uint16_t dev_id; + uint32_t counter_props_size = 0; + uint32_t total_counters = 0; + uint32_t total_concurrent = 0; + struct perf_counter_block block = {0}; + + if (CounterProperties == NULL) + return HSAKMT_STATUS_INVALID_PARAMETER; + + if (validate_nodeid(NodeId, &gpu_id) != 0) + return HSAKMT_STATUS_INVALID_NODE_UNIT; + + + + if (counter_props[NodeId] == NULL) { + dev_id = get_device_id_by_node(NodeId); + for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) { + rc = get_block_properties(dev_id, i, &block); + if (rc != HSAKMT_STATUS_SUCCESS) + return rc; + total_concurrent += block.num_of_slots; + total_counters += block.num_of_counters; + } + + counter_props_size = sizeof(HsaCounterProperties) + + sizeof(HsaCounterBlockProperties)*(PERFCOUNTER_BLOCKID__MAX-1) + + sizeof(HsaCounter)*(total_counters-1); + + counter_props[NodeId] = malloc(counter_props_size); + + if (counter_props[NodeId] == NULL) + return HSAKMT_STATUS_NO_MEMORY; + + counter_props[NodeId]->NumBlocks = PERFCOUNTER_BLOCKID__MAX; + 
counter_props[NodeId]->NumConcurrent = total_concurrent; + + for (block_id = 0; block_id < PERFCOUNTER_BLOCKID__MAX; block_id++) + { + rc = get_block_properties(dev_id, block_id, &block); + if (rc != HSAKMT_STATUS_SUCCESS) { + free(counter_props[NodeId]); + return rc; + } + + /* Filling the SQ block */ + blockid2uuid(block_id, &counter_props[NodeId]->Blocks[block_id].BlockId); + counter_props[NodeId]->Blocks[block_id].NumCounters = block.num_of_counters; + counter_props[NodeId]->Blocks[block_id].NumConcurrent = block.num_of_slots; + + for (i = 0; i < block.num_of_counters; i++) { + counter_props[NodeId]->Blocks[block_id].Counters[i].BlockIndex = block_id; + counter_props[NodeId]->Blocks[block_id].Counters[i].CounterId = block.counter_ids[i]; + counter_props[NodeId]->Blocks[block_id].Counters[i].CounterSizeInBits = block.counter_size_in_bits; + counter_props[NodeId]->Blocks[block_id].Counters[i].CounterMask = block.counter_mask; + counter_props[NodeId]->Blocks[block_id].Counters[i].Flags.ui32.Global = 1; + counter_props[NodeId]->Blocks[block_id].Counters[i].Type = HSA_PROFILE_TYPE_NONPRIV_IMMEDIATE; + } + } + } + + *CounterProperties = counter_props[NodeId]; + + return HSAKMT_STATUS_SUCCESS; +} + +/** + Registers a set of (HW) counters to be used for tracing/profiling +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcRegisterTrace( + HSAuint32 NodeId, //IN + HSAuint32 NumberOfCounters, //IN + HsaCounter* Counters, //IN + HsaPmcTraceRoot* TraceRoot //OUT + ) +{ + uint32_t gpu_id, i; + uint64_t min_buf_size = 0; + uint32_t concurrent_counters[PERFCOUNTER_BLOCKID__MAX] = {0}; + struct perf_trace *trace = NULL; + + if (Counters == NULL || TraceRoot == NULL || NumberOfCounters == 0) + return HSAKMT_STATUS_INVALID_PARAMETER; + + if (validate_nodeid(NodeId, &gpu_id) != 0) + return HSAKMT_STATUS_INVALID_NODE_UNIT; + + /* Calculating the minimum buffer size */ + for (i = 0; i < NumberOfCounters; i++) { + if (Counters[i].BlockIndex >= PERFCOUNTER_BLOCKID__MAX) + return 
HSAKMT_STATUS_INVALID_PARAMETER; + min_buf_size += Counters[i].CounterSizeInBits/BITS_PER_BYTE; + concurrent_counters[Counters[i].BlockIndex]++; + } + + /* Verifying that the number of counters per block is not larger than the amount of slots */ + if (concurrent_counters[PERFCOUNTER_BLOCKID__SQ] > counter_props[NodeId]->Blocks[PERFCOUNTER_BLOCKID__SQ].NumConcurrent) + return HSAKMT_STATUS_INVALID_PARAMETER; + + trace = malloc(sizeof(trace)); + if (trace == NULL) + return HSAKMT_STATUS_NO_MEMORY; + + trace->magic4cc = HSA_PERF_MAGIC4CC; + trace->gpu_id = gpu_id; + trace->state = PERF_TRACE_STATE__STOPPED; + + TraceRoot->NumberOfPasses = 1; + TraceRoot->TraceBufferMinSizeBytes = PAGE_ALIGN_UP(min_buf_size); + TraceRoot->TraceId = PORT_VPTR_TO_UINT64(trace); + + return HSAKMT_STATUS_SUCCESS; +} + +/** + Unregisters a set of (HW) counters used for tracing/profiling +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcUnregisterTrace( + HSAuint32 NodeId, //IN + HSATraceId TraceId //IN + ) +{ + uint32_t gpu_id; + struct perf_trace *trace; + + if (TraceId == 0) + return HSAKMT_STATUS_INVALID_PARAMETER; + + if (validate_nodeid(NodeId, &gpu_id) != 0) + return HSAKMT_STATUS_INVALID_NODE_UNIT; + + trace = (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId); + + if (trace->magic4cc != HSA_PERF_MAGIC4CC) + return HSAKMT_STATUS_INVALID_HANDLE; + + if (trace->gpu_id != gpu_id) + return HSAKMT_STATUS_INVALID_NODE_UNIT; + + /* If the trace is in the running state, stop it */ + if (trace->state == PERF_TRACE_STATE__STARTED) { + HSAKMT_STATUS status = hsaKmtPmcStopTrace(TraceId); + if (status != HSAKMT_STATUS_SUCCESS) + return status; + } + + free(trace); + + return HSAKMT_STATUS_SUCCESS; +} + + +/** + Allows a user mode process to get exclusive access to the defined set of (HW) counters + used for tracing/profiling +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcAcquireTraceAccess( + HSAuint32 NodeId, //IN + HSATraceId TraceId //IN + ) +{ + struct perf_trace *trace; + + if (TraceId == 0) + return 
HSAKMT_STATUS_INVALID_PARAMETER; + + trace = (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId); + + if (trace->magic4cc != HSA_PERF_MAGIC4CC) + return HSAKMT_STATUS_INVALID_HANDLE; + + if (amd_hsa_thunk_lock_fd > 0) { + if (lockf( amd_hsa_thunk_lock_fd, F_TLOCK, 0 ) != 0) + return HSAKMT_STATUS_ERROR; + else + return HSAKMT_STATUS_SUCCESS; + } + else { + return HSAKMT_STATUS_ERROR; + } +} + + +/** + Allows a user mode process to release exclusive access to the defined set of (HW) counters + used for tracing/profiling +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcReleaseTraceAccess( + HSAuint32 NodeId, //IN + HSATraceId TraceId //IN + ) +{ + struct perf_trace *trace; + + if (TraceId == 0) + return HSAKMT_STATUS_INVALID_PARAMETER; + + trace = (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId); + + if (trace->magic4cc != HSA_PERF_MAGIC4CC) + return HSAKMT_STATUS_INVALID_HANDLE; + + if (amd_hsa_thunk_lock_fd > 0) { + if (lockf( amd_hsa_thunk_lock_fd, F_ULOCK, 0 ) != 0) + return HSAKMT_STATUS_ERROR; + else + return HSAKMT_STATUS_SUCCESS; + } + else { + return HSAKMT_STATUS_ERROR; + } + +} + + +/** + Starts tracing operation on a previously established set of performance counters +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcStartTrace( + HSATraceId TraceId, //IN + void* TraceBuffer, //IN (page aligned) + HSAuint64 TraceBufferSizeBytes //IN (page aligned) + ) +{ + struct perf_trace *trace = (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId); + + if (TraceId == 0 || TraceBuffer == NULL || TraceBufferSizeBytes == 0) + return HSAKMT_STATUS_INVALID_PARAMETER; + + if (trace->magic4cc != HSA_PERF_MAGIC4CC) + return HSAKMT_STATUS_INVALID_HANDLE; + + trace->state = PERF_TRACE_STATE__STARTED; + + return HSAKMT_STATUS_SUCCESS; +} + + +/** + Forces an update of all the counters that a previously started trace operation has registered +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcQueryTrace( + HSATraceId TraceId //IN + ) +{ + struct perf_trace *trace = (struct perf_trace 
*)PORT_UINT64_TO_VPTR(TraceId); + + if (TraceId == 0) + return HSAKMT_STATUS_INVALID_PARAMETER; + + if (trace->magic4cc != HSA_PERF_MAGIC4CC) + return HSAKMT_STATUS_INVALID_HANDLE; + + return HSAKMT_STATUS_SUCCESS; +} + + +/** + Stops tracing operation on a previously established set of performance counters +*/ + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtPmcStopTrace( + HSATraceId TraceId //IN + ) +{ + struct perf_trace *trace = (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId); + + if (TraceId == 0) + return HSAKMT_STATUS_INVALID_PARAMETER; + + if (trace->magic4cc != HSA_PERF_MAGIC4CC) + return HSAKMT_STATUS_INVALID_HANDLE; + + trace->state = PERF_TRACE_STATE__STOPPED; + + return HSAKMT_STATUS_SUCCESS; +} diff --git a/hsakmt/pmc_table.c b/hsakmt/pmc_table.c new file mode 100644 index 0000000..0390639 --- /dev/null +++ b/hsakmt/pmc_table.c @@ -0,0 +1,134 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "libhsakmt.h" +#include "pmc_table.h" + + +static uint32_t kaveri_sq_counter_ids[] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, + 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, + 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, + 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, + 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, + 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 168, 169, 170, + 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, + 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, + 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, + 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, + 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250 +}; + +static uint32_t carrizo_sq_counter_ids[] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, + 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, + 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, + 102, 103, 104, 
105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, + 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, + 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 168, 169, 170, + 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, + 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, + 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, + 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, + 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250 +}; + +static struct perf_counter_block kaveri_blocks[PERFCOUNTER_BLOCKID__MAX] = { + [PERFCOUNTER_BLOCKID__SQ] = { + .num_of_slots = 16, + .num_of_counters = sizeof(kaveri_sq_counter_ids)/sizeof(*kaveri_sq_counter_ids), + .counter_ids = kaveri_sq_counter_ids, + .counter_size_in_bits = 64, + .counter_mask = BITMASK(64) + }, +}; + +static struct perf_counter_block carrizo_blocks[PERFCOUNTER_BLOCKID__MAX] = { + [PERFCOUNTER_BLOCKID__SQ] = { + .num_of_slots = 16, + .num_of_counters = sizeof(carrizo_sq_counter_ids)/sizeof(*carrizo_sq_counter_ids), + .counter_ids = carrizo_sq_counter_ids, + .counter_size_in_bits = 64, + .counter_mask = BITMASK(64) + }, +}; + +HSAKMT_STATUS +get_block_properties(uint16_t dev_id, + enum perf_block_id block_id, + struct perf_counter_block *block) +{ + HSAKMT_STATUS rc = HSAKMT_STATUS_SUCCESS; + if (block_id > PERFCOUNTER_BLOCKID__MAX || block_id < PERFCOUNTER_BLOCKID__FIRST) + return HSAKMT_STATUS_INVALID_PARAMETER; + + switch(dev_id) { + case 0x1304: + case 0x1305: + case 0x1306: + case 0x1307: + case 0x1309: + case 0x130A: + case 0x130B: + case 0x130C: + case 0x130D: + case 0x130E: + case 0x130F: + case 0x1310: + case 0x1311: + case 0x1312: + case 0x1313: + case 0x1315: + case 0x1316: + case 0x1317: + case 0x1318: + case 0x131B: + 
case 0x131C: + case 0x131D: + *block = kaveri_blocks[block_id]; + break; + + case 0x9870: + case 0x9874: + case 0x9875: + case 0x9876: + case 0x9877: + *block = carrizo_blocks[block_id]; + break; + + default: + rc = HSAKMT_STATUS_INVALID_PARAMETER; + } + + return rc; +} + + diff --git a/hsakmt/pmc_table.h b/hsakmt/pmc_table.h new file mode 100644 index 0000000..35ed07e --- /dev/null +++ b/hsakmt/pmc_table.h @@ -0,0 +1,50 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef PMC_TABLE_H +#define PMC_TABLE_H + +#include "libhsakmt.h" + +enum perf_block_id { + PERFCOUNTER_BLOCKID__FIRST = 0, + PERFCOUNTER_BLOCKID__SQ = PERFCOUNTER_BLOCKID__FIRST, + PERFCOUNTER_BLOCKID__MAX +}; + +struct perf_counter_block { + uint32_t num_of_slots; + uint32_t num_of_counters; + uint32_t *counter_ids; + uint32_t counter_size_in_bits; + uint64_t counter_mask; +}; + +HSAKMT_STATUS +get_block_properties(uint16_t dev_id, + enum perf_block_id block_id, + struct perf_counter_block *block); + +#endif // PMC_TABLE_H diff --git a/hsakmt/queues.c b/hsakmt/queues.c new file mode 100644 index 0000000..2d7692f --- /dev/null +++ b/hsakmt/queues.c @@ -0,0 +1,341 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include "libhsakmt.h" +#include "linux/kfd_ioctl.h" +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <math.h> +#include <stdio.h> +#include <sys/types.h> +#include <sys/mman.h> +#include <fcntl.h> + +/* 1024 doorbells, 4 bytes each doorbell */ +#define DOORBELLS_PAGE_SIZE 1024 * 4 + +struct device_info +{ + uint32_t ctx_save_restore_size; + uint32_t eop_buffer_size; +}; + +struct device_info kaveri_device_info = { + .ctx_save_restore_size = 0, + .eop_buffer_size = 0, +}; + +struct device_info carrizo_device_info = { + .ctx_save_restore_size = 2756608, + .eop_buffer_size = 4096, +}; + +struct device_id +{ + uint16_t dev_id; + struct device_info *dev_info; +}; + +struct device_id supported_devices[] = { + { 0x1304, &kaveri_device_info }, /* Kaveri */ + { 0x1305, &kaveri_device_info }, /* Kaveri */ + { 0x1306, &kaveri_device_info }, /* Kaveri */ + { 0x1307, &kaveri_device_info }, /* Kaveri */ + { 0x1309, &kaveri_device_info }, /* Kaveri */ + { 0x130A, &kaveri_device_info }, /* Kaveri */ + { 0x130B, &kaveri_device_info }, /* Kaveri */ + { 0x130C, &kaveri_device_info }, /* Kaveri */ + { 0x130D, &kaveri_device_info }, /* Kaveri */ + { 0x130E, &kaveri_device_info }, /* Kaveri */ + { 0x130F, &kaveri_device_info }, /* Kaveri */ + { 0x1310, &kaveri_device_info }, /* Kaveri */ + { 0x1311, &kaveri_device_info }, /* Kaveri */ + { 0x1312, &kaveri_device_info }, /* Kaveri */ + { 0x1313, &kaveri_device_info }, /* Kaveri */ + { 0x1315, &kaveri_device_info }, /* Kaveri */ + { 0x1316, &kaveri_device_info }, /* Kaveri */ + { 0x1317, &kaveri_device_info }, /* Kaveri */ + { 0x1318, &kaveri_device_info }, /* Kaveri */ + { 0x131B, &kaveri_device_info }, /* Kaveri */ + { 0x131C, &kaveri_device_info }, /* Kaveri */ + { 0x131D, &kaveri_device_info }, /* Kaveri */ + { 0x9870, &carrizo_device_info }, /* Carrizo */ + { 0x9874, &carrizo_device_info }, /* Carrizo */ + { 0x9875, &carrizo_device_info }, /* Carrizo */ + { 0x9876, &carrizo_device_info }, /* Carrizo 
*/ + { 0x9877, &carrizo_device_info }, /* Carrizo */ + { 0, NULL } +}; + +struct queue +{ + uint32_t queue_id; + uint32_t wptr; + uint32_t rptr; + void *eop_buffer; + void *ctx_save_restore; +}; + +struct process_doorbells +{ + bool need_mmap; + void* doorbells; + pthread_mutex_t doorbells_mutex; +}; + +struct process_doorbells doorbells[] = {[0 ... (NUM_OF_SUPPORTED_GPUS-1)] = {.need_mmap = true, .doorbells = NULL, .doorbells_mutex = PTHREAD_MUTEX_INITIALIZER}}; + +static struct device_info *get_device_info_by_dev_id(uint16_t dev_id) +{ + int i = 0; + while (supported_devices[i].dev_id != 0) { + if (supported_devices[i].dev_id == dev_id) { + return supported_devices[i].dev_info; + } + i++; + } + + return NULL; +} + +static void free_queue(struct queue *q) +{ + if (q->eop_buffer) + free(q->eop_buffer); + if (q->ctx_save_restore) + free(q->ctx_save_restore); + free(q); +} + +static void* allocate_exec_aligned_memory(uint32_t size, uint32_t align) +{ + void *ptr; + int retval; + + retval = posix_memalign(&ptr, align, size); + if (retval != 0) + return NULL; + + retval = mprotect(ptr, size, PROT_READ | PROT_WRITE | PROT_EXEC); + if (retval != 0) { + free(ptr); + return NULL; + } + + memset(ptr, 0, size); + return ptr; +} + +static int handle_concrete_asic(struct device_info *dev_info, struct queue *q, + struct kfd_ioctl_create_queue_args *args) +{ + if (dev_info) { + if (dev_info->eop_buffer_size > 0) { + q->eop_buffer = + allocate_exec_aligned_memory(dev_info->eop_buffer_size, PAGE_SIZE); + if (q->eop_buffer == NULL) { + return HSAKMT_STATUS_NO_MEMORY; + } + args->eop_buffer_address = (uintptr_t)q->eop_buffer; + args->eop_buffer_size = dev_info->eop_buffer_size; + } + if (dev_info->ctx_save_restore_size > 0) { + args->ctx_save_restore_size = dev_info->ctx_save_restore_size; + q->ctx_save_restore = + allocate_exec_aligned_memory(dev_info->ctx_save_restore_size, PAGE_SIZE); + if (q->ctx_save_restore == NULL) {; + return HSAKMT_STATUS_NO_MEMORY; + } + 
args->ctx_save_restore_address = (uintptr_t)q->ctx_save_restore; + } + } + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtCreateQueue( + HSAuint32 NodeId, //IN + HSA_QUEUE_TYPE Type, //IN + HSAuint32 QueuePercentage, //IN + HSA_QUEUE_PRIORITY Priority, //IN + void* QueueAddress, //IN + HSAuint64 QueueSizeInBytes, //IN + HsaEvent* Event, //IN + HsaQueueResource* QueueResource //OUT + ) +{ + HSAKMT_STATUS result; + uint32_t gpu_id; + uint16_t dev_id; + struct device_info *dev_info; + int err; + void* ptr; + CHECK_KFD_OPEN(); + + result = validate_nodeid(NodeId, &gpu_id); + if (result != HSAKMT_STATUS_SUCCESS) + return result; + + struct queue *q = malloc(sizeof(struct queue)); + if (q == NULL) + return HSAKMT_STATUS_NO_MEMORY; + memset(q, 0, sizeof(*q)); + + struct kfd_ioctl_create_queue_args args; + memset(&args, 0, sizeof(args)); + + dev_id = get_device_id_by_node(NodeId); + dev_info = get_device_info_by_dev_id(dev_id); + args.gpu_id = gpu_id; + + err = handle_concrete_asic(dev_info, q, &args); + if (err != HSAKMT_STATUS_SUCCESS) { + free_queue(q); + return err; + } + + switch (Type) + { + case HSA_QUEUE_COMPUTE: args.queue_type = KFD_IOC_QUEUE_TYPE_COMPUTE; break; + case HSA_QUEUE_SDMA: free(q); return HSAKMT_STATUS_NOT_IMPLEMENTED; + case HSA_QUEUE_COMPUTE_AQL: args.queue_type = KFD_IOC_QUEUE_TYPE_COMPUTE_AQL; break; + default: free_queue(q); return HSAKMT_STATUS_INVALID_PARAMETER; + } + + if (Type != HSA_QUEUE_COMPUTE_AQL) + { + QueueResource->QueueRptrValue = (uintptr_t)&q->rptr; + QueueResource->QueueWptrValue = (uintptr_t)&q->wptr; + } + + args.read_pointer_address = QueueResource->QueueRptrValue; + args.write_pointer_address = QueueResource->QueueWptrValue; + args.ring_base_address = (uintptr_t)QueueAddress; + args.ring_size = QueueSizeInBytes; + args.queue_percentage = QueuePercentage; + args.queue_priority = Priority; + + err = kmtIoctl(kfd_fd, AMDKFD_IOC_CREATE_QUEUE, &args); + + if (err == -1) + { + free_queue(q); + return 
HSAKMT_STATUS_ERROR; + } + + q->queue_id = args.queue_id; + + pthread_mutex_lock(&doorbells[NodeId].doorbells_mutex); + + if (doorbells[NodeId].need_mmap) { + ptr = mmap(0, DOORBELLS_PAGE_SIZE, PROT_READ|PROT_WRITE, + MAP_SHARED, kfd_fd, args.doorbell_offset); + + if (ptr == MAP_FAILED) { + pthread_mutex_unlock(&doorbells[NodeId].doorbells_mutex); + hsaKmtDestroyQueue(q->queue_id); + free_queue(q); + return HSAKMT_STATUS_ERROR; + } + + doorbells[NodeId].need_mmap = false; + doorbells[NodeId].doorbells = ptr; + } + + pthread_mutex_unlock(&doorbells[NodeId].doorbells_mutex); + + QueueResource->QueueId = PORT_VPTR_TO_UINT64(q); + QueueResource->Queue_DoorBell = VOID_PTR_ADD32(doorbells[NodeId].doorbells, q->queue_id); + + return HSAKMT_STATUS_SUCCESS; +} + + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtUpdateQueue( + HSA_QUEUEID QueueId, //IN + HSAuint32 QueuePercentage,//IN + HSA_QUEUE_PRIORITY Priority, //IN + void* QueueAddress, //IN + HSAuint64 QueueSize, //IN + HsaEvent* Event //IN + ) +{ + struct kfd_ioctl_update_queue_args arg; + struct queue *q = PORT_UINT64_TO_VPTR(QueueId); + + CHECK_KFD_OPEN(); + + if (q == NULL) + return (HSAKMT_STATUS_INVALID_PARAMETER); + arg.queue_id = (HSAuint32)q->queue_id; + arg.ring_base_address = (uintptr_t)QueueAddress; + arg.ring_size = QueueSize; + arg.queue_percentage = QueuePercentage; + arg.queue_priority = Priority; + + int err = kmtIoctl(kfd_fd, AMDKFD_IOC_UPDATE_QUEUE, &arg); + if (err == -1) + { + return HSAKMT_STATUS_ERROR; + } + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtDestroyQueue( + HSA_QUEUEID QueueId //IN + ) +{ + CHECK_KFD_OPEN(); + + struct queue *q = PORT_UINT64_TO_VPTR(QueueId); + struct kfd_ioctl_destroy_queue_args args; + + if (q == NULL) + return (HSAKMT_STATUS_INVALID_PARAMETER); + + memset(&args, 0, sizeof(args)); + + args.queue_id = q->queue_id; + + int err = kmtIoctl(kfd_fd, AMDKFD_IOC_DESTROY_QUEUE, &args); + + if (err == -1) + { + return HSAKMT_STATUS_ERROR; + } + else + { + 
free_queue(q); + return HSAKMT_STATUS_SUCCESS; + } +} diff --git a/hsakmt/time.c b/hsakmt/time.c new file mode 100644 index 0000000..45709f9 --- /dev/null +++ b/hsakmt/time.c @@ -0,0 +1,61 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include "libhsakmt.h" +#include "linux/kfd_ioctl.h" + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetClockCounters( + HSAuint32 NodeId, //IN + HsaClockCounters* Counters //OUT + ) +{ + HSAKMT_STATUS result; + uint32_t gpu_id; + struct kfd_ioctl_get_clock_counters_args args; + int err; + + CHECK_KFD_OPEN(); + + result = validate_nodeid(NodeId, &gpu_id); + if (result != HSAKMT_STATUS_SUCCESS) + return result; + + args.gpu_id = gpu_id; + + err = kmtIoctl(kfd_fd, AMDKFD_IOC_GET_CLOCK_COUNTERS, &args); + if (err < 0) { + result = HSAKMT_STATUS_ERROR; + } else { + /* At this point the result is already HSAKMT_STATUS_SUCCESS */ + Counters->GPUClockCounter = args.gpu_clock_counter; + Counters->CPUClockCounter = args.cpu_clock_counter; + Counters->SystemClockCounter = args.system_clock_counter; + Counters->SystemClockFrequencyHz = args.system_clock_freq; + } + + return result; +} diff --git a/hsakmt/topology.c b/hsakmt/topology.c new file mode 100644 index 0000000..903b6f7 --- /dev/null +++ b/hsakmt/topology.c @@ -0,0 +1,991 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include <assert.h> +#include <stdio.h> +#include <dirent.h> +#include <malloc.h> +#include <string.h> + +#include "libhsakmt.h" +#include "fmm.h" +#define PAGE_SIZE 4096 +#define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) +#define NUM_OF_HEAPS 2 +/* SYSFS related */ +#define KFD_SYSFS_PATH_GENERATION_ID "/sys/devices/virtual/kfd/kfd/topology/generation_id" +#define KFD_SYSFS_PATH_SYSTEM_PROPERTIES "/sys/devices/virtual/kfd/kfd/topology/system_properties" +#define KFD_SYSFS_PATH_NODES "/sys/devices/virtual/kfd/kfd/topology/nodes" + +typedef struct { + uint32_t gpu_id; + HsaNodeProperties node; + HsaMemoryProperties *mem; /* node->NumBanks elements */ + HsaCacheProperties *cache; + HsaIoLinkProperties *link; +} node_t; + +static HsaSystemProperties *system = NULL; +static node_t *node = NULL; + +static HSAKMT_STATUS topology_take_snapshot(void); +static HSAKMT_STATUS topology_drop_snapshot(void); +static int get_cpu_stepping(uint16_t* stepping); + +static void +free_node(node_t *n) +{ + assert(n); + + if (n == NULL) + return; + + if ((n)->mem) + free((n)->mem); + if ((n)->cache) + free((n)->cache); + if ((n)->link) + free((n)->link); +} + +static HSAKMT_STATUS +topology_sysfs_get_generation(uint32_t *gen) { + FILE *fd; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + assert(gen); + fd = fopen(KFD_SYSFS_PATH_GENERATION_ID, "r"); + if (!fd) + return HSAKMT_STATUS_ERROR; + if (fscanf(fd, "%ul", gen) != 1) { + ret = HSAKMT_STATUS_ERROR; + goto err; + } + +err: + fclose(fd); + return ret; +} + +static HSAKMT_STATUS +topology_sysfs_get_system_props(HsaSystemProperties *props) { + FILE *fd; + DIR *dirp; + char *read_buf, *p; + char prop_name[256]; + long long unsigned int prop_val; + uint32_t node_count, 
prog; + struct dirent *dir; + int read_size; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + + assert(props); + fd = fopen(KFD_SYSFS_PATH_SYSTEM_PROPERTIES, "r"); + if (!fd) + return HSAKMT_STATUS_ERROR; + + read_buf = malloc(PAGE_SIZE); + if (!read_buf) { + ret = HSAKMT_STATUS_NO_MEMORY; + goto err1; + } + + read_size = fread(read_buf, 1, PAGE_SIZE, fd); + if (read_size <= 0) { + ret = HSAKMT_STATUS_ERROR; + goto err2; + } + + /* Since we're using the buffer as a string, we make sure the string terminates */ + if(read_size >= PAGE_SIZE) + read_size = PAGE_SIZE-1; + read_buf[read_size] = 0; + + /* + * Read the system properties + */ + prog = 0; + p = read_buf; + while(sscanf(p+=prog, "%s %llu\n%n", prop_name, &prop_val, &prog) == 2) { + if (strcmp(prop_name,"platform_oem") == 0) + props->PlatformOem = (uint32_t)prop_val; + else if (strcmp(prop_name,"platform_id") == 0) + props->PlatformId = (uint32_t)prop_val; + else if (strcmp(prop_name,"platform_rev") == 0) + props->PlatformRev = (uint32_t)prop_val; + } + + /* + * Discover the number of nodes + */ + node_count = 0; + dirp = opendir(KFD_SYSFS_PATH_NODES); + if(dirp) { + /* + * Assuming that inside nodes folder there are only folders + * which represent the node numbers + */ + while ((dir = readdir(dirp)) != 0) { + if ((strcmp(dir->d_name, ".") == 0) || + (strcmp(dir->d_name, "..") == 0)) + continue; + node_count++; + } + closedir(dirp); + } + props->NumNodes = node_count; + + +err2: + free(read_buf); +err1: + fclose(fd); + return ret; +} + +static HSAKMT_STATUS +topology_sysfs_get_gpu_id(uint32_t node_id, uint32_t *gpu_id) { + FILE *fd; + char path[256]; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + assert(gpu_id); + snprintf(path, 256, "%s/%d/gpu_id", KFD_SYSFS_PATH_NODES, node_id); + fd = fopen(path, "r"); + if (!fd) + return HSAKMT_STATUS_ERROR; + if (fscanf(fd, "%ul", gpu_id) != 1) { + ret = HSAKMT_STATUS_ERROR; + } + fclose(fd); + + return ret; +} + +static HSAKMT_STATUS 
+topology_sysfs_get_node_props(uint32_t node_id, HsaNodeProperties *props, uint32_t *gpu_id) { + FILE *fd; + char *read_buf, *p; + char prop_name[256]; + char path[256]; + long long unsigned int prop_val; + uint32_t i, prog; + uint16_t stepping = 0, fw_version = 0; + int read_size; + + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + assert(props); + assert(gpu_id); + /* Retrieve the GPU ID */ + ret = topology_sysfs_get_gpu_id(node_id, gpu_id); + + /* Retrieve the marketing name of the node */ + snprintf(path, 256, "%s/%d/name", KFD_SYSFS_PATH_NODES, node_id); + fd = fopen(path, "r"); + if (!fd) + return HSAKMT_STATUS_ERROR; + + read_buf = malloc(PAGE_SIZE); + if (!read_buf) { + ret = HSAKMT_STATUS_NO_MEMORY; + goto err1; + } + + read_size = fread(read_buf, 1, PAGE_SIZE, fd); + if (read_size <= 0) { + ret = HSAKMT_STATUS_ERROR; + goto err2; + } + p = memchr(read_buf, '\n', read_size); + if ((!p) || ((p-read_buf) > HSA_PUBLIC_NAME_SIZE)) { + ret = HSAKMT_STATUS_ERROR; + goto err2; + } + /* + * Convert UTF8 to UTF16 + */ + for (i = 0; (i < HSA_PUBLIC_NAME_SIZE) && (read_buf[i] != '\n'); i++) + props->MarketingName[i] = read_buf[i]; + props->MarketingName[i] = 0; + fclose(fd); + + /* Retrieve the node properties */ + snprintf(path, 256, "%s/%d/properties", KFD_SYSFS_PATH_NODES, node_id); + fd = fopen(path, "r"); + if (!fd) { + free(read_buf); + return HSAKMT_STATUS_ERROR; + } + + read_size = fread(read_buf, 1, PAGE_SIZE, fd); + if (read_size <= 0) { + ret = HSAKMT_STATUS_ERROR; + goto err2; + } + + /* Since we're using the buffer as a string, we make sure the string terminates */ + if(read_size >= PAGE_SIZE) + read_size = PAGE_SIZE-1; + read_buf[read_size] = 0; + + /* + * Read the node properties + */ + prog = 0; + p = read_buf; + while(sscanf(p+=prog, "%s %llu\n%n", prop_name, &prop_val, &prog) == 2) { + if (strcmp(prop_name,"cpu_cores_count") == 0) + props->NumCPUCores = (uint32_t)prop_val; + else if (strcmp(prop_name,"simd_count") == 0) + props->NumFComputeCores = 
(uint32_t)prop_val; + else if (strcmp(prop_name,"mem_banks_count") == 0) + props->NumMemoryBanks = (uint32_t)prop_val; + else if (strcmp(prop_name,"caches_count") == 0) + props->NumCaches = (uint32_t)prop_val; + else if (strcmp(prop_name,"io_links_count") == 0) + props->NumIOLinks = (uint32_t)prop_val; + else if (strcmp(prop_name,"cpu_core_id_base") == 0) + props->CComputeIdLo = (uint32_t)prop_val; + else if (strcmp(prop_name,"simd_id_base") == 0) + props->FComputeIdLo = (uint32_t)prop_val; + else if (strcmp(prop_name,"capability") == 0) + props->Capability.Value = (uint32_t)prop_val; + else if (strcmp(prop_name,"max_waves_per_simd") == 0) + props->MaxWavesPerSIMD = (uint32_t)prop_val; + else if (strcmp(prop_name,"lds_size_in_kb") == 0) + props->LDSSizeInKB = (uint32_t)prop_val; + else if (strcmp(prop_name,"gds_size_in_kb") == 0) + props->GDSSizeInKB = (uint32_t)prop_val; + else if (strcmp(prop_name,"wave_front_size") == 0) + props->WaveFrontSize = (uint32_t)prop_val; + else if (strcmp(prop_name,"array_count") == 0) + props->NumShaderBanks = (uint32_t)prop_val; + else if (strcmp(prop_name,"simd_arrays_per_engine") == 0) + props->NumArrays = (uint32_t)prop_val; + else if (strcmp(prop_name,"cu_per_simd_array") == 0) + props->NumCUPerArray = (uint32_t)prop_val; + else if (strcmp(prop_name,"simd_per_cu") == 0) + props->NumSIMDPerCU = (uint32_t)prop_val; + else if (strcmp(prop_name,"max_slots_scratch_cu") == 0) + props->MaxSlotsScratchCU = (uint32_t)prop_val; + else if (strcmp(prop_name,"fw_version") == 0) + fw_version = (uint16_t)prop_val; + else if (strcmp(prop_name,"vendor_id") == 0) + props->VendorId = (uint32_t)prop_val; + else if (strcmp(prop_name,"device_id") == 0) + props->DeviceId = (uint32_t)prop_val; + else if (strcmp(prop_name,"location_id") == 0) + props->LocationId = (uint32_t)prop_val; + else if (strcmp(prop_name,"max_engine_clk_fcompute") == 0) + props->MaxEngineClockMhzFCompute = (uint32_t)prop_val; + else if (strcmp(prop_name,"max_engine_clk_ccompute") 
== 0) + props->MaxEngineClockMhzCCompute = (uint32_t)prop_val; + else if (strcmp(prop_name,"local_mem_size") == 0) + props->LocalMemSize = (uint32_t)prop_val; + + } + + get_cpu_stepping(&stepping); + props->EngineId = ((stepping << 16) | fw_version); + +err2: + free(read_buf); +err1: + fclose(fd); + return ret; +} + +static HSAKMT_STATUS +topology_sysfs_get_mem_props(uint32_t node_id, uint32_t mem_id, HsaMemoryProperties *props) { + FILE *fd; + char *read_buf, *p; + char prop_name[256]; + char path[256]; + long long unsigned int prop_val; + uint32_t prog; + int read_size; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + assert(props); + snprintf(path, 256, "%s/%d/mem_banks/%d/properties", KFD_SYSFS_PATH_NODES, node_id, mem_id); + fd = fopen(path, "r"); + if (!fd) { + return HSAKMT_STATUS_ERROR; + } + read_buf = malloc(PAGE_SIZE); + if (!read_buf) { + ret = HSAKMT_STATUS_NO_MEMORY; + goto err1; + } + + read_size = fread(read_buf, 1, PAGE_SIZE, fd); + if (read_size <= 0) { + ret = HSAKMT_STATUS_ERROR; + goto err2; + } + + /* Since we're using the buffer as a string, we make sure the string terminates */ + if(read_size >= PAGE_SIZE) + read_size = PAGE_SIZE-1; + read_buf[read_size] = 0; + + prog = 0; + p = read_buf; + while(sscanf(p+=prog, "%s %llu\n%n", prop_name, &prop_val, &prog) == 2) { + if (strcmp(prop_name,"heap_type") == 0) + props->HeapType = (uint32_t)prop_val; + else if (strcmp(prop_name,"size_in_bytes") == 0) + props->SizeInBytes = prop_val; + else if (strcmp(prop_name,"flags") == 0) + props->Flags.MemoryProperty = (uint32_t)prop_val; + else if (strcmp(prop_name,"width") == 0) + props->Width = (uint32_t)prop_val; + else if (strcmp(prop_name,"mem_clk_max") == 0) + props->MemoryClockMax = (uint32_t)prop_val; + } + +err2: + free(read_buf); +err1: + fclose(fd); + return ret; +} + +static HSAKMT_STATUS +topology_sysfs_get_cache_props(uint32_t node_id, uint32_t cache_id, HsaCacheProperties *props) { + FILE *fd; + char *read_buf, *p; + char prop_name[256]; + char 
path[256]; + long long unsigned int prop_val; + uint32_t i, prog; + int read_size; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + assert(props); + snprintf(path, 256, "%s/%d/caches/%d/properties", KFD_SYSFS_PATH_NODES, node_id, cache_id); + fd = fopen(path, "r"); + if (!fd) { + return HSAKMT_STATUS_ERROR; + } + read_buf = malloc(PAGE_SIZE); + if (!read_buf) { + ret = HSAKMT_STATUS_NO_MEMORY; + goto err1; + } + + read_size = fread(read_buf, 1, PAGE_SIZE, fd); + if (read_size <= 0) { + ret = HSAKMT_STATUS_ERROR; + goto err2; + } + + /* Since we're using the buffer as a string, we make sure the string terminates */ + if(read_size >= PAGE_SIZE) + read_size = PAGE_SIZE-1; + read_buf[read_size] = 0; + + prog = 0; + p = read_buf; + while(sscanf(p+=prog, "%s %llu\n%n", prop_name, &prop_val, &prog) == 2) { + if (strcmp(prop_name,"processor_id_low") == 0) + props->ProcessorIdLow = (uint32_t)prop_val; + else if (strcmp(prop_name,"level") == 0) + props->CacheLevel = (uint32_t)prop_val; + else if (strcmp(prop_name,"size") == 0) + props->CacheSize = (uint32_t)prop_val; + else if (strcmp(prop_name,"cache_line_size") == 0) + props->CacheLineSize = (uint32_t)prop_val; + else if (strcmp(prop_name,"cache_lines_per_tag") == 0) + props->CacheLinesPerTag = (uint32_t)prop_val; + else if (strcmp(prop_name,"association") == 0) + props->CacheAssociativity = (uint32_t)prop_val; + else if (strcmp(prop_name,"latency") == 0) + props->CacheLatency = (uint32_t)prop_val; + else if (strcmp(prop_name,"type") == 0) + props->CacheType.Value = (uint32_t)prop_val; + else if (strcmp(prop_name, "sibling_map") == 0) + break; + } + + prog = 0; + if ((sscanf(p, "sibling_map %n", &prog)) == 0 && prog) { + i = 0; + while ((i < HSA_CPU_SIBLINGS) && + (sscanf(p+=prog, "%u%*[,\n]%n", &props->SiblingMap[i++], + &prog) == 1)); + } + +err2: + free(read_buf); +err1: + fclose(fd); + return ret; +} + +static HSAKMT_STATUS +topology_sysfs_get_iolink_props(uint32_t node_id, uint32_t iolink_id, HsaIoLinkProperties 
*props) { + FILE *fd; + char *read_buf, *p; + char prop_name[256]; + char path[256]; + long long unsigned int prop_val; + uint32_t prog; + int read_size; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + assert(props); + snprintf(path, 256, "%s/%d/io_link/%d/properties", KFD_SYSFS_PATH_NODES, node_id, iolink_id); + fd = fopen(path, "r"); + if (!fd) { + return HSAKMT_STATUS_ERROR; + } + read_buf = malloc(PAGE_SIZE); + if (!read_buf) { + ret = HSAKMT_STATUS_NO_MEMORY; + goto err1; + } + + read_size = fread(read_buf, 1, PAGE_SIZE, fd); + if (read_size <= 0) { + ret = HSAKMT_STATUS_ERROR; + goto err2; + } + + /* Since we're using the buffer as a string, we make sure the string terminates */ + if(read_size >= PAGE_SIZE) + read_size = PAGE_SIZE-1; + read_buf[read_size] = 0; + + prog = 0; + p = read_buf; + while(sscanf(p+=prog, "%s %llu\n%n", prop_name, &prop_val, &prog) == 2) { + if (strcmp(prop_name,"type") == 0) + props->IoLinkType = (uint32_t)prop_val; + else if (strcmp(prop_name,"version_major") == 0) + props->VersionMajor = (uint32_t)prop_val; + else if (strcmp(prop_name,"version_minor") == 0) + props->VersionMinor = (uint32_t)prop_val; + else if (strcmp(prop_name,"node_from") == 0) + props->NodeFrom = (uint32_t)prop_val; + else if (strcmp(prop_name,"node_to") == 0) + props->NodeTo = (uint32_t)prop_val; + else if (strcmp(prop_name,"weight") == 0) + props->Weight = (uint32_t)prop_val; + else if (strcmp(prop_name,"min_latency") == 0) + props->MinimumLatency = (uint32_t)prop_val; + else if (strcmp(prop_name,"max_latency") == 0) + props->MaximumLatency = (uint32_t)prop_val; + else if (strcmp(prop_name,"min_bandwidth") == 0) + props->MinimumBandwidth = (uint32_t)prop_val; + else if (strcmp(prop_name,"max_bandwidth") == 0) + props->MaximumBandwidth = (uint32_t)prop_val; + else if (strcmp(prop_name,"recommended_transfer_size") == 0) + props->RecTransferSize = (uint32_t)prop_val; + else if (strcmp(prop_name,"flags") == 0) + props->Flags.LinkProperty = (uint32_t)prop_val; + } 
+ + +err2: + free(read_buf); +err1: + fclose(fd); + return ret; +} + +HSAKMT_STATUS +topology_take_snapshot(void) +{ + uint32_t gen_start, gen_end, i, j, mem_id, cache_id, link_id; + HsaSystemProperties sys_props; + node_t *temp_nodes = 0; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + +retry: + ret = topology_sysfs_get_generation(&gen_start); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + ret = topology_sysfs_get_system_props(&sys_props); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + if(sys_props.NumNodes > 0) { + temp_nodes = calloc(sys_props.NumNodes * sizeof(node_t),1); + if (!temp_nodes) + return HSAKMT_STATUS_NO_MEMORY; + for (i = 0; i < sys_props.NumNodes; i++) { + ret = topology_sysfs_get_node_props(i, + &temp_nodes[i].node, + &temp_nodes[i].gpu_id); + if (ret != HSAKMT_STATUS_SUCCESS) { + for (j=0; j < i; j++) + free_node(&temp_nodes[j]); + free(temp_nodes); + goto err; + } + if (temp_nodes[i].node.NumMemoryBanks) { + temp_nodes[i].mem = calloc(temp_nodes[i].node.NumMemoryBanks * sizeof(HsaMemoryProperties), 1); + if (!temp_nodes[i].mem) { + ret = HSAKMT_STATUS_NO_MEMORY; + for (j=0; j <= i; j++) + free_node(&temp_nodes[j]); + free(temp_nodes); + goto err; + } + for (mem_id = 0; mem_id < temp_nodes[i].node.NumMemoryBanks; mem_id++) { + ret = topology_sysfs_get_mem_props(i, mem_id, &temp_nodes[i].mem[mem_id]); + if (ret != HSAKMT_STATUS_SUCCESS) { + for (j=0; j <= i; j++) + free_node(&temp_nodes[j]); + free(temp_nodes); + goto err; + } + } + } + + if (temp_nodes[i].node.NumCaches) { + temp_nodes[i].cache = calloc(temp_nodes[i].node.NumCaches * sizeof(HsaCacheProperties), 1); + if (!temp_nodes[i].cache) { + ret = HSAKMT_STATUS_NO_MEMORY; + for (j=0; j <= i; j++) + free_node(&temp_nodes[j]); + free(temp_nodes); + goto err; + } + for (cache_id = 0; cache_id < temp_nodes[i].node.NumCaches; cache_id++) { + ret = topology_sysfs_get_cache_props(i, cache_id, &temp_nodes[i].cache[cache_id]); + if (ret != HSAKMT_STATUS_SUCCESS) { + for (j=0; j <= i; j++) + 
free_node(&temp_nodes[j]); + free(temp_nodes); + goto err; + } + } + } + + if (temp_nodes[i].node.NumIOLinks) { + temp_nodes[i].link = calloc(temp_nodes[i].node.NumIOLinks * sizeof(HsaIoLinkProperties), 1); + if (!temp_nodes[i].link) { + ret = HSAKMT_STATUS_NO_MEMORY; + for (j=0; j <= i; j++) + free_node(&temp_nodes[j]); + free(temp_nodes); + goto err; + } + for (link_id = 0; link_id < temp_nodes[i].node.NumIOLinks; link_id++) { + ret = topology_sysfs_get_iolink_props(i, link_id, &temp_nodes[i].link[link_id]); + if (ret != HSAKMT_STATUS_SUCCESS) { + for (j=0; j <= i; j++) + free_node(&temp_nodes[j]); + free(temp_nodes); + goto err; + } + } + } + + } + } + + ret = topology_sysfs_get_generation(&gen_end); + if (ret != HSAKMT_STATUS_SUCCESS) { + if (temp_nodes) { + for (j=0; j < sys_props.NumNodes; j++) + free_node(&temp_nodes[j]); + free(temp_nodes); + } + goto err; + } + + if (gen_start != gen_end) { + if (temp_nodes) { + for (j=0; j < sys_props.NumNodes; j++) + free_node(&temp_nodes[j]); + free(temp_nodes); + temp_nodes = 0; + } + goto retry; + } + + if (!system) { + system = malloc(sizeof(HsaSystemProperties)); + if (!system) { + if (temp_nodes) { + for (j=0; j < sys_props.NumNodes; j++) + free_node(&temp_nodes[j]); + free(temp_nodes); + } + return HSAKMT_STATUS_NO_MEMORY; + } + } + + *system = sys_props; + if (node) + free(node); + node = temp_nodes; +err: + + return ret; +} + +/* + * Drop the Snashot of the HSA topology information. + * Assume lock is held. 
+ */ +HSAKMT_STATUS +topology_drop_snapshot(void) +{ + HSAKMT_STATUS err; + + if (!!system != !!node) { + printf("Probable inconsistency?\n"); + err = HSAKMT_STATUS_SUCCESS; + goto out; + } + + if (node) { + uint64_t nodeid; + + /* Remove state */ + for (nodeid = 0; nodeid < system->NumNodes; nodeid++) { + free_node(&node[nodeid]); + } + + free(node); + node = NULL; + } + + free(system); + system = NULL; + err = HSAKMT_STATUS_SUCCESS; + +out: + return err; +} + +HSAKMT_STATUS +validate_nodeid(uint32_t nodeid, uint32_t *gpu_id) +{ + if (nodeid >= MAX_NODES || !node || !system || system->NumNodes <= nodeid) + return HSAKMT_STATUS_INVALID_NODE_UNIT; + if (gpu_id) + *gpu_id = node[nodeid].gpu_id; + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS +gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id){ + uint64_t node_idx; + for(node_idx = 0; node_idx < system->NumNodes; node_idx++){ + if (node[node_idx].gpu_id == gpu_id){ + *node_id = node_idx; + return HSAKMT_STATUS_SUCCESS; + } + } + + return HSAKMT_STATUS_INVALID_NODE_UNIT; + +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtAcquireSystemProperties( + HsaSystemProperties* SystemProperties //OUT + ) +{ + HSAKMT_STATUS err; + CHECK_KFD_OPEN(); + + if (!SystemProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + pthread_mutex_lock(&hsakmt_mutex); + + err = topology_take_snapshot(); + if (err != HSAKMT_STATUS_SUCCESS) + goto out; + + assert(system); + + *SystemProperties = *system; + err = HSAKMT_STATUS_SUCCESS; + +out: + pthread_mutex_unlock(&hsakmt_mutex); + return err; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtReleaseSystemProperties(void) +{ + CHECK_KFD_OPEN(); + + HSAKMT_STATUS err; + + pthread_mutex_lock(&hsakmt_mutex); + + err = topology_drop_snapshot(); + + pthread_mutex_unlock(&hsakmt_mutex); + + return err; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetNodeProperties( + HSAuint32 NodeId, //IN + HsaNodeProperties* NodeProperties //OUT + ) +{ + HSAKMT_STATUS err; + uint32_t gpu_id; + + if (!NodeProperties) + return 
HSAKMT_STATUS_INVALID_PARAMETER; + + CHECK_KFD_OPEN(); + pthread_mutex_lock(&hsakmt_mutex); + + /* KFD ADD page 18, snapshot protocol violation */ + if (system == NULL) { + err = HSAKMT_STATUS_INVALID_NODE_UNIT; + assert(system); + goto out; + } + + if (NodeId >= system->NumNodes) { + err = HSAKMT_STATUS_INVALID_PARAMETER; + goto out; + } + + err = validate_nodeid(NodeId, &gpu_id); + if (err != HSAKMT_STATUS_SUCCESS) + return err; + + *NodeProperties = node[NodeId].node; + NodeProperties->NumMemoryBanks += NUM_OF_HEAPS; + + err = HSAKMT_STATUS_SUCCESS; + +out: + pthread_mutex_unlock(&hsakmt_mutex); + return err; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetNodeMemoryProperties( + HSAuint32 NodeId, //IN + HSAuint32 NumBanks, //IN + HsaMemoryProperties* MemoryProperties //OUT + ) +{ + HSAKMT_STATUS err; + uint32_t i, gpu_id; + + if (!MemoryProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + CHECK_KFD_OPEN(); + pthread_mutex_lock(&hsakmt_mutex); + + /* KFD ADD page 18, snapshot protocol violation */ + if (system == NULL) { + err = HSAKMT_STATUS_INVALID_NODE_UNIT; + assert(system); + goto out; + } + + /* Check still necessary */ + if (NodeId >= system->NumNodes ) { + err = HSAKMT_STATUS_INVALID_PARAMETER; + goto out; + } + + err = validate_nodeid(NodeId, &gpu_id); + if (err != HSAKMT_STATUS_SUCCESS) + return err; + + for (i = 0; i < MIN(node[NodeId].node.NumMemoryBanks, NumBanks); i++) { + assert(node[NodeId].mem); + MemoryProperties[i] = node[NodeId].mem[i]; + } + + /*Add LDS*/ + if (i < NumBanks){ + MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_LDS; + MemoryProperties[i].SizeInBytes = node[NodeId].node.LDSSizeInKB * 1024; + MemoryProperties[i].VirtualBaseAddress = fmm_get_aperture_base(FMM_LDS, gpu_id); + i++; + } + + /*Add Local memory - HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE*/ + if ((i < NumBanks) && (node[NodeId].node.LocalMemSize > 0)) { + MemoryProperties[i].HeapType = HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE; + MemoryProperties[i].SizeInBytes = 
node[NodeId].node.LocalMemSize; + MemoryProperties[i].VirtualBaseAddress = fmm_get_aperture_base(FMM_GPUVM, gpu_id); + i++; + } + + err = HSAKMT_STATUS_SUCCESS; + +out: + pthread_mutex_unlock(&hsakmt_mutex); + return err; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetNodeCacheProperties( + HSAuint32 NodeId, //IN + HSAuint32 ProcessorId, //IN + HSAuint32 NumCaches, //IN + HsaCacheProperties* CacheProperties //OUT + ) +{ + HSAKMT_STATUS err; + uint32_t i; + + if (!CacheProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + CHECK_KFD_OPEN(); + pthread_mutex_lock(&hsakmt_mutex); + + /* KFD ADD page 18, snapshot protocol violation */ + if (system == NULL) { + err = HSAKMT_STATUS_INVALID_NODE_UNIT; + assert(system); + goto out; + } + + if (NodeId >= system->NumNodes || NumCaches > node[NodeId].node.NumCaches) { + err = HSAKMT_STATUS_INVALID_PARAMETER; + goto out; + } + + for (i = 0; i < MIN(node[NodeId].node.NumCaches, NumCaches); i++) { + assert(node[NodeId].cache); + CacheProperties[i] = node[NodeId].cache[i]; + } + + err = HSAKMT_STATUS_SUCCESS; + +out: + pthread_mutex_unlock(&hsakmt_mutex); + return err; +} + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetNodeIoLinkProperties( + HSAuint32 NodeId, //IN + HSAuint32 NumIoLinks, //IN + HsaIoLinkProperties* IoLinkProperties //OUT + ) +{ + HSAKMT_STATUS err; + uint32_t i; + + if (!IoLinkProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + CHECK_KFD_OPEN(); + + pthread_mutex_lock(&hsakmt_mutex); + + /* KFD ADD page 18, snapshot protocol violation */ + if (system == NULL) { + err = HSAKMT_STATUS_INVALID_NODE_UNIT; + assert(system); + goto out; + } + + if (NodeId >= system->NumNodes || NumIoLinks > node[NodeId].node.NumIOLinks) { + err = HSAKMT_STATUS_INVALID_PARAMETER; + goto out; + } + + for (i = 0; i < MIN(node[NodeId].node.NumIOLinks, NumIoLinks); i++) { + assert(node[NodeId].link); + IoLinkProperties[i] = node[NodeId].link[i]; + } + + err = HSAKMT_STATUS_SUCCESS; + +out: + pthread_mutex_unlock(&hsakmt_mutex); + return err; +} + 
+uint16_t get_device_id_by_node(HSAuint32 node_id) +{ + if (!node || !system || system->NumNodes <= node_id) + return 0; + + return node[node_id].node.DeviceId; +} + +static int get_cpu_stepping(uint16_t* stepping) +{ + int ret; + FILE* fd = fopen("/proc/cpuinfo", "r"); + if (!fd) + return -1; + + char* read_buf = malloc(PAGE_SIZE); + if (!read_buf) { + ret = -1; + goto err1; + } + + int read_size = fread(read_buf, 1, PAGE_SIZE, fd); + if (read_size <= 0) { + ret = -2; + goto err2; + } + + /* Since we're using the buffer as a string, we make sure the string terminates */ + if(read_size >= PAGE_SIZE) + read_size = PAGE_SIZE-1; + read_buf[read_size] = 0; + + *stepping = 0; + + char* p = strstr(read_buf, "stepping"); + if (p) + sscanf(p , "stepping\t: %hu\n", stepping); + +err2: + free(read_buf); +err1: + fclose(fd); + + return ret; +} diff --git a/hsakmt/version.c b/hsakmt/version.c new file mode 100644 index 0000000..95bfec6 --- /dev/null +++ b/hsakmt/version.c @@ -0,0 +1,49 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "libhsakmt.h" +#include <stdlib.h> +#include <string.h> +#include "linux/kfd_ioctl.h" + +HSAKMT_STATUS +HSAKMTAPI +hsaKmtGetVersion( + HsaVersionInfo* VersionInfo //OUT + ) +{ + CHECK_KFD_OPEN(); + + struct kfd_ioctl_get_version_args args; + memset(&args, 0, sizeof(args)); + + if (kmtIoctl(kfd_fd, AMDKFD_IOC_GET_VERSION, &args) == -1) + return HSAKMT_STATUS_ERROR; + + VersionInfo->KernelInterfaceMajorVersion = args.major_version; + VersionInfo->KernelInterfaceMinorVersion = args.minor_version; + + return HSAKMT_STATUS_SUCCESS; +} |