summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPasha <pasha@member.fsf.org>2022-10-25 18:48:39 +0000
committerPasha <pasha@member.fsf.org>2022-10-25 18:48:39 +0000
commit4550eacc18535ea8480c8303c62adbd046fa031d (patch)
tree39c850531e37a9ae6d390ac35cf21a6029d5b5e7
downloadoneapi-4550eacc18535ea8480c8303c62adbd046fa031d.tar.gz
oneapi-4550eacc18535ea8480c8303c62adbd046fa031d.tar.bz2
initial test
-rwxr-xr-xsimplex-dev/CMakeLists.txt20
-rwxr-xr-xsimplex-dev/README.md268
-rwxr-xr-xsimplex-dev/sample.json41
-rwxr-xr-xsimplex-dev/src/CMakeLists.txt95
-rw-r--r--simplex-dev/src/host.cpp123
-rw-r--r--simplex-dev/src/kernel.cpp216
-rwxr-xr-xsimplex-dev/src/kernel.hpp13
7 files changed, 776 insertions, 0 deletions
diff --git a/simplex-dev/CMakeLists.txt b/simplex-dev/CMakeLists.txt
new file mode 100755
index 0000000..413315c
--- /dev/null
+++ b/simplex-dev/CMakeLists.txt
@@ -0,0 +1,20 @@
+if(UNIX)
+ # Direct CMake to use dpcpp rather than the default C++ compiler/linker
+ set(CMAKE_CXX_COMPILER dpcpp)
+else() # Windows
+ # Force CMake to use dpcpp rather than the default C++ compiler/linker
+ # (needed on Windows only)
+ include (CMakeForceCompiler)
+ CMAKE_FORCE_CXX_COMPILER (dpcpp IntelDPCPP)
+ include (Platform/Windows-Clang)
+endif()
+
+cmake_minimum_required (VERSION 3.4)
+
+project(FastRecompile CXX)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+add_subdirectory (src)
diff --git a/simplex-dev/README.md b/simplex-dev/README.md
new file mode 100755
index 0000000..cec86e1
--- /dev/null
+++ b/simplex-dev/README.md
@@ -0,0 +1,268 @@
+
+# Separating Host and Device Code Compilation
+This FPGA tutorial demonstrates how to separate the compilation of a program's host code and device code to save development time. It's recommended to read the 'fpga_compile' code sample before this one.
+
+| Optimized for | Description
+--- |---
+| OS | Linux* Ubuntu* 18.04/20.04 <br> RHEL*/CentOS* 8 <br> SUSE* 15 <br> Windows* 10
+| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA <br> Intel® FPGA Programmable Acceleration Card (PAC) D5005 (with Intel Stratix® 10 SX) <br> Intel® FPGA 3rd party / custom platforms with oneAPI support <br> *__Note__: Intel® FPGA PAC hardware is only compatible with Ubuntu 18.04*
+| Software | Intel® oneAPI DPC++ Compiler <br> Intel® FPGA Add-On for oneAPI Base Toolkit
+| What you will learn | Why to separate host and device code compilation in your FPGA project <br> How to use the `-reuse-exe` and device link methods <br> Which method to choose for your project
+| Time to complete | 15 minutes
+
+
+
+## Purpose
+Intel® oneAPI DPC++ Compiler only supports ahead-of-time (AoT) compilation for FPGA, which means that an FPGA device image is generated at compile time. The FPGA device image generation process can take hours to complete. Suppose you make a change that is exclusive to the host code. In that case, it is more efficient to recompile your host code only, re-using the existing FPGA device image and circumventing the time-consuming device compilation process.
+
+The compiler provides two different mechanisms to separate device code and host code compilation.
+* Passing the `-reuse-exe=<exe_name>` flag to `dpcpp` instructs the compiler to attempt to reuse the existing FPGA device image.
+* The more explicit "device link" method requires you to separate the host and device code into separate files. When a code change only applies to host-only files, an FPGA device image is not regenerated.
+
+This tutorial explains both mechanisms and the pros and cons of each. The included code sample demonstrates the device link method but does **not** demonstrate the use of the `-reuse-exe` flag.
+
+### Using the `-reuse-exe` flag
+
+If the device code and options affecting the device have not changed since the previous compilation, passing the `-reuse-exe=<exe_name>` flag to `dpcpp` instructs the compiler to extract the compiled FPGA binary from the existing executable and package it into the new executable, saving the device compilation time.
+
+**Sample usage:**
+
+```
+# Initial compilation
+dpcpp <files.cpp> -o out.fpga -Xshardware -fintelfpga
+```
+The initial compilation generates an FPGA device image, which takes several hours. Now, make some changes to the host code.
+```
+# Subsequent recompilation
+dpcpp <files.cpp> -o out.fpga -reuse-exe=out.fpga -Xshardware -fintelfpga
+```
+If `out.fpga` does not exist, `-reuse-exe` is ignored and the FPGA device image is regenerated. This will always be the case the first time a project is compiled.
+
+If `out.fpga` is found, the compiler checks whether any changes affecting the FPGA device code have been made since the last compilation. If no such changes are detected, the compiler reuses the existing FPGA binary, and only the host code is recompiled. The recompilation process takes a few minutes. Note that the device code is partially re-compiled (similar to a report flow compile) to check that the FPGA binary can safely be reused.
+
+If `out.fpga` is found but the compiler cannot prove that the FPGA device code will yield a result identical to the last compilation, a warning is printed and the FPGA device code is fully recompiled. Since the compiler checks must be conservative, spurious recompilations can sometimes occur when using `-reuse-exe`.
+
+### Using the device link method
+
+The program accompanying this tutorial is separated into two files, `host.cpp` and `kernel.cpp`. Only the `kernel. cpp` file contains device code.
+
+In the normal compilation process, FPGA device image generation happens at link time. As a result, any change to either `host.cpp` or `kernel.cpp` will trigger an FPGA device image's regeneration.
+
+```
+# normal compile command
+dpcpp -fintelfpga host.cpp kernel.cpp -Xshardware -o link.fpga
+```
+
+The following graph depicts this compilation process:
+
+![](normal_compile.png)
+
+
+If you want to iterate on the host code and avoid long compile time for your FPGA device, consider using a device link to separate device and host compilation:
+
+```
+# device link command
+dpcpp -fintelfpga -fsycl-link=image <input files> [options]
+```
+
+The compilation is a 3-step process:
+
+1. Compile the device code:
+
+ ```
+ dpcpp -fintelfpga -fsycl-link=image kernel.cpp -o dev_image.a -Xshardware
+ ```
+ Input files should include all source files that contain device code. This step may take several hours.
+
+
+2. Compile the host code:
+
+ ```
+ dpcpp -fintelfpga host.cpp -c -o host.o
+ ```
+ Input files should include all source files that only contain host code. This takes seconds.
+
+
+3. Create the device link:
+
+ ```
+ dpcpp -fintelfpga host.o dev_image.a -o fast_recompile.fpga
+ ```
+ The input should have N (N >= 0) host object files *(.o)* and one device image file *(.a)*. This takes seconds.
+
+**NOTE:** You only need to perform steps 2 and 3 when modifying host-only files.
+
+The following graph depicts the device link compilation process:
+
+![](device_link.png)
+
+### Which method to use?
+Of the two methods described, `-reuse-exe` is easier to use. It also allows you to keep your host and device code as single source, which is preferred for small programs.
+
+For larger and more complex projects, the device link method has the advantage of giving you complete control over the compiler's behavior.
+* When using `-reuse-exe`, the compiler must partially recompile and then analyze the device code to ensure that it is unchanged. This takes several minutes for larger designs. Compiling separate files does not incur this extra time.
+* When using `-reuse-exe`, you may occasionally encounter a "false positive" where the compiler wrongly believes that it must recompile your device code. In a single source file, the device and host code are coupled, so some changes to the host code _can_ change the compiler's view of the device code. The compiler will always behave conservatively and trigger a full recompilation if it cannot prove that reusing the previous FPGA binary is safe. Compiling separate files eliminates this possibility.
+
+### Additional Documentation
+- [Explore SYCL* Through Intel&reg; FPGA Code Samples](https://software.intel.com/content/www/us/en/develop/articles/explore-dpcpp-through-intel-fpga-code-samples.html) helps you to navigate the samples and build your knowledge of FPGAs and SYCL.
+- [FPGA Optimization Guide for Intel&reg; oneAPI Toolkits](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) helps you understand how to target FPGAs using SYCL and Intel&reg; oneAPI Toolkits.
+- [Intel&reg; oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) helps you understand target-independent, SYCL-compliant programming using Intel&reg; oneAPI Toolkits.
+
+## Key Concepts
+* Why to separate host and device code compilation in your FPGA project
+* How to use the `-reuse-exe` and device link methods
+* Which method to choose for your project
+
+## Building the `fast_recompile` Tutorial
+> **Note**: If you have not already done so, set up your CLI
+> environment by sourcing the `setvars` script located in
+> the root of your oneAPI installation.
+>
+> Linux*:
+> - For system wide installations: `. /opt/intel/oneapi/setvars.sh`
+> - For private installations: `. ~/intel/oneapi/setvars.sh`
+>
+> Windows*:
+> - `C:\Program Files(x86)\Intel\oneAPI\setvars.bat`
+> - For PowerShell*, use the following command: `cmd.exe "/K" '"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" && powershell'`
+>
+>For more information on environment variables, see **Use the setvars Script** for [Linux or macOS](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/oneapi-development-environment-setup/use-the-setvars-script-with-linux-or-macos.html), or [Windows](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/oneapi-development-environment-setup/use-the-setvars-script-with-windows.html).
+
+
+### Include Files
+The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Samples in Intel&reg; DevCloud
+If running a sample in the Intel&reg; DevCloud, remember that you must specify the type of compute node and whether to run in batch or interactive mode. Compiles to FPGA are only supported on fpga_compile nodes. Executing programs on FPGA hardware is only supported on fpga_runtime nodes of the appropriate type, such as fpga_runtime:arria10 or fpga_runtime:stratix10. Neither compiling nor executing programs on FPGA hardware are supported on the login nodes. For more information, see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/documentation/base-toolkit/](https://devcloud.intel.com/oneapi/documentation/base-toolkit/)).
+
+When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h.
+
+
+### Using Visual Studio Code* (Optional)
+
+You can use Visual Studio Code (VS Code) extensions to set your environment, create launch configurations,
+and browse and download samples.
+
+The basic steps to build and run a sample using VS Code include:
+ - Download a sample using the extension **Code Sample Browser for Intel&reg; oneAPI Toolkits**.
+ - Configure the oneAPI environment with the extension **Environment Configurator for Intel&reg; oneAPI Toolkits**.
+ - Open a Terminal in VS Code (**Terminal>New Terminal**).
+ - Run the sample in the VS Code terminal using the instructions below.
+ - (Linux only) Debug your GPU application with GDB for Intel® oneAPI toolkits using the **Generate Launch Configurations** extension.
+
+To learn more about the extensions, see the
+[Using Visual Studio Code with Intel® oneAPI Toolkits User Guide](https://www.intel.com/content/www/us/en/develop/documentation/using-vs-code-with-intel-oneapi/top.html).
+
+
+After learning how to use the extensions for Intel oneAPI Toolkits, return to this readme for instructions on how to build and run a sample.
+
+### On a Linux* System
+
+1. Generate the `Makefile` by running `cmake`.
+ ```
+ mkdir build
+ cd build
+ ```
+ To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+ ```
+ cmake ..
+ ```
+ Alternatively, to compile for the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX), run `cmake` using the command:
+
+ ```
+ cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+ You can also compile for a custom FPGA platform. Ensure that the board support package is installed on your system. Then run `cmake` using the command:
+ ```
+ cmake .. -DFPGA_BOARD=<board-support-package>:<board-variant>
+ ```
+
+ **NOTE:** For the FPGA emulator target and the FPGA target, the device link method is used.
+2. Compile the design through the generated `Makefile`. The following build targets are provided:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ make fpga_emu
+ ```
+ * Compile for FPGA hardware (longer compile time, targets FPGA device):
+ ```
+ make fpga
+ ```
+3. (Optional) As the above hardware compile may take several hours to complete, FPGA precompiled binaries (compatible with Linux* Ubuntu* 18.04) can be downloaded <a href="https://iotdk.intel.com/fpga-precompiled-binaries/latest/fast_recompile.fpga.tar.gz" download>here</a>.
+
+### On a Windows* System
+
+1. Generate the `Makefile` by running `cmake`.
+ ```
+ mkdir build
+ cd build
+ ```
+ To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+ ```
+ cmake -G "NMake Makefiles" ..
+ ```
+ Alternatively, to compile for the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX), run `cmake` using the command:
+
+ ```
+ cmake -G "NMake Makefiles" .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+ You can also compile for a custom FPGA platform. Ensure that the board support package is installed on your system. Then run `cmake` using the command:
+ ```
+ cmake -G "NMake Makefiles" .. -DFPGA_BOARD=<board-support-package>:<board-variant>
+ ```
+
+2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ nmake fpga_emu
+ ```
+ * Compile for FPGA hardware (longer compile time, targets FPGA device):
+ ```
+ nmake fpga
+ ```
+
+> **Note**: The Intel® PAC with Intel Arria® 10 GX FPGA and Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX) do not support Windows*. Compiling to FPGA hardware on Windows* requires a third-party or custom Board Support Package (BSP) with Windows* support.
+
+> **Note**: If you encounter any issues with long paths when compiling under Windows*, you may have to create your ‘build’ directory in a shorter path, for example c:\samples\build. You can then run cmake from that directory, and provide cmake with the full path to your sample directory.
+
+### Troubleshooting
+If an error occurs, you can get more details by running `make` with
+the `VERBOSE=1` argument:
+``make VERBOSE=1``
+For more comprehensive troubleshooting, use the Diagnostics Utility for
+Intel® oneAPI Toolkits, which provides system checks to find missing
+dependencies and permissions errors.
+[Learn more](https://www.intel.com/content/www/us/en/develop/documentation/diagnostic-utility-user-guide/top.html).
+
+
+### In Third-Party Integrated Development Environments (IDEs)
+
+You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [FPGA Workflows on Third-Party IDEs for Intel&reg; oneAPI Toolkits](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-oneapi-dpcpp-fpga-workflow-on-ide.html).
+
+
+## Running the Sample
+
+ 1. Run the sample on the FPGA emulator (the kernel executes on the CPU):
+ ```
+ ./fast_recompile.fpga_emu (Linux)
+ fast_recompile.fpga_emu.exe (Windows)
+ ```
+2. Run the sample on the FPGA device:
+ ```
+ ./fast_recompile.fpga (Linux)
+ fast_recompile.fpga.exe (Windows)
+ ```
+
+### Example of Output
+```
+PASSED: results are correct
+```
+### Discussion of Results
+Try modifying `host.cpp` to produce a different output message. Then, perform a host-only recompile via the device link method to see how quickly the design is recompiled.
+
+## License
+Code samples are licensed under the MIT license. See
+[License.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/License.txt) for details.
+
+Third party program Licenses can be found here: [third-party-programs.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/third-party-programs.txt). \ No newline at end of file
diff --git a/simplex-dev/sample.json b/simplex-dev/sample.json
new file mode 100755
index 0000000..a67feba
--- /dev/null
+++ b/simplex-dev/sample.json
@@ -0,0 +1,41 @@
+{
+ "guid": "1457B49A-2CD3-48E5-B3A9-753EAD2D18F7",
+ "name": "Fast Recompile",
+ "categories": ["Toolkit/oneAPI Direct Programming/DPC++ FPGA/Getting Started Tutorials"],
+ "description": "An Intel® FPGA tutorial demonstrating how to separate the compilation of host and device code to save development time",
+ "toolchain": ["dpcpp"],
+ "os": ["linux", "windows"],
+ "targetDevice": ["FPGA"],
+ "builder": ["ide", "cmake"],
+ "languages": [{"cpp":{}}],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "dpcpp --version",
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make fpga_emu",
+ "./fast_recompile.fpga_emu"
+ ]
+ }
+ ],
+ "windows": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "dpcpp --version",
+ "cd ../../..",
+ "mkdir build",
+ "cd build",
+ "cmake -G \"NMake Makefiles\" ../Tutorials/GettingStarted/fast_recompile",
+ "nmake fpga_emu",
+ "fast_recompile.fpga_emu.exe"
+ ]
+ }
+ ]
+ },
+ "expertise": "Getting Started"
+}
diff --git a/simplex-dev/src/CMakeLists.txt b/simplex-dev/src/CMakeLists.txt
new file mode 100755
index 0000000..becd251
--- /dev/null
+++ b/simplex-dev/src/CMakeLists.txt
@@ -0,0 +1,95 @@
+set(DEVICE_SOURCE_FILE kernel.cpp)
+set(KERNEL_HEADER_FILE kernel.hpp)
+set(HOST_SOURCE_FILE host.cpp)
+set(TARGET_NAME simplexdev)
+set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
+set(FPGA_TARGET ${TARGET_NAME}.fpga)
+
+# FPGA board selection
+if(NOT DEFINED FPGA_BOARD)
+ set(FPGA_BOARD "intel_a10gx_pac:pac_a10")
+ message(STATUS "FPGA_BOARD was not specified.\
+ \nConfiguring the design to run on the default FPGA board ${FPGA_BOARD} (Intel(R) PAC with Intel Arria(R) 10 GX FPGA). \
+ \nPlease refer to the README for information on board selection.")
+else()
+ message(STATUS "Configuring the design to run on FPGA board ${FPGA_BOARD}")
+endif()
+
+# This is a Windows-specific flag that enables exception handling in host code
+if(WIN32)
+ set(WIN_FLAG "/EHsc")
+endif()
+
+# A DPC++ ahead-of-time (AoT) compile processes the device code in two stages.
+# 1. The "compile" stage compiles the device code to an intermediate representation (SPIR-V).
+# 2. The "link" stage invokes the compiler's FPGA backend before linking.
+# For this reason, FPGA backend flags must be passed as link flags in CMake.
+set(EMULATOR_COMPILE_FLAGS "-Wall ${WIN_FLAG} -fintelfpga -DFPGA_EMULATOR")
+set(EMULATOR_LINK_FLAGS "-fintelfpga")
+set(HARDWARE_COMPILE_FLAGS "-Wall ${WIN_FLAG} -fintelfpga")
+set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${FPGA_BOARD} ${USER_HARDWARE_FLAGS}")
+# use cmake -D USER_HARDWARE_FLAGS=<flags> to set extra flags for FPGA backend compilation
+
+###############################################################################
+### FPGA Emulator
+###############################################################################
+# To compile in a single command:
+# dpcpp -fintelfpga -DFPGA_EMULATOR host.cpp kernel.cpp -o fast_recompile.fpga_emu
+# CMake executes:
+# [compile] dpcpp -fintelfpga -DFPGA_EMULATOR -o host.cpp.o -c host.cpp
+# [compile] dpcpp -fintelfpga -DFPGA_EMULATOR -o kernel.cpp.o -c kernel.cpp
+# [link] dpcpp -fintelfpga host.cpp.o kernel.cpp.o -o fast_recompile.fpga_emu
+add_executable(${EMULATOR_TARGET} ${HOST_SOURCE_FILE} ${DEVICE_SOURCE_FILE})
+set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS "${EMULATOR_COMPILE_FLAGS}")
+set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS "${EMULATOR_LINK_FLAGS}")
+add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET})
+
+###############################################################################
+### Generate Report
+###############################################################################
+# To compile manually:
+# dpcpp -fintelfpga -Xshardware -Xsboard=<FPGA_BOARD> -fsycl-link=early host.cpp kernel.cpp -o fast_compile_report.a
+set(FPGA_EARLY_IMAGE ${TARGET_NAME}_report.a)
+# The compile output is not an executable, but an intermediate compilation result unique to DPC++.
+add_executable(${FPGA_EARLY_IMAGE} ${HOST_SOURCE_FILE} ${DEVICE_SOURCE_FILE})
+add_custom_target(report DEPENDS ${FPGA_EARLY_IMAGE})
+set_target_properties(${FPGA_EARLY_IMAGE} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS})
+set_target_properties(${FPGA_EARLY_IMAGE} PROPERTIES LINK_FLAGS "${HARDWARE_LINK_FLAGS} -fsycl-link=early")
+# fsycl-link=early stops the compiler after RTL generation, before invoking Quartus®
+
+
+###############################################################################
+### FPGA Hardware
+###############################################################################
+# To compile manually:
+# dpcpp -fintelfpga -c host.cpp -o host.o
+# dpcpp -fintelfpga -Xshardware -Xsboard=<FPGA_BOARD> -fsycl-link=image kernel.cpp -o dev_image.a
+# dpcpp -fintelfpga host.o dev_image.a -o fast_recompile.fpga
+
+if(WIN32)
+ set(FPGA_TARGET ${FPGA_TARGET}.exe)
+endif()
+add_custom_target(fpga DEPENDS ${FPGA_TARGET})
+set(HOST_OBJ "host.o")
+set(DEVICE_OBJ "dev.o")
+set(DEVICE_IMAGE_OBJ "dev_image.a")
+
+set(CMAKE_CXX_FLAGS_LIST "${CMAKE_CXX_FLAGS}")
+separate_arguments(CMAKE_CXX_FLAGS_LIST)
+set(HARDWARE_COMPILE_FLAGS_LIST "${HARDWARE_COMPILE_FLAGS}")
+separate_arguments(HARDWARE_COMPILE_FLAGS_LIST)
+set(HARDWARE_LINK_FLAGS_LIST "${HARDWARE_LINK_FLAGS}")
+separate_arguments(HARDWARE_LINK_FLAGS_LIST)
+
+
+add_custom_command(OUTPUT ${HOST_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS_LIST} ${HARDWARE_COMPILE_FLAGS_LIST} -c ${CMAKE_CURRENT_SOURCE_DIR}/${HOST_SOURCE_FILE} -o ${HOST_OBJ}
+ DEPENDS ${HOST_SOURCE_FILE} ${KERNEL_HEADER_FILE})
+
+add_custom_command(OUTPUT ${DEVICE_IMAGE_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS_LIST} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link=image ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} -o ${DEVICE_IMAGE_OBJ}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${KERNEL_HEADER_FILE})
+
+add_custom_command(OUTPUT ${FPGA_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS_LIST} -fintelfpga ${HOST_OBJ} ${DEVICE_IMAGE_OBJ} -o ${CMAKE_BINARY_DIR}/${FPGA_TARGET}
+ DEPENDS ${HOST_OBJ} ${DEVICE_IMAGE_OBJ})
diff --git a/simplex-dev/src/host.cpp b/simplex-dev/src/host.cpp
new file mode 100644
index 0000000..8d0507b
--- /dev/null
+++ b/simplex-dev/src/host.cpp
@@ -0,0 +1,123 @@
+//==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#include <iostream>
+#include <vector>
+#include <string>
+#include <type_traits>
+
+#include <CL/sycl.hpp>
+#include <sycl/ext/intel/fpga_extensions.hpp>
+
+// dpc_common.hpp can be found in the dev-utilities include folder.
+// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp
+#include "dpc_common.hpp"
+
+// This code sample demonstrates how to split the host and FPGA kernel code into
+// separate compilation units so that they can be separately recompiled.
+// Consult the README for a detailed discussion.
+// - host.cpp (this file) contains exclusively code that executes on the host.
+// - kernel.cpp contains almost exclusively code that executes on the device.
+// - kernel.hpp contains only the forward declaration of a function containing
+// the device code.
+#include "kernel.hpp"
+
+using namespace sycl;
+
+template <typename K>
+void printMatrix(std::vector<K> &vec, int col, std::string msg) {
+ std::cout << msg << ":" << std::endl << "[" << std::endl;
+ for (size_t i=0; i<vec.size(); ++i) {
+ std::cout << vec.at(i);
+ if (i<vec.size()-1 && vec.size() > 1) {
+ std::cout << ",\t";
+ }
+ if (i%col == col-1) {
+ std::cout << std::endl;
+ }
+ }
+ std::cout << "]" << std::endl;
+}
+
+template <typename K>
+void printVec(std::vector<K> &vec, std::string msg) {
+ std::cout << msg << ": ";
+ std::cout << "[";
+
+ for (size_t i=0; i<vec.size(); ++i) {
+ std::cout << vec.at(i);
+ if (i<vec.size()-1 && vec.size() > 1) {
+ std::cout << ", ";
+ }
+ }
+ std::cout << "]" << std::endl;
+}
+
+int main() {
+ std::vector<float> a = { 2, 1, 1, 1, 0, 0,
+ 1, 3, 2, 0, 1, 0,
+ 2, 1, 2, 0, 0, 1};
+
+ std::vector<float> c = {-6, -5, -4, 0, 0, 0};
+ std::vector<float> b = {180, 300, 240};
+
+ std::vector<int> resultFlags = {-1, -1, -1};
+
+ // Select either the FPGA emulator or FPGA device
+#if defined(FPGA_EMULATOR)
+ ext::intel::fpga_emulator_selector device_selector;
+#else
+ ext::intel::fpga_selector device_selector;
+#endif
+
+ try {
+ // Create a queue bound to the chosen device.
+ // If the device is unavailable, a SYCL runtime exception is thrown.
+ queue q(device_selector, dpc_common::exception_handler);
+
+
+ // make sure the device supports USM device allocations
+ device d = q.get_device();
+ if (!d.get_info<info::device::usm_device_allocations>()) {
+ std::cerr << "ERROR: The selected device does not support USM device"
+ << " allocations\n";
+ return 1;
+ }
+
+ printMatrix(a, 6, "a");
+ printVec(resultFlags, "result flags");
+ // The definition of this function is in a different compilation unit,
+ // so host and device code can be separately compiled.
+ double timePassed = RunKernel(q, a, b, c, resultFlags);
+
+ std::cout << "------------------------" << std::endl;
+ printMatrix(a, 6, "a");
+ printVec(resultFlags, "result flags");
+
+ std::cout << std::endl << std::endl;
+ std::cout << "timePassed: " << timePassed << std::endl;
+
+
+ } catch (exception const &e) {
+ // Catches exceptions in the host code
+ std::cerr << "Caught a SYCL host exception:\n" << e.what() << "\n";
+
+ // Most likely the runtime couldn't find FPGA hardware!
+ if (e.code().value() == CL_DEVICE_NOT_FOUND) {
+ std::cerr << "If you are targeting an FPGA, please ensure that your "
+ "system has a correctly configured FPGA board.\n";
+ std::cerr << "Run sys_check in the oneAPI root directory to verify.\n";
+ std::cerr << "If you are targeting the FPGA emulator, compile with "
+ "-DFPGA_EMULATOR.\n";
+ }
+ std::terminate();
+ }
+
+
+
+ std::cout << "done\n";
+ return 0;
+}
diff --git a/simplex-dev/src/kernel.cpp b/simplex-dev/src/kernel.cpp
new file mode 100644
index 0000000..86bd98e
--- /dev/null
+++ b/simplex-dev/src/kernel.cpp
@@ -0,0 +1,216 @@
+#include <sycl/ext/intel/fpga_extensions.hpp>
+#include <chrono>
+#include "kernel.hpp"
+
+
+template<typename T>
+SYCL_EXTERNAL bool checkOptimality(device_ptr<T> C, int size) {
+ bool isOptimal = false;
+ int positveValueCount = 0;
+ //check if the coefficients of the objective function are negative
+ for(int i=0; i<size;i++){
+ float value = C[i];
+ if(value >= 0){
+ positveValueCount++;
+ }
+ }
+ //if all the constraints are positive now,the table is optimal
+ if(positveValueCount == size){
+ isOptimal = true;
+ }
+ return isOptimal;
+}
+
+
+template<typename T>
+SYCL_EXTERNAL int findPivotColumn(device_ptr<T> C, int size) {
+ int location = 0;
+ float minimum = C[0];
+ for(int i=1; i<size; ++i) {
+ if(C[i]<minimum) {
+ minimum = C[i];
+ location = i;
+ }
+ }
+ return location;
+}
+
+
+//find the row with the pivot value.The least value item's row in the B array
+template<typename T>
+SYCL_EXTERNAL int findPivotRow(device_ptr<T> A, device_ptr<T> B, device_ptr<T> C, int pivotColumn, int rows, int cols, bool *isUnbounded) {
+ int negativeValueCount = 0;
+ for (int i = 0; i < rows; i++) {
+ // 2d to 1d array index mapping
+ int pivotColumnIndex = (i*cols)+pivotColumn;
+ if (A[pivotColumnIndex] <= 0) {
+ negativeValueCount += 1;
+ }
+ }
+ int location = 0;
+ //checking the unbound condition if all the values are negative ones
+ if (negativeValueCount == rows) {
+ *isUnbounded = true;
+ } else {
+ float minimum = 99999999.0;
+
+ for (int i = 0; i < rows; ++i) {
+ // 2d to 1d array index mapping
+ int pivotColumnIndex = (i*cols)+pivotColumn;
+ float tmpACols = A[pivotColumnIndex];
+ if (tmpACols > 0) {
+ float result = B[i] / tmpACols;
+ if (result > 0 && result < minimum) {
+ minimum = result;
+ location = i;
+ }
+ }
+ }
+ }
+ return location;
+}
+
+template<typename T>
+SYCL_EXTERNAL void doPivotting(device_ptr<T> A, device_ptr<T> B, device_ptr<T> C, int pivotRow, int pivotColumn, int rows, int cols) {
+ int columnIndex = (pivotRow*cols)+pivotColumn;
+ float pivetValue = A[columnIndex];
+
+ float pivotRowVals[6]; //the column with the pivot
+ float pivotColVals[3]; //the row with the pivot
+ float rowNew[6]; //the row after processing the pivot value
+
+ float maximum = 0;
+ maximum = maximum-(C[pivotColumn]*(B[pivotRow]/pivetValue)); //set the maximum step by step
+
+
+ //get the row that has the pivot value
+ for (int i = 0; i < cols; ++i) {
+ int pivotRowIndex = (pivotRow*cols)+i;
+ pivotRowVals[i] = A[pivotRowIndex];
+ }
+ //get the column that has the pivot value
+ for (int j = 0; j < rows; ++j) {
+ int pivotColIndex = (j*cols)+pivotColumn;
+ pivotColVals[j] = A[pivotColIndex];
+ }
+
+ //set the row values that has the pivot value divided by the pivot value and put into new row
+ for (int k = 0; k < cols; ++k) {
+ rowNew[k] = pivotRowVals[k]/pivetValue;
+ }
+
+ B[pivotRow] = B[pivotRow]/pivetValue;
+
+ //process the other coefficients in the A array by subtracting
+ for (int m=0; m < rows; ++m) {
+ //ignore the pivot row as we already calculated that
+ if (m != pivotRow) {
+ for (int p = 0; p<cols; ++p) {
+ float multiplyValue = pivotColVals[m];
+ int indexA_M_P = (m*cols)+p;
+ A[indexA_M_P] = A[indexA_M_P] - (multiplyValue * rowNew[p]);
+ //C[p] = C[p] - (multiplyValue*C[pivotRow]);
+ //B[i] = B[i] - (multiplyValue*B[pivotRow]);
+ }
+
+ }
+ }
+
+ //process the values of the B array
+ for (int i = 0; i<rows; ++i) { // rows = B.size()
+ if (i != pivotRow) {
+ float multiplyValue = pivotColVals[i];
+ B[i] = B[i]-(multiplyValue*B[pivotRow]);
+
+ }
+ }
+ //the least coefficient of the constraints of the objective function
+ float multiplyValue = C[pivotColumn];
+ //process the C array
+ for (int i = 0; i < C.size(); i++) {
+ C[i] = C[i]-(multiplyValue * rowNew[i]);
+
+ }
+
+ //replacing the pivot row in the new calculated A array
+ for (int i = 0; i<cols; ++i) {
+ int indexA_pivotRow_i = (pivotRow*cols)+i;
+ A[indexA_pivotRow_i] = rowNew[i];
+ }
+}
+
+// Forward declare the kernel names in the global scope. This FPGA best practice
+// reduces compiler name mangling in the optimization reports.
+class SimplexCalc;
+
+double RunKernel(queue &q, std::vector<T> &inAHost, std::vector<T> &inBHost,
+ std::vector<T> &inCHost, std::vector<int>& resultFlags) {
+
+ int rowSizeA = inBHost.size();
+ int colSizeA = inCHost.size();
+
+ T *inADevice = malloc_device<T> (inAHost.size(), q);
+ T *inBDevice = malloc_device<T> (inBHost.size(), q);
+ T *inCDevice = malloc_device<T> (inCHost.size(), q);
+ int *inResultFlagsDevice = malloc_device<int> (resultFlags.size(), q);
+
+ if (inADevice == nullptr) {
+ std::cerr << "ERROR: failed to allocate space for 'inADevice'\n";
+ std::terminate();
+ }
+ if (inBDevice == nullptr) {
+ std::cerr << "ERROR: failed to allocate space for 'inBDevice'\n";
+ std::terminate();
+ }
+ if (inCDevice == nullptr) {
+ std::cerr << "ERROR: failed to allocate space for 'inCDevice'\n";
+ std::terminate();
+ }
+
+ auto start = std::chrono::high_resolution_clock::now();
+
+ q.memcpy(inADevice, inAHost.data(), inAHost.size()*sizeof(T)).wait();
+ q.memcpy(inBDevice, inBHost.data(), inBHost.size() * sizeof(T)).wait();
+ q.memcpy(inCDevice, inCHost.data(), inCHost.size() * sizeof(T)).wait();
+ q.memcpy(inResultFlagsDevice, resultFlags.data(), resultFlags.size()*sizeof(int)).wait();
+
+ q.submit([&](handler &h) {
+ h.single_task < SimplexCalc > ([=]()[[intel::kernel_args_restrict]] {
+ device_ptr<T> inA(inADevice);
+ device_ptr<T> inB(inBDevice);
+ device_ptr<T> inC(inCDevice);
+ device_ptr<int> inResultFlags(inResultFlagsDevice);
+
+ bool tempIsOptimizal = checkOptimality(inC, colSizeA);
+ if (tempIsOptimizal) {
+ inResultFlags[0] = 1;
+ return;
+ } else {
+ inResultFlags[0] = 0;
+ }
+
+ int pivotColumn = findPivotColumn(inC, colSizeA);
+ inResultFlags[1] = pivotColumn;
+
+ inA[0] = 43.0; //test only
+ bool isUnbounded = true;
+ int pivotRow = findPivotRow(inA, inB, inC, pivotColumn, rowSizeA, colSizeA, &isUnbounded);
+ inResultFlags[2] = pivotRow;
+ });
+ }).wait();
+
+ q.memcpy(inAHost.data(), inADevice, inAHost.size()*sizeof(T));
+ q.memcpy(inBHost.data(), inBDevice, inBHost.size()*sizeof(T));
+ q.memcpy(inCHost.data(), inCDevice, inCHost.size()*sizeof(T));
+ q.memcpy(resultFlags.data(), inResultFlagsDevice, resultFlags.size()*sizeof(int));
+
+ auto end = std::chrono::high_resolution_clock::now();
+ std::chrono::duration<double, std::milli> diff = end - start;
+
+ sycl::free(inADevice, q);
+ sycl::free(inBDevice, q);
+ sycl::free(inCDevice, q);
+ sycl::free(inResultFlagsDevice, q);
+
+ return diff.count();
+}
diff --git a/simplex-dev/src/kernel.hpp b/simplex-dev/src/kernel.hpp
new file mode 100755
index 0000000..77b8ca7
--- /dev/null
+++ b/simplex-dev/src/kernel.hpp
@@ -0,0 +1,13 @@
+//==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+#include <CL/sycl.hpp>
+
+using namespace sycl;
+
+typedef float T;
+
+double RunKernel(queue &q, std::vector<T> &inAHost, std::vector<T> &inBHost,
+ std::vector<T> &inCHost, std::vector<int>& resultFlags);