Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,45 @@
driver/xrt/src/m2m
driver/xrt/build
.vscode
<<<<<<< HEAD
=======
coyote_build*
*xcu55c-fsvh2892-2L-e*
*xcu280-fsvh2892-2L-e*
sol1/
*_prj
*.gen
*.ip_user_files
*.cache
*.srcs
**/fpga_ips.txt

CMakeFiles/
CMakeCache.txt
CMakeDoxy*
*.cmake
accl_on_coyote
*.xsa
accl_log/
*.xci
packaged_kernel/
*.hw
*.xml
*.xpr
*/xrt/_deps/
>>>>>>> dev

# HLS-generated files to ignore
sol1/
sources_1/
packaged_kernel/
*.app
*.xsa
*.xpr
*.lpr
*.wpc
<<<<<<< HEAD
=======
.run/
_deps/
>>>>>>> dev
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ initialize_accl(std::vector<ACCL::rank_t> &ranks, int local_rank,
xrt::device device = xrt::device(),
std::filesystem::path xclbin = "", unsigned int nbufs = 16,
unsigned int bufsize = 1024, unsigned int egrsize = 0,
bool rsfec = false);
bool rsfec = false, bool eagerRx_host = false);

// Configure the VNX kernel, this function is called by initialize_accl
void configure_vnx(vnx::CMAC &cmac, vnx::Networklayer &network_layer,
Expand Down
4 changes: 2 additions & 2 deletions driver/utils/accl_network_utils/src/accl_network_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,7 @@ std::unique_ptr<ACCL::ACCL>
initialize_accl(std::vector<rank_t> &ranks, int local_rank,
bool simulator, acclDesign design, xrt::device device,
fs::path xclbin, unsigned int nbufs, unsigned int bufsize,
unsigned int egrsize, bool rsfec) {
unsigned int egrsize, bool rsfec, bool eagerRx_host) {
std::size_t world_size = ranks.size();
std::unique_ptr<ACCL::ACCL> accl;

Expand Down Expand Up @@ -519,7 +519,7 @@ initialize_accl(std::vector<rank_t> &ranks, int local_rank,

accl = std::make_unique<ACCL::ACCL>(device, cclo_ip, hostctrl_ip, devicemem, rxbufmem);
}
accl.get()->initialize(ranks, local_rank, nbufs, bufsize, egrsize, std::min(nbufs*bufsize, (unsigned int)4*1024*1024));
accl.get()->initialize(ranks, local_rank, nbufs, bufsize, egrsize, std::min(nbufs*bufsize, (unsigned int)4*1024*1024), eagerRx_host);
return accl;
}
} // namespace accl_network_utils
4 changes: 2 additions & 2 deletions driver/xrt/include/accl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ class ACCL {
*/
void initialize(const std::vector<rank_t> &ranks, int local_rank,
int n_egr_rx_bufs = 16, addr_t egr_rx_buf_size = 1024,
addr_t max_egr_size = 1024, addr_t max_rndzv_size = 32*1024);
addr_t max_egr_size = 1024, addr_t max_rndzv_size = 32*1024, bool rxEager_host = false);

/**
* Get the return code of the last ACCL call.
Expand Down Expand Up @@ -1101,7 +1101,7 @@ ACCLRequest *barrier(communicatorId comm_id = GLOBAL_COMM,
void configure_arithmetic();

void setup_eager_rx_buffers(size_t n_egr_rx_bufs, addr_t egr_rx_buf_size,
const std::vector<int> &devicemem);
const std::vector<int> &devicemem, bool host=false);
void setup_eager_rx_buffers(size_t n_egr_rx_bufs, addr_t egr_rx_buf_size, int devicemem) {
std::vector<int> mems = {devicemem};
return setup_eager_rx_buffers(n_egr_rx_bufs, egr_rx_buf_size, mems);
Expand Down
81 changes: 56 additions & 25 deletions driver/xrt/src/accl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,6 @@ ACCLRequest *ACCL::send(BaseBuffer &srcbuf, unsigned int count,
if (from_fpga == false) {
srcbuf.sync_to_device();
}

options.scenario = operation::send;
options.comm = communicators[comm_id].communicators_addr();
options.addr_0 = &srcbuf;
Expand All @@ -143,7 +142,6 @@ ACCLRequest *ACCL::send(BaseBuffer &srcbuf, unsigned int count,
wait(handle);
check_return_value("send", handle);
}

return handle;
}

Expand Down Expand Up @@ -262,7 +260,6 @@ ACCLRequest *ACCL::recv(BaseBuffer &dstbuf, unsigned int count,
}
check_return_value("recv", handle);
}

return handle;
}

Expand Down Expand Up @@ -302,11 +299,9 @@ ACCLRequest *ACCL::copy(BaseBuffer *srcbuf, BaseBuffer *dstbuf, unsigned int cou
"sync_from_device() after waiting"
<< std::endl;
}

if (from_fpga == false) {
srcbuf->sync_to_device();
}

options.scenario = operation::copy;
options.addr_0 = srcbuf;
options.addr_2 = dstbuf;
Expand All @@ -316,15 +311,13 @@ ACCLRequest *ACCL::copy(BaseBuffer *srcbuf, BaseBuffer *dstbuf, unsigned int cou
options.stream_flags = stream_flags;
options.waitfor = waitfor;
ACCLRequest *handle = call_async(options);

if (!run_async) {
wait(handle);
if (to_fpga == false) {
dstbuf->sync_from_device();
}
check_return_value("copy", handle);
}

return handle;
}

Expand Down Expand Up @@ -1011,28 +1004,36 @@ std::string ACCL::dump_eager_rx_buffers(size_t n_egr_rx_bufs, bool dump_data) {
address += 4;
val_t addrh = cclo->read(address);
address += 4;
val_t max_len = cclo->read(address);
address += 4;
val_t rxtag = cclo->read(address);
address += 4;
val_t rxlen = cclo->read(address);
address += 4;
val_t rxsrc = cclo->read(address);
address += 4;
val_t seq = cclo->read(address);
address += 4;
val_t hostBit = cclo->read(address);

stream << "Spare RX Buffer " << i << ":\t address: 0x" << std::hex
<< addrh * (1UL << 32) + addrl << std::dec
<< " \t status: " << status << " \t occupancy: " << rxlen << "/"
<< maxsize << " \t MPI tag: " << std::hex << rxtag << std::dec
<< " \t seq: " << seq << " \t src: " << rxsrc;
<< " \t seq: " << seq << " \t src: " << rxsrc
<< " \t hostBit: " << hostBit;

if(dump_data) {
eager_rx_buffers[i]->sync_from_device();
//add if else, to check if is host or not and sync accordingly
if(!(hostBit && cclo->get_device_type() == CCLO::coyote_device)){
eager_rx_buffers[i]->sync_from_device();
}

stream << " \t data: " << std::hex << "[";
for (size_t j = 0; j < eager_rx_buffers[i]->size(); ++j) {
stream << "0x"
<< static_cast<uint16_t>(static_cast<uint8_t *>(
eager_rx_buffers[i]->byte_array())[j]);
<< static_cast<uint16_t>(static_cast<uint8_t *>(
eager_rx_buffers[i]->byte_array())[j]);
if (j != eager_rx_buffers[i]->size() - 1) {
stream << ", ";
}
Expand Down Expand Up @@ -1065,7 +1066,7 @@ void ACCL::parse_hwid(){

void ACCL::initialize(const std::vector<rank_t> &ranks, int local_rank,
int n_egr_rx_bufs, addr_t egr_rx_buf_size,
addr_t max_egr_size, addr_t max_rndzv_size) {
addr_t max_egr_size, addr_t max_rndzv_size, bool rxEager_host) {

parse_hwid();

Expand All @@ -1077,7 +1078,7 @@ void ACCL::initialize(const std::vector<rank_t> &ranks, int local_rank,
}

debug("Configuring Eager RX Buffers");
setup_eager_rx_buffers(n_egr_rx_bufs, egr_rx_buf_size, rxbufmem);
setup_eager_rx_buffers(n_egr_rx_bufs, egr_rx_buf_size, rxbufmem, rxEager_host);

debug("Configuring Rendezvous Spare Buffers");
setup_rendezvous_spare_buffers(max_rndzv_size, rxbufmem);
Expand Down Expand Up @@ -1129,23 +1130,40 @@ addr_t ACCL::get_arithmetic_config_addr(std::pair<dataType, dataType> id) {
}

void ACCL::setup_eager_rx_buffers(size_t n_egr_rx_bufs, addr_t egr_rx_buf_size,
const std::vector<int> &devicemem) {
const std::vector<int> &devicemem, bool host) {
addr_t address = CCLO_ADDR::EGR_RX_BUF_SIZE_OFFSET;
eager_rx_buffer_size = egr_rx_buf_size;
for (size_t i = 0; i < n_egr_rx_bufs; ++i) {
// create, clear and sync buffers to device
Buffer<int8_t> *buf;

if (sim_mode) {
buf = new SimBuffer(new int8_t[eager_rx_buffer_size](), eager_rx_buffer_size, dataType::int8,
if(host){
buf = new SimBuffer(new int8_t[eager_rx_buffer_size](), eager_rx_buffer_size, dataType::int8,
static_cast<SimDevice *>(cclo)->get_context(), true, ACCL_SIM_DEFAULT_BANK);
}else{
buf = new SimBuffer(new int8_t[eager_rx_buffer_size](), eager_rx_buffer_size, dataType::int8,
static_cast<SimDevice *>(cclo)->get_context());
}
} else if(cclo->get_device_type() == CCLO::xrt_device ){
buf = new XRTBuffer<int8_t>(eager_rx_buffer_size, dataType::int8, *(static_cast<XRTDevice *>(cclo)->get_device()), devicemem[i % devicemem.size()]);
if(host){
//TODO: how to define host buffers in XRT?
buf = new XRTBuffer<int8_t>(eager_rx_buffer_size, dataType::int8, *(static_cast<XRTDevice *>(cclo)->get_device()), devicemem[i % devicemem.size()]);
}else{
buf = new XRTBuffer<int8_t>(eager_rx_buffer_size, dataType::int8, *(static_cast<XRTDevice *>(cclo)->get_device()), devicemem[i % devicemem.size()]);
}
} else if(cclo->get_device_type() == CCLO::coyote_device){
buf = new CoyoteBuffer<int8_t>(eager_rx_buffer_size, dataType::int8, static_cast<CoyoteDevice *>(cclo));
if(host){
//buffers in coyote per default on host
buf = new CoyoteBuffer<int8_t>(eager_rx_buffer_size, dataType::int8, static_cast<CoyoteDevice *>(cclo));
}else{
buf = new CoyoteBuffer<int8_t>(eager_rx_buffer_size, dataType::int8, static_cast<CoyoteDevice *>(cclo));
}
}
//add if else as well, test for coyote backend + eager on host
if(!(host && cclo->get_device_type() == CCLO::coyote_device)){
buf->sync_to_device();
}

buf->sync_to_device();
eager_rx_buffers.emplace_back(buf);
// program this buffer into the accelerator
address += 4;
Expand All @@ -1155,10 +1173,18 @@ void ACCL::setup_eager_rx_buffers(size_t n_egr_rx_bufs, addr_t egr_rx_buf_size,
address += 4;
cclo->write(address, (buf->address() >> 32) & 0xffffffff);
// clear remaining 4 fields
for (size_t j = 0; j < 4; ++j) {
for (size_t j = 0; j < 5; ++j) {
address += 4;
cclo->write(address, 0);
}
//set the host flag
// NOTE: the host flag is set to true if the buffer is a host buffer
address += 4;
if(host){
cclo->write(address, 1); // set host flag
}else{
cclo->write(address, 0); // set host flag
}
}

//write buffer len
Expand All @@ -1184,7 +1210,7 @@ void ACCL::setup_rendezvous_spare_buffers(addr_t rndzv_spare_buf_size, const std
} else if(cclo->get_device_type() == CCLO::coyote_device){
buf = new CoyoteBuffer<int8_t>(max_rndzv_msg_size, dataType::int8, static_cast<CoyoteDevice *>(cclo));
}
buf->sync_to_device();
//buf->sync_to_device();
utility_spares.emplace_back(buf);
}
cclo->write(CCLO_ADDR::SPARE1_OFFSET, utility_spares.at(0)->address() & 0xffffffff);
Expand Down Expand Up @@ -1246,7 +1272,9 @@ void ACCL::prepare_call(CCLO::Options &options) {
}
else {
dtypes.insert(options.addr_0->type());
if(options.addr_0->is_host_only()) options.host_flags |= hostFlags::OP0_HOST;
if(options.addr_0->is_host_only()){
options.host_flags |= hostFlags::OP0_HOST;
}
}

if (options.addr_1 == nullptr) {
Expand All @@ -1255,7 +1283,9 @@ void ACCL::prepare_call(CCLO::Options &options) {
}
else {
dtypes.insert(options.addr_1->type());
if(options.addr_1->is_host_only()) options.host_flags |= hostFlags::OP1_HOST;
if(options.addr_1->is_host_only()) {
options.host_flags |= hostFlags::OP1_HOST;
}
}

if (options.addr_2 == nullptr) {
Expand All @@ -1264,7 +1294,9 @@ void ACCL::prepare_call(CCLO::Options &options) {
}
else {
dtypes.insert(options.addr_2->type());
if(options.addr_2->is_host_only()) options.host_flags |= hostFlags::RES_HOST;
if(options.addr_2->is_host_only()) {
options.host_flags |= hostFlags::RES_HOST;
}
}

dtypes.erase(dataType::none);
Expand Down Expand Up @@ -1351,7 +1383,6 @@ void ACCL::prepare_call(CCLO::Options &options) {
}
}
}

options.arithcfg_addr = arithcfg->addr();
}

Expand Down
2 changes: 1 addition & 1 deletion driver/xrt/src/coyotedevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ void CoyoteRequest::start() {
function = static_cast<int>(options.reduce_function);
}
uint32_t flags = static_cast<uint32_t>(options.host_flags) << 8 | static_cast<uint32_t>(options.stream_flags);

auto coyote_proc = reinterpret_cast<ACCL::CoyoteDevice *>(cclo())->get_device();

if ((coyote_proc->getCSR((OFFSET_HOSTCTRL + HOSTCTRL_ADDR::AP_CTRL)>>2) & 0x4) == 0) { // read AP_CTRL and check bit 3 (the idle bit)
Expand Down Expand Up @@ -251,6 +250,7 @@ void CoyoteRequest::start() {
}
case ACCL::operation::config:{
coyote_proc->setCSR(static_cast<uint32_t>(options.scenario), (OFFSET_HOSTCTRL + HOSTCTRL_ADDR::SCEN)>>2);
coyote_proc->setCSR(static_cast<uint32_t>(options.count), (OFFSET_HOSTCTRL + HOSTCTRL_ADDR::LEN)>>2);
coyote_proc->setCSR(static_cast<uint32_t>(function), (OFFSET_HOSTCTRL + HOSTCTRL_ADDR::FUNCTION_R)>>2);
//coyote_proc->setCSR(static_cast<uint32_t>(flags), (OFFSET_HOSTCTRL + HOSTCTRL_ADDR::STREAM_FLAGS)>>2); //safe to delete?
break;
Expand Down
6 changes: 5 additions & 1 deletion driver/xrt/src/simdevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <cassert>
#include <future>
#include "zmq_client.h"
#include <bitset>

static void finish_sim_request(ACCL::SimRequest *req) {
ACCL::SimDevice *cclo = reinterpret_cast<ACCL::SimDevice *>(req->cclo());
Expand All @@ -43,6 +44,8 @@ void SimRequest::start() {
options.addr_0->sync_bo_to_device();
options.addr_1->sync_bo_to_device();
options.addr_2->sync_bo_to_device();
std::cout << "SimRequest::start: addr_0: " << options.addr_0->address() << std::endl;
std::cout << "SimRequest::start: addr_2: " << options.addr_2->address() << std::endl;

if (options.scenario == operation::config) {
function = static_cast<int>(options.cfg_function);
Expand All @@ -51,6 +54,7 @@ void SimRequest::start() {
}

uint32_t flags = static_cast<uint32_t>(options.host_flags) << 8 | static_cast<uint32_t>(options.stream_flags);
std::cout << "host flags " << std::bitset<32>(static_cast<uint32_t>(options.host_flags)) << " shifted: " << std::bitset<32>(static_cast<uint32_t>(options.host_flags)<<8) << std::endl;

zmq_client_startcall(
reinterpret_cast<SimDevice *>(cclo_ptr)->get_context(),
Expand Down Expand Up @@ -164,7 +168,7 @@ void SimDevice::write(addr_t offset, val_t val) {

CCLO::deviceType SimDevice::get_device_type()
{
std::cout<<"get_device_type: sim_device"<<std::endl;
//std::cout<<"get_device_type: sim_device"<<std::endl;
return CCLO::sim_device;
}
void SimDevice::launch_request() {
Expand Down
2 changes: 2 additions & 0 deletions driver/xrt/src/xrtdevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "accl/common.hpp"
#include <future>
#include <cassert>
#include <bitset>

static void finish_fpga_request(ACCL::FPGARequest *req) {
req->wait_kernel();
Expand All @@ -44,6 +45,7 @@ void FPGARequest::start() {
function = static_cast<int>(options.reduce_function);
}
uint32_t flags = static_cast<uint32_t>(options.host_flags) << 8 | static_cast<uint32_t>(options.stream_flags);
std::cout << "host flags xrt " << std::bitset<32>(static_cast<uint32_t>(options.host_flags)) << " shifted: " << std::bitset<32>(static_cast<uint32_t>(options.host_flags)<<8) << std::endl;
switch(options.scenario) {
case ACCL::operation::copy:
run.set_arg(ACCL::XRT_ARG_ID::SCENARIO_ID, static_cast<uint32_t>(options.scenario));
Expand Down
Loading