Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,11 @@
driver/xrt/src/m2m
driver/xrt/build
.vscode
coyote_build*
*xcu55c-fsvh2892-2L-e*
*_prj
*.gen
*.ip_user_files
*.cache
*.srcs
**/fpga_ips.txt
2 changes: 2 additions & 0 deletions driver/xrt/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ set(ACCL_DOCS_RST
)

set(EN_COYOTE ON)
set(EN_AVX 1 CACHE STRING "AVX environment.")
if(EN_COYOTE)
message("Enable Coyote")
set(ACCL_HEADERS
Expand All @@ -88,6 +89,7 @@ if(EN_COYOTE)
file(GLOB COYOTE_SOURCE "${COYOTE_SOURCE_PATH}/*.cpp")

if(EN_AVX)
add_definitions(-DEN_AVX)
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -mavx -march=native -O3")
else()
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -march=native -O1")
Expand Down
4 changes: 2 additions & 2 deletions driver/xrt/include/accl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ class ACCL {
*/
void initialize(const std::vector<rank_t> &ranks, int local_rank,
int n_egr_rx_bufs = 16, addr_t egr_rx_buf_size = 1024,
addr_t max_egr_size = 1024, addr_t max_rndzv_size = 32*1024);
addr_t max_egr_size = 1024, addr_t max_rndzv_size = 32*1024, bool rxEager_host = false);

/**
* Get the return code of the last ACCL call.
Expand Down Expand Up @@ -1101,7 +1101,7 @@ ACCLRequest *barrier(communicatorId comm_id = GLOBAL_COMM,
void configure_arithmetic();

void setup_eager_rx_buffers(size_t n_egr_rx_bufs, addr_t egr_rx_buf_size,
const std::vector<int> &devicemem);
const std::vector<int> &devicemem, bool host=false);
void setup_eager_rx_buffers(size_t n_egr_rx_bufs, addr_t egr_rx_buf_size, int devicemem) {
std::vector<int> mems = {devicemem};
return setup_eager_rx_buffers(n_egr_rx_bufs, egr_rx_buf_size, mems);
Expand Down
27 changes: 17 additions & 10 deletions driver/xrt/include/accl/coyotebuffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#pragma once
#include "buffer.hpp"
#include "common.hpp"
#include "cProcess.hpp"
#include "cThread.hpp"
#include "cDefs.hpp"
#include "coyotedevice.hpp"
#include <cstdlib>
Expand Down Expand Up @@ -57,14 +57,13 @@ template <typename dtype> class CoyoteBuffer : public Buffer<dtype> {
size_t page_size = 1ULL << 21;
this->buffer_size = length * sizeof(dtype);
this->n_pages = (buffer_size + page_size - 1) / page_size;
std::cerr << "CoyoteBuffer contructor called! page_size:"<<page_size<<", buffer_size:"<<buffer_size<<",n_pages:"<<n_pages<< std::endl;

this->aligned_buffer = (dtype *)this->device->coyote_proc->getMem({fpga::CoyoteAlloc::HUGE_2M, n_pages});
this->aligned_buffer = (dtype *)this->device->coyote_proc->getMem({coyote::CoyoteAllocType::HPF, this->buffer_size, true});

this->update_buffer(this->aligned_buffer, (addr_t)this->aligned_buffer);

std::cerr << "Allocation successful! Allocated buffer: "<<std::setbase(16)<<(uint64_t)this->aligned_buffer << std::setbase(10) <<", Size: " << this->_size << std::endl;

//buffers in coyote per default on host memory
host_flag = true;


Expand Down Expand Up @@ -112,22 +111,30 @@ template <typename dtype> class CoyoteBuffer : public Buffer<dtype> {
*/
void sync_from_device() override
{
  // Debug trace: buffer address printed in hex, size() printed in decimal.
  std::cerr << "sync_from_device at address: " << std::setbase(16) << (uint64_t)this->aligned_buffer << ", size: " << std::setbase(10) << this->size() << std::endl;

  // Build the sync descriptor for the Coyote thread API. It is zeroed first
  // so that every field other than addr/len is handed over as 0.
  // NOTE(review): len is taken from size(); presumably a byte count -- confirm
  // against Buffer::size().
  coyote::syncSg sg;
  memset(&sg, 0, sizeof(coyote::syncSg));
  sg.addr = this->aligned_buffer;
  sg.len = this->size();

  // Single LOCAL_SYNC request through the cThread handle. The former
  // fpga::CoyoteOper::SYNC invocation is intentionally gone: issuing both
  // would run the transfer twice through two incompatible APIs.
  this->device->coyote_proc->invoke(coyote::CoyoteOper::LOCAL_SYNC, sg);

  // Record that the buffer contents are now considered to be on the host.
  this->host_flag = true;
}

/**
 * Sync the data from the host to the device.
 *
 * Logs the buffer address and size to std::cerr, issues one LOCAL_OFFLOAD
 * request for the whole buffer through the device's Coyote thread handle,
 * and then clears host_flag.
 */
void sync_to_device() override
{
  // Debug trace: buffer address printed in hex, size() printed in decimal.
  std::cerr << "sync_to_device at address: " << std::setbase(16) << (uint64_t)this->aligned_buffer << ", size: " << std::setbase(10) << this->size() << std::endl;

  // Build the sync descriptor for the Coyote thread API. It is zeroed first
  // so that every field other than addr/len is handed over as 0.
  // NOTE(review): len is taken from size(); presumably a byte count -- confirm
  // against Buffer::size().
  coyote::syncSg sg;
  memset(&sg, 0, sizeof(coyote::syncSg));
  sg.addr = this->aligned_buffer;
  sg.len = this->size();

  // Single LOCAL_OFFLOAD request through the cThread handle. The former
  // fpga::CoyoteOper::OFFLOAD invocation is intentionally gone: issuing both
  // would run the transfer twice through two incompatible APIs.
  this->device->coyote_proc->invoke(coyote::CoyoteOper::LOCAL_OFFLOAD, sg);

  // Record that the buffer contents are no longer considered host-resident.
  this->host_flag = false;
}
Expand All @@ -144,7 +151,7 @@ template <typename dtype> class CoyoteBuffer : public Buffer<dtype> {
// }
// }

std::cerr << "Free user buffer from cProc cPid:"<< std::setbase(10)<<this->device->coyote_proc->getCpid()<<", buffer_size:"<<buffer_size<<","<<std::setbase(16) << (uint64_t)this->aligned_buffer<<std::endl;
std::cerr << "Free user buffer from cProc cPid:"<< std::setbase(10)<<this->device->coyote_proc->getCtid()<<", buffer_size:"<<buffer_size<<","<<std::setbase(16) << (uint64_t)this->aligned_buffer<<std::endl;
this->device->coyote_proc->freeMem(this->aligned_buffer);
return;
}
Expand Down
12 changes: 6 additions & 6 deletions driver/xrt/include/accl/coyotedevice.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
#include "acclrequest.hpp"
#include "cclo.hpp"
#include "constants.hpp"
#include "cProcess.hpp"
#include "ibvQpConn.hpp"
#include "ibvStructs.hpp"
#include "cThread.hpp"
// #include "ibvQpConn.hpp"
// #include "ibvStructs.hpp"
#include <string>
#include <iostream>
#include <fstream>
Expand Down Expand Up @@ -108,7 +108,7 @@ class CoyoteDevice : public CCLO {

void printDebug() override;

fpga::cProcess* get_device(){
coyote::cThread* get_device(){
return coyote_proc;
}

Expand All @@ -120,13 +120,13 @@ class CoyoteDevice : public CCLO {

val_t get_retcode(ACCLRequest *request) override;

fpga::cProcess* coyote_proc;
coyote::cThread* coyote_proc;

// RDMA related
// RDMA requires multiple processes to establish queue pairs
// The CCLO kernel is still managed by coyote_proc
unsigned int num_qp;
std::vector<fpga::cProcess*> coyote_qProc_vec;
std::vector<coyote::cThread*> coyote_qProc_vec;
private:
const size_t OFFSET_CCLO = 0x0;

Expand Down
Loading