/*
 * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */

/**
 * @file cufile.h
 * @brief  cuFile C APIs
 *
 * This file contains all the C APIs to perform GPUDirect Storage supported IO operations
 */

#if __cplusplus
extern "C"
{
#endif

/// @cond DOXYGEN_SKIP_MACRO
#ifndef __CUFILE_H_
#define __CUFILE_H_

#include <stdlib.h>
#include <stdbool.h>

#include <cuda.h>
#include <arpa/inet.h>
#include <sys/socket.h>

/* Base value for cuFile status codes: every non-success row below is CUFILEOP_BASE_ERR + n. */
#define CUFILEOP_BASE_ERR 5000

/*
 * X-macro table of cuFile status codes.
 *
 * Each row has the form CUFILE_OP(code, name, description). The table is
 * expanded with a temporary definition of CUFILE_OP: once to produce the
 * enumerators of CUfileOpError (name = code) and once to produce the switch
 * in cufileop_status_error(), where the description is stringified.
 *
 * The description is raw preprocessor text, not a string literal, so it must
 * not contain a comma (that would split it into an extra macro argument).
 *
 * Offsets 21 and 32 have no row in this table; the numbering is not contiguous.
 *
 * Note: Data path errors are captured via standard error codes.
 */
#define CUFILEOP_STATUS_ENTRIES \
	CUFILE_OP(0,                      CU_FILE_SUCCESS, cufile success) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 1,  CU_FILE_DRIVER_NOT_INITIALIZED, nvidia-fs driver is not loaded. Set allow_compat_mode to true in cufile.json file to enable compatible mode) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 2,  CU_FILE_DRIVER_INVALID_PROPS, invalid property) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 3,  CU_FILE_DRIVER_UNSUPPORTED_LIMIT, property range error) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 4,  CU_FILE_DRIVER_VERSION_MISMATCH, nvidia-fs driver version mismatch) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 5,  CU_FILE_DRIVER_VERSION_READ_ERROR, nvidia-fs driver version read error) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 6,  CU_FILE_DRIVER_CLOSING, driver shutdown in progress) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 7,  CU_FILE_PLATFORM_NOT_SUPPORTED, GPUDirect Storage not supported on current platform) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 8,  CU_FILE_IO_NOT_SUPPORTED, GPUDirect Storage not supported on current file) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 9,  CU_FILE_DEVICE_NOT_SUPPORTED, GPUDirect Storage not supported on current GPU) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 10, CU_FILE_NVFS_DRIVER_ERROR, nvidia-fs driver ioctl error) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 11, CU_FILE_CUDA_DRIVER_ERROR, CUDA Driver API error) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 12, CU_FILE_CUDA_POINTER_INVALID, invalid device pointer) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 13, CU_FILE_CUDA_MEMORY_TYPE_INVALID, invalid pointer memory type) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 14, CU_FILE_CUDA_POINTER_RANGE_ERROR, pointer range exceeds allocated address range) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 15, CU_FILE_CUDA_CONTEXT_MISMATCH, cuda context mismatch) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 16, CU_FILE_INVALID_MAPPING_SIZE, access beyond maximum pinned size) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 17, CU_FILE_INVALID_MAPPING_RANGE, access beyond mapped size) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 18, CU_FILE_INVALID_FILE_TYPE, unsupported file type) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 19, CU_FILE_INVALID_FILE_OPEN_FLAG, unsupported file open flags) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 20, CU_FILE_DIO_NOT_SET, fd direct IO not set) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 22, CU_FILE_INVALID_VALUE, invalid arguments) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 23, CU_FILE_MEMORY_ALREADY_REGISTERED, device pointer already registered) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 24, CU_FILE_MEMORY_NOT_REGISTERED, device pointer lookup failure) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 25, CU_FILE_PERMISSION_DENIED, driver or file access error) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 26, CU_FILE_DRIVER_ALREADY_OPEN, driver is already open) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 27, CU_FILE_HANDLE_NOT_REGISTERED, file descriptor is not registered) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 28, CU_FILE_HANDLE_ALREADY_REGISTERED, file descriptor is already registered) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 29, CU_FILE_DEVICE_NOT_FOUND, GPU device not found) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 30, CU_FILE_INTERNAL_ERROR, internal error) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 31, CU_FILE_GETNEWFD_FAILED, failed to obtain new file descriptor) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 33, CU_FILE_NVFS_SETUP_ERROR, NVFS driver initialization error) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 34, CU_FILE_IO_DISABLED, GPUDirect Storage disabled by config on current file)\
	CUFILE_OP(CUFILEOP_BASE_ERR + 35, CU_FILE_BATCH_SUBMIT_FAILED, failed to submit batch operation)\
	CUFILE_OP(CUFILEOP_BASE_ERR + 36, CU_FILE_GPU_MEMORY_PINNING_FAILED, failed to allocate pinned GPU Memory) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 37, CU_FILE_BATCH_FULL, queue full for batch operation) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 38, CU_FILE_ASYNC_NOT_SUPPORTED, cuFile stream operation not supported) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 39, CU_FILE_INTERNAL_BATCH_SETUP_ERROR, batch setup internal error - retry later) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 40, CU_FILE_INTERNAL_BATCH_SUBMIT_ERROR, batch submit internal error - retry later) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 41, CU_FILE_INTERNAL_BATCH_GETSTATUS_ERROR, batch get status internal error - retry later) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 42, CU_FILE_INTERNAL_BATCH_CANCEL_ERROR, batch cancel internal error - retry later) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 43, CU_FILE_NOMEM_ERROR, cufile no memory error - retry later) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 44, CU_FILE_IO_ERROR, cufile io error) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 45, CU_FILE_INTERNAL_BUF_REGISTER_ERROR, cufile buf registration error) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 46, CU_FILE_HASH_OPR_ERROR, cufile hash operation error) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 47, CU_FILE_INVALID_CONTEXT_ERROR, cufile invalid context error) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 48, CU_FILE_NVFS_INTERNAL_DRIVER_ERROR, nvfs internal driver error) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 49, CU_FILE_BATCH_NOCOMPAT_ERROR, compat mode off error) \
	CUFILE_OP(CUFILEOP_BASE_ERR + 50, CU_FILE_IO_MAX_ERROR, GPUDirect Storage Max Error)


/**
 * @brief cuFile operation status codes
 *
 * The enumerators are generated from CUFILEOP_STATUS_ENTRIES: each table row
 * CUFILE_OP(code, name, string) becomes the enumerator `name = code`.
 *
 * @note On success the error code is set to @ref CU_FILE_SUCCESS.
 * @note The error code can be inspected using @ref IS_CUFILE_ERR and @ref CUFILE_ERRSTR.
 * @note If the error code is @ref CU_FILE_CUDA_DRIVER_ERROR, the CUDA error can be inspected using @ref IS_CUDA_ERR and @ref CU_FILE_CUDA_ERR.
 * @note Data path errors are captured via standard error codes.
 */
typedef enum CUfileOpError {
        /// @cond DOXYGEN_SKIP_MACRO
	/* Temporary CUFILE_OP definition: emit one enumerator per table row. */
	#define CUFILE_OP(code, name, string) name = code,
	CUFILEOP_STATUS_ENTRIES
	#undef CUFILE_OP
        ///@endcond
} CUfileOpError;

/// @endcond

/**
 * @brief Map a cuFile status code to its description text
 *
 * @param status status code of type @ref CUfileOpError
 *
 * @return the description of the matching CUFILEOP_STATUS_ENTRIES row, or
 *         "unknown cufile error" when status matches no row
 */
static inline const char *cufileop_status_error(CUfileOpError status)
{
	/* Fallback text; stays in place unless a generated case below replaces it. */
	const char *description = "unknown cufile error";

	switch (status) {
	/// @cond DOXYGEN_SKIP_MACRO
	/* One case per table row; #string turns the row's description into a string literal. */
	#define CUFILE_OP(code, name, string) case name: description = #string; break;
	CUFILEOP_STATUS_ENTRIES
	#undef CUFILE_OP
	///@endcond
	default:
		break;
	}

	return description;
}

/**
 * @brief cufileop status string
 */
typedef struct CUfileError {

	CUfileOpError err; // cufile error

	CUresult cu_err; // cuda driver error

}CUfileError_t;

/**
 * @brief  error macros to inspect error status of type @ref CUfileOpError
 */

/*
 * IS_CUFILE_ERR(err): true when the magnitude of err is greater than
 * CUFILEOP_BASE_ERR, so both a CUfileOpError value and its negated form match.
 * The test uses abs(), which takes an int. Any value whose magnitude exceeds
 * CUFILEOP_BASE_ERR also matches, including a large positive number, so check
 * the sign first when err may be a byte count.
 */
#define IS_CUFILE_ERR(err) \
	(abs((err)) > CUFILEOP_BASE_ERR)

/* CUFILE_ERRSTR(err): description text for |err|, looked up via cufileop_status_error(). */
#define CUFILE_ERRSTR(err) \
	cufileop_status_error((CUfileOpError)abs((err)))

/* IS_CUDA_ERR(status): true when a CUfileError_t carries CU_FILE_CUDA_DRIVER_ERROR in its err member. */
#define IS_CUDA_ERR(status) \
	((status).err == CU_FILE_CUDA_DRIVER_ERROR)

/* CU_FILE_CUDA_ERR(status): the cu_err member (CUDA driver result) of a CUfileError_t. */
#define CU_FILE_CUDA_ERR(status) ((status).cu_err)

/* driver properties */
/**
 * @brief Driver status flags: one enumerator per filesystem / storage target type
 */
typedef enum CUfileDriverStatusFlags {
	CU_FILE_LUSTRE_SUPPORTED        = 0,  /*!< Support for DDN LUSTRE */
	CU_FILE_WEKAFS_SUPPORTED        = 1,  /*!< Support for WEKAFS */
	CU_FILE_NFS_SUPPORTED           = 2,  /*!< Support for NFS */
	CU_FILE_GPFS_SUPPORTED          = 3,  /*!< Support for GPFS */
	CU_FILE_NVME_SUPPORTED          = 4,  /*!< Support for NVMe */
	CU_FILE_NVMEOF_SUPPORTED        = 5,  /*!< Support for NVMeOF */
	CU_FILE_SCSI_SUPPORTED          = 6,  /*!< Support for SCSI */
	CU_FILE_SCALEFLUX_CSD_SUPPORTED = 7,  /*!< Support for Scaleflux CSD */
	CU_FILE_NVMESH_SUPPORTED        = 8,  /*!< Support for NVMesh Block Dev */
	CU_FILE_BEEGFS_SUPPORTED        = 9,  /*!< Support for BeeGFS */
	/* 10 is reserved for YRCloudFile */
	CU_FILE_NVME_P2P_SUPPORTED      = 11, /*!< Deprecated; do not use this enumerator */
	CU_FILE_SCATEFS_SUPPORTED       = 12, /*!< Support for ScateFS */
	CU_FILE_VIRTIOFS_SUPPORTED      = 13, /*!< Support for VirtioFS */
	CU_FILE_MAX_TARGET_TYPES              /*!< Maximum FS supported (one past the last entry) */
} CUfileDriverStatusFlags_t;

/**
 * @brief Driver control flags
 */
typedef enum CUfileDriverControlFlags {
	CU_FILE_USE_POLL_MODE     = 0, /*!< use POLL mode. properties.use_poll_mode */
	CU_FILE_ALLOW_COMPAT_MODE = 1  /*!< allow COMPATIBILITY mode. properties.allow_compat_mode */
} CUfileDriverControlFlags_t;

/**
 * @brief Library feature flags
 */
typedef enum CUfileFeatureFlags {
	CU_FILE_DYN_ROUTING_SUPPORTED = 0, /*!< Support for Dynamic routing to handle devices across the PCIe bridges */
	CU_FILE_BATCH_IO_SUPPORTED    = 1, /*!< Batch IO supported */
	CU_FILE_STREAMS_SUPPORTED     = 2, /*!< Streams supported */
	CU_FILE_PARALLEL_IO_SUPPORTED = 3, /*!< Parallel IO supported */
	CU_FILE_P2P_SUPPORTED         = 4  /*!< Support for PCI P2PDMA */
} CUfileFeatureFlags_t;

/**
 * @brief P2P mode indices
 *
 * These enumerators are bit positions; the CU_FILE_P2P_FLAG_* macros below
 * turn them into single-bit masks.
 */
typedef enum CUfileP2PFlags {
	CUFILE_P2PDMA         = 0, /*!< Support for PCI P2PDMA */
	CUFILE_NVFS           = 1, /*!< Support for nvidia-fs */
	CUFILE_DMABUF         = 2, /*!< Support for DMA Buffer */
	CUFILE_C2C            = 3, /*!< Support for Chip-to-Chip (Grace-based systems) */
	CUFILE_NVIDIA_PEERMEM = 4  /*!< Only for IBM Spectrum Scale and WekaFS */
} CUfileP2PFlags_t;

/* P2P Flag constants for use with cuFileDriverSetP2PFlags */
#define CU_FILE_P2P_FLAG_PCI_P2PDMA  ((CUfileP2PFlags_t)(1 << CUFILE_P2PDMA))
#define CU_FILE_P2P_FLAG_NVFS        ((CUfileP2PFlags_t)(1 << CUFILE_NVFS))
#define CU_FILE_P2P_FLAG_DMABUF      ((CUfileP2PFlags_t)(1 << CUFILE_DMABUF))
#define CU_FILE_P2P_FLAG_C2C         ((CUfileP2PFlags_t)(1 << CUFILE_C2C))

/**
 * @brief IO operation selector (used as the opcode member of CUfileIOParams_t)
 */
typedef enum CUfileOpcode {
	CUFILE_READ  = 0, /*!< read operation */
	CUFILE_WRITE = 1  /*!< write operation */
} CUfileOpcode_t;

/**
 * @brief Driver properties structure (the argument type of cuFileDriverGetProperties)
 *
 * NOTE(review): units are not stated here; the size-like members are presumably
 * in KB like the matching cuFileDriverSet* parameters. Confirm before relying on it.
 */
typedef struct CUfileDrvProps {
	/* nvidia-fs related properties */
	struct {
		unsigned int major_version;
		unsigned int minor_version;
		size_t       poll_thresh_size;
		size_t       max_direct_io_size;
		unsigned int dstatusflags;   /* presumably CUfileDriverStatusFlags_t based; verify */
		unsigned int dcontrolflags;  /* presumably CUfileDriverControlFlags_t based; verify */
	} nvfs;

	unsigned int fflags;                     /* presumably CUfileFeatureFlags_t based; verify */
	unsigned int max_device_cache_size;
	unsigned int per_buffer_cache_size;
	unsigned int max_device_pinned_mem_size;
	unsigned int max_batch_io_size;
	unsigned int max_batch_io_timeout_msecs;
} CUfileDrvProps_t;

/* Shorthand for struct sockaddr, used by the CUfileFSOps_t callbacks. */
typedef struct sockaddr sockaddr_t;

/**
 * @brief RDMA descriptor passed to the userspace filesystem read/write callbacks
 */
typedef struct cufileRDMAInfo {
	int         version;
	int         desc_len;  /* presumably the length of desc_str; verify */
	const char *desc_str;
} cufileRDMAInfo_t;

/* RDMA registration flag bits (CU_FILE_RDMA_REGISTER is the documented flags value of cuFileBufRegister) */
#define CU_FILE_RDMA_REGISTER          1
#define CU_FILE_RDMA_RELAXED_ORDERING  (1<<1)



/**
 * @brief Callback table for a userspace filesystem (see CUfileDescr_t::fs_ops)
 *
 * Every callback receives the caller's handle as its first argument.
 */
typedef struct CUfileFSOps {
	/* NULL means discover using fstat */
	const char* (*fs_type)(const void *handle);

	/* list of host addresses to use, NULL means no restriction */
	int (*getRDMADeviceList)(const void *handle, sockaddr_t **hostaddrs);

	/* -1 no pref */
	int (*getRDMADevicePriority)(const void *handle, char*, size_t,
	                             loff_t, const sockaddr_t* hostaddr);

	/* NULL means try VFS */
	ssize_t (*read)(const void *handle, char*, size_t, loff_t, const cufileRDMAInfo_t*);
	ssize_t (*write)(const void *handle, const char *, size_t, loff_t, const cufileRDMAInfo_t*);
} CUfileFSOps_t;

/* File Handle */
/**
 * @brief Kind of OS-level handle carried by a CUfileDescr_t
 */
enum CUfileFileHandleType {
	CU_FILE_HANDLE_TYPE_OPAQUE_FD    = 1, /*!< Linux based fd */
	CU_FILE_HANDLE_TYPE_OPAQUE_WIN32 = 2, /*!< Windows based handle (unsupported) */
	CU_FILE_HANDLE_TYPE_USERSPACE_FS = 3  /*!< Userspace based FS */
};

/**
 * @brief File descriptor record handed to cuFileHandleRegister
 */
typedef struct CUfileDescr_t {
	enum CUfileFileHandleType type;  /* type of file being registered */

	union {
		int   fd;      /* Linux   */
		void *handle;  /* Windows */
	} handle;

	const CUfileFSOps_t *fs_ops;     /* file system operation table */
} CUfileDescr_t;

/**
 * @brief Opaque file handle type
 *
 * Filled in through the pointer argument of cuFileHandleRegister and passed by
 * value to the cuFile IO APIs and to cuFileHandleDeregister.
 */
typedef void* CUfileHandle_t;


#pragma GCC visibility push(default)

/**
 * @brief Register an open file with cuFile. Registration is required, and performs extra checking that is memoized to provide increased performance on later cuFile operations.
 *
 * @param fh @ref CUfileHandle_t opaque file handle for IO operations (output)
 * @param descr @ref CUfileDescr_t  file descriptor (OS agnostic)
 *
 * @return      CU_FILE_SUCCESS on successful completion. fh will be updated for use in @ref cuFileRead, @ref cuFileWrite, @ref cuFileHandleDeregister
 * @return      CU_FILE_DRIVER_NOT_INITIALIZED on failure to load driver
 * @return      CU_FILE_IO_NOT_SUPPORTED if the filesystem is not supported
 * @return      CU_FILE_INVALID_VALUE if null or bad API arguments
 * @return      CU_FILE_INVALID_FILE_OPEN_FLAG if the file is opened with unsupported modes like no O_DIRECT
 * @return      CU_FILE_INVALID_FILE_TYPE if the filepath is not valid or is not a regular file
 * @return      CU_FILE_HANDLE_ALREADY_REGISTERED if the file handle/descriptor is already registered
 *
 * <b>Description</b>
 * cuFileHandleRegister registers the open file descriptor for use with cuFile IO operations.
 *
 * This API will ensure that the file's descriptor is checked for GPUDirect Storage support and returns a valid file handle on CU_FILE_SUCCESS.
 *
 * @note The file needs to be opened in O_DIRECT mode to support GPUDirect Storage.
 *
 * @see cuFileRead
 * @see cuFileWrite
 * @see cuFileHandleDeregister
 *
 */
CUfileError_t cuFileHandleRegister(CUfileHandle_t *fh, CUfileDescr_t *descr);

/**
 * @brief Release a registered file handle from cuFile
 *
 * @param fh @ref CUfileHandle_t file handle
 *
 * @return void (no status is reported to the caller)
 *
 * @see cuFileHandleRegister
 */
void cuFileHandleDeregister(CUfileHandle_t fh);

/**
 * @brief Register existing cudaMalloc'ed memory with cuFile to pin it for GPUDirect Storage access, or
 * register host allocated memory with cuFile.
 *
 * @param bufPtr_base base pointer of the allocated buffer
 * @param length  size of the memory region starting at bufPtr_base
 * @param flags   CU_FILE_RDMA_REGISTER
 *
 * @return  CU_FILE_SUCCESS on success
 * @return  CU_FILE_NVFS_DRIVER_ERROR
 * @return  CU_FILE_INVALID_VALUE
 * @return  CU_FILE_CUDA_MEMORY_TYPE_INVALID for unsupported memory type
 *          (NOTE(review): this line previously named CU_FILE_CUDA_ERROR, which is not a
 *          CUfileOpError enumerator; confirm the intended code)
 * @return  CU_FILE_MEMORY_ALREADY_REGISTERED on error
 * @return  CU_FILE_GPU_MEMORY_PINNING_FAILED if not enough pinned memory is available
 * @note This memory will be used to perform GPU direct DMA from the supported storage.
 * @warning This API is intended for use cases where the memory is used as a streaming buffer that is reused across multiple cuFile IO operations before calling @ref cuFileBufDeregister
 *
 * @see cuFileBufDeregister
 * @see cuFileRead
 * @see cuFileWrite
 */
CUfileError_t cuFileBufRegister(const void *bufPtr_base, size_t length, int flags);

/**
 * @brief  Deregister already registered device or host memory from cuFile
 *
 * @param bufPtr_base  buffer pointer to deregister
 *
 * @return  CU_FILE_SUCCESS on success
 * @return  CU_FILE_INVALID_VALUE on an invalid or unregistered memory pointer
 *
 * @see cuFileBufRegister
 * @see cuFileRead
 * @see cuFileWrite
 */

CUfileError_t cuFileBufDeregister(const void *bufPtr_base);

/**
 * @brief Read data from a registered file handle into the specified device or host memory
 *
 * @param fh @ref CUfileHandle_t opaque file handle
 * @param bufPtr_base  base address of buffer in device or host memory
 * @param size    number of bytes to read
 * @param file_offset  file offset from the beginning of the file
 * @param bufPtr_offset  offset relative to the bufPtr_base pointer to read into.
 *
 * @return  number of bytes successfully read
 * @return  -1 on error, in which case errno is set to indicate filesystem errors.
 * @return  all other errors will return a negative integer value of @ref CUfileOpError enum value.
 *
 * @note  If the bufPtr is not registered with @ref cuFileBufRegister, the data will be buffered through preallocated pinned buffers if needed.
 * @note  This is useful for applications that need to perform IO to unaligned file offsets and/or size. This is also recommended
 *        for cases where the BAR1 memory size is smaller than the size of the allocated memory.
 *
 * @see cuFileBufRegister
 * @see cuFileHandleRegister
 * @see cuFileWrite
 */

ssize_t cuFileRead(CUfileHandle_t fh, void *bufPtr_base, size_t size, off_t file_offset, off_t bufPtr_offset);

/**
 * @brief  Write data from the specified device or host memory to a registered file handle
 *
 * @param fh @ref CUfileHandle_t opaque file handle
 * @param bufPtr_base  base address of buffer in device or host memory
 * @param size    number of bytes to write
 * @param file_offset  file offset from the beginning of the file
 * @param bufPtr_offset  offset relative to the bufPtr_base pointer to write from.
 *
 * @return  number of bytes successfully written
 * @return  -1 on error, in which case errno is set to indicate filesystem errors.
 * @return  all other errors will return a negative integer value of @ref CUfileOpError enum value.
 *
 * @note  If the bufPtr is not registered with @ref cuFileBufRegister, the data will be buffered through preallocated pinned buffers if needed.
 * @note  This is useful for applications that need to perform IO to unaligned file offsets and/or size. This is also recommended
 *        for cases where the BAR1 memory size is smaller than the size of the allocated memory.
 *
 * @see cuFileBufRegister
 * @see cuFileHandleRegister
 * @see cuFileRead
 */

ssize_t cuFileWrite(CUfileHandle_t fh, const void *bufPtr_base, size_t size, off_t file_offset, off_t bufPtr_offset);

// CUFile Driver APIs

/**
 * @brief
 * Initialize the cuFile library and open the nvidia-fs driver
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_DRIVER_NOT_INITIALIZED
 * @return CU_FILE_DRIVER_VERSION_MISMATCH on driver version mismatch error
 *
 * @see cuFileDriverClose
 */
CUfileError_t cuFileDriverOpen(void);

/*
 * This declaration appears before the macro below is defined, so it declares
 * the un-suffixed symbol cuFileDriverClose.
 */
CUfileError_t cuFileDriverClose(void);
/* From this point on, the name cuFileDriverClose expands to cuFileDriverClose_v2. */
#define cuFileDriverClose cuFileDriverClose_v2
/**
 * @brief
 * Reset the cuFile library and release the nvidia-fs driver
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_DRIVER_CLOSING if there are any active IO operations using @ref cuFileRead or @ref cuFileWrite
 *
 * @note Because of the macro above, this declaration names cuFileDriverClose_v2.
 *
 * @see cuFileDriverOpen
 */
CUfileError_t cuFileDriverClose(void);

/**
 * @brief
 * Returns the use count of cuFile drivers by the process at that moment.
 *
 * @return the use count as a long
 */
long cuFileUseCount(void);

/**
 * @brief
 * Gets the Driver session properties
 *
 * @param props pointer to a @ref CUfileDrvProps_t structure
 *
 * @return CU_FILE_SUCCESS on success
 *
 * @see cuFileDriverSetPollMode
 * @see cuFileDriverSetMaxDirectIOSize
 * @see cuFileDriverSetMaxCacheSize
 * @see cuFileDriverSetMaxPinnedMemSize
 */
CUfileError_t cuFileDriverGetProperties(CUfileDrvProps_t *props);

/**
 * @brief
 * Sets whether the Read/Write APIs use polling to do IO operations
 *
 * @param  poll boolean to indicate whether to use poll mode or not
 * @param  poll_threshold_size max IO size to use for POLLING mode, in KB
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_DRIVER_NOT_INITIALIZED if the driver is not initialized
 * @return CU_FILE_DRIVER_VERSION_MISMATCH, CU_FILE_DRIVER_UNSUPPORTED_LIMIT on error
 *
 * @warning This is an advanced command and should be tuned based on available system memory
 *
 * @see cuFileDriverGetProperties
 */
CUfileError_t cuFileDriverSetPollMode(bool poll, size_t poll_threshold_size);

/**
 * @brief
 * Control parameter to set the max IO size (KB) used by the library to talk to the nvidia-fs driver
 *
 * @param  max_direct_io_size maximum allowed direct IO size in KB
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_DRIVER_NOT_INITIALIZED if the driver is not initialized
 * @return CU_FILE_DRIVER_VERSION_MISMATCH, CU_FILE_DRIVER_UNSUPPORTED_LIMIT on error
 *
 * @warning This is an advanced command and should be tuned based on available system memory
 *
 * @see cuFileDriverGetProperties
 *
 */
CUfileError_t cuFileDriverSetMaxDirectIOSize(size_t max_direct_io_size);

/**
 * @brief
 * Control parameter to set the maximum GPU memory reserved per device by the library for internal buffering
 *
 * @param  max_cache_size the maximum GPU buffer space per device for internal use, in KB
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_DRIVER_NOT_INITIALIZED if the driver is not initialized
 * @return CU_FILE_DRIVER_VERSION_MISMATCH, CU_FILE_DRIVER_UNSUPPORTED_LIMIT on error
 *
 * @warning This is an advanced command and should be tuned based on supported GPU memory
 *
 * @see cuFileDriverGetProperties
 */
CUfileError_t cuFileDriverSetMaxCacheSize(size_t max_cache_size);

/**
 * @brief
 * Sets the maximum buffer space, in KB, that is pinned for use by @ref cuFileBufRegister
 *
 * @param max_pinned_size maximum buffer space that is pinned, in KB
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_DRIVER_NOT_INITIALIZED if the driver is not initialized
 * @return CU_FILE_DRIVER_VERSION_MISMATCH, CU_FILE_DRIVER_UNSUPPORTED_LIMIT on error
 *
 * @warning This is an advanced command and should be tuned based on supported GPU memory
 *
 * @see cuFileDriverGetProperties
 *
 */
CUfileError_t cuFileDriverSetMaxPinnedMemSize(size_t max_pinned_size);

//Experimental Batch API's



/**
 * @brief Per-request status of a batch IO entry; each value is a distinct single bit
 */
typedef enum CUFILEStatus_enum {
	CUFILE_WAITING  = 0x000001,  /*!< required value prior to submission */
	CUFILE_PENDING  = 0x000002,  /*!< once enqueued */
	CUFILE_INVALID  = 0x000004,  /*!< request was ill-formed or could not be enqueued */
	CUFILE_CANCELED = 0x000008,  /*!< request successfully canceled */
	CUFILE_COMPLETE = 0x0000010, /*!< request successfully completed */
	CUFILE_TIMEOUT  = 0x0000020, /*!< request timed out */
	CUFILE_FAILED   = 0x0000040  /*!< unable to complete */
} CUfileStatus_t;
/**
 * @brief Mode selector stored in the first member of CUfileIOParams_t
 */
typedef enum cufileBatchMode {
	CUFILE_BATCH = 1 /*!< the only mode defined here */
} CUfileBatchMode_t;
/**
 * @brief Description of one IO request submitted through cuFileBatchIOSubmit
 */
typedef struct CUfileIOParams {
	CUfileBatchMode_t mode;  /* Must be the very first field. */

	union {
		struct {
			void   *devPtr_base;    /* This can be a device memory or a host memory pointer. */
			off_t   file_offset;
			off_t   devPtr_offset;
			size_t  size;
		} batch;
	} u;

	CUfileHandle_t fh;
	CUfileOpcode_t opcode;
	void          *cookie;   /* presumably echoed back in CUfileIOEvents_t::cookie; verify */
} CUfileIOParams_t;
/**
 * @brief Completion record for one batch IO request
 */
typedef struct CUfileIOEvents {
	void           *cookie;
	CUfileStatus_t  status;  /* status of the operation */
	/*
	 * -ve error or amount of I/O done.
	 * NOTE(review): the member is an unsigned size_t, so a negative error is not
	 * directly representable; callers presumably need to reinterpret it as a
	 * signed value. Confirm against the library documentation.
	 */
	size_t          ret;
} CUfileIOEvents_t;

/**
 * @brief Opaque batch handle, filled in by cuFileBatchIOSetUp and consumed by the other cuFileBatchIO* calls
 */
typedef void* CUfileBatchHandle_t;

/*
 * Declare the tag at file scope before it is used in a parameter list.
 * Without a prior declaration, C gives a struct tag that first appears inside
 * a prototype prototype scope only, which would make the timeout parameter
 * incompatible with the real struct timespec from <time.h>. When <time.h> is
 * already included this line is a harmless redeclaration.
 */
struct timespec;

/**
 * @brief Batch IO entry points
 *
 * @param batch_idp batch handle (pointer to it for cuFileBatchIOSetUp, which fills it in)
 * @param nr        entry count; for cuFileBatchIOGetStatus a pointer to an unsigned count
 *                  (NOTE(review): presumably in/out, confirm)
 * @param min_nr    NOTE(review): presumably the minimum number of completions to wait for, confirm
 * @param iocbp     array of @ref CUfileIOParams_t (submit) or @ref CUfileIOEvents_t (get status)
 * @param flags     submit flags (meaning not defined in this header)
 * @param timeout   pointer to a struct timespec
 *
 * @return a @ref CUfileError_t status, except cuFileBatchIODestroy which returns nothing
 */
CUfileError_t cuFileBatchIOSetUp(CUfileBatchHandle_t *batch_idp, unsigned nr);
CUfileError_t cuFileBatchIOSubmit(CUfileBatchHandle_t batch_idp, unsigned nr, CUfileIOParams_t *iocbp, unsigned int flags);
CUfileError_t cuFileBatchIOGetStatus(CUfileBatchHandle_t batch_idp, unsigned min_nr, unsigned* nr,
				CUfileIOEvents_t *iocbp, struct timespec* timeout);
CUfileError_t cuFileBatchIOCancel(CUfileBatchHandle_t batch_idp);
void cuFileBatchIODestroy(CUfileBatchHandle_t batch_idp);

//Async API's with cuda streams

/*
 * cuFile stream API registration flags (single-bit values, see cuFileStreamRegister)
 */
#define CU_FILE_STREAM_FIXED_BUF_OFFSET     1  /* buffer pointer offset is set at submission time */
#define CU_FILE_STREAM_FIXED_FILE_OFFSET    2  /* file offset is set at submission time */
#define CU_FILE_STREAM_FIXED_FILE_SIZE      4  /* file size is set at submission time */
#define CU_FILE_STREAM_PAGE_ALIGNED_INPUTS  8  /* size, offset and buffer offset are 4k aligned */

/**
 * @brief Stream-ordered read from a registered file handle into device or host memory
 *
 * @param fh The cuFile handle for the file.
 * @param bufPtr_base  base address of buffer in device or host memory
 * @param size_p  pointer to the number of bytes to read
 * @note  *size_p: if the size is not known at the time of submission, the max possible size for the I/O request must be provided.
 * @param file_offset_p  pointer to the file offset from the beginning of the file
 * @param bufPtr_offset_p  pointer to the offset relative to the bufPtr_base pointer to read into.
 * @param bytes_read_p  pointer to the number of bytes that were successfully read.
 * @param stream CUDA stream for the operation.
 *
 * NOTE(review): the function itself returns a @ref CUfileError_t; the three values below
 * presumably describe what is stored in *bytes_read_p. Confirm.
 * @return  number of bytes successfully read in *bytes_read_p
 * @return  -1 on error, in which case errno is set to indicate filesystem errors.
 * @return  all other errors will return a negative integer value of @ref CUfileOpError enum value.
 *
 * @note  If the bufPtr_base is not registered with @ref cuFileBufRegister, the data will be buffered through preallocated pinned buffers.
 * @note  This is useful for applications that need to perform IO to unaligned file offsets and/or size. This is also recommended
 *        for cases where the BAR1 memory size is smaller than the size of the allocated memory.
 * @note  If the stream is registered with cuFileStreamRegister, the IO setup and teardown overhead will be reduced.
 * @note  On CUDA stream errors, the user must call cuFileStreamDeregister to release any outstanding cuFile resources for the stream.
 *
 *
 * @see cuFileBufRegister
 * @see cuFileHandleRegister
 * @see cuFileRead
 * @see cuFileStreamRegister
 * @see cuFileStreamDeregister
 */

CUfileError_t cuFileReadAsync(CUfileHandle_t fh, void *bufPtr_base,
                        size_t *size_p, off_t *file_offset_p, off_t *bufPtr_offset_p, ssize_t *bytes_read_p, CUstream stream);

/**
 * @brief Stream-ordered write from device or host memory to a registered file handle
 *
 * @param fh The cuFile handle for the file.
 * @param bufPtr_base  base address of buffer in device or host memory
 * @param size_p    pointer to the number of bytes to write.
 * @note  *size_p: if the size is not known at the time of submission, the max possible size for the I/O request must be provided.
 * @param file_offset_p  pointer to the file offset from the beginning of the file
 * @param bufPtr_offset_p  pointer to the offset relative to the bufPtr_base pointer to write from.
 * @param bytes_written_p pointer to the number of bytes that were successfully written.
 * @param stream CUDA stream for the operation.
 *
 * NOTE(review): the function itself returns a @ref CUfileError_t; the three values below
 * presumably describe what is stored in *bytes_written_p. Confirm.
 * @return  number of bytes successfully written in *bytes_written_p
 * @return  -1 on error, in which case errno is set to indicate filesystem errors.
 * @return  all other errors will return a negative integer value of @ref CUfileOpError enum value.
 *
 * @note  If the bufPtr_base is not registered with @ref cuFileBufRegister, the data will be buffered through preallocated pinned buffers.
 * @note  This is useful for applications that need to perform IO to unaligned file offsets and/or size. This is also recommended
 *        for cases where the BAR1 memory size is smaller than the size of the allocated memory.
 * @note  If the stream is registered with cuFileStreamRegister prior to this call, the IO setup and teardown overhead will be reduced.
 * @note  On CUDA stream errors, the user must call cuFileStreamDeregister to release any outstanding cuFile resources for the stream.
 *
 * @see cuFileBufRegister
 * @see cuFileHandleRegister
 * @see cuFileWrite
 * @see cuFileStreamRegister
 * @see cuFileStreamDeregister
 */

CUfileError_t cuFileWriteAsync(CUfileHandle_t fh, void *bufPtr_base,
                        size_t *size_p, off_t *file_offset_p, off_t *bufPtr_offset_p, ssize_t *bytes_written_p, CUstream stream);

/**
 * @brief Register a CUDA stream with cuFile for asynchronous IO
 *
 * @param stream CUDA stream for the operation.
 * @param flags flags for the stream to improve the stream execution of IO based on input parameters.
 * @note  supported FLAGS are
 * @note CU_FILE_STREAM_FIXED_BUF_OFFSET - buffer pointer offset is set at submission time
 * @note CU_FILE_STREAM_FIXED_FILE_OFFSET - file offset is set at submission time
 * @note CU_FILE_STREAM_FIXED_FILE_SIZE  - file size is set at submission time
 * @note CU_FILE_STREAM_PAGE_ALIGNED_INPUTS - size, offset and buffer offset are 4k aligned
 *
 * @note  Allocates resources needed to support cuFile operations asynchronously for the CUDA stream
 * @note  This is useful for applications that need to perform IO to unaligned file offsets and/or size. This is also recommended
 *        for cases where the BAR1 memory size is smaller than the size of the allocated memory.
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_DRIVER_NOT_INITIALIZED if the driver is not initialized
 * @return CU_FILE_INVALID_VALUE if the stream is invalid
 *
 * @see cuFileReadAsync
 * @see cuFileWriteAsync
 * @see cuFileStreamDeregister
 */

CUfileError_t cuFileStreamRegister(CUstream stream, unsigned flags);

/**
 * @brief Deregister a CUDA stream from cuFile
 *
 * @param stream CUDA stream for the operation.
 *
 * @note  Deallocates resources used by previous cuFile asynchronous operations for the CUDA stream
 * @note  Highly recommended to call after CUDA stream errors to release any outstanding cuFile resources for this stream
 * @note  Must be called before the cuStreamDestroy call for the specified stream.
 * @note  This is useful for applications that need to perform IO to unaligned file offsets and/or size. This is also recommended
 *        for cases where the BAR1 memory size is smaller than the size of the allocated memory.
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_DRIVER_NOT_INITIALIZED if the driver is not initialized
 * @return CU_FILE_INVALID_VALUE if the stream is invalid
 *
 * @see cuFileReadAsync
 * @see cuFileWriteAsync
 * @see cuFileStreamRegister
 */

CUfileError_t cuFileStreamDeregister(CUstream stream);

/**
 * @brief Query the cuFile library version
 *
 * @param version pointer that receives the cuFile library version.
 *
 * @note  The version is returned as (1000 * major + 10 * minor).
 *        For example, CUFILE 1.7.0 would be represented by 1070.
 * @note  This is useful for applications that need to query the library version.
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_INVALID_VALUE if the input parameter is null.
 * @return CU_FILE_DRIVER_VERSION_READ_ERROR if the version is not available.
 *
 */

CUfileError_t cuFileGetVersion(int *version);

/**
 * @brief Selectors for size_t-valued configuration parameters (see cuFileGetParameterSizeT)
 */
typedef enum CUFileSizeTConfigParameter_t {
	CUFILE_PARAM_PROFILE_STATS                            = 0,
	CUFILE_PARAM_EXECUTION_MAX_IO_QUEUE_DEPTH             = 1,
	CUFILE_PARAM_EXECUTION_MAX_IO_THREADS                 = 2,
	CUFILE_PARAM_EXECUTION_MIN_IO_THRESHOLD_SIZE_KB       = 3,
	CUFILE_PARAM_EXECUTION_MAX_REQUEST_PARALLELISM        = 4,
	CUFILE_PARAM_PROPERTIES_MAX_DIRECT_IO_SIZE_KB         = 5,
	CUFILE_PARAM_PROPERTIES_MAX_DEVICE_CACHE_SIZE_KB      = 6,
	CUFILE_PARAM_PROPERTIES_PER_BUFFER_CACHE_SIZE_KB      = 7,
	CUFILE_PARAM_PROPERTIES_MAX_DEVICE_PINNED_MEM_SIZE_KB = 8,
	CUFILE_PARAM_PROPERTIES_IO_BATCHSIZE                  = 9,
	CUFILE_PARAM_POLLTHRESHOLD_SIZE_KB                    = 10,
	CUFILE_PARAM_PROPERTIES_BATCH_IO_TIMEOUT_MS           = 11
} CUFileSizeTConfigParameter_t;

/**
 * @brief Selectors for bool-valued configuration parameters (see cuFileGetParameterBool)
 */
typedef enum CUFileBoolConfigParameter_t {
	CUFILE_PARAM_PROPERTIES_USE_POLL_MODE       = 0,
	CUFILE_PARAM_PROPERTIES_ALLOW_COMPAT_MODE   = 1,
	CUFILE_PARAM_FORCE_COMPAT_MODE              = 2,
	CUFILE_PARAM_FS_MISC_API_CHECK_AGGRESSIVE   = 3,
	CUFILE_PARAM_EXECUTION_PARALLEL_IO          = 4,
	CUFILE_PARAM_PROFILE_NVTX                   = 5, /* Deprecated; do not use this enumerator. */
	CUFILE_PARAM_PROPERTIES_ALLOW_SYSTEM_MEMORY = 6,
	CUFILE_PARAM_USE_PCIP2PDMA                  = 7,
	CUFILE_PARAM_PREFER_IO_URING                = 8,
	CUFILE_PARAM_FORCE_ODIRECT_MODE             = 9,
	CUFILE_PARAM_SKIP_TOPOLOGY_DETECTION        = 10,
	CUFILE_PARAM_STREAM_MEMOPS_BYPASS           = 11
} CUFileBoolConfigParameter_t;

/**
 * @brief Selectors for string-valued configuration parameters (see cuFileGetParameterString)
 */
typedef enum CUFileStringConfigParameter_t {
	CUFILE_PARAM_LOGGING_LEVEL    = 0,
	CUFILE_PARAM_ENV_LOGFILE_PATH = 1,
	CUFILE_PARAM_LOG_DIR          = 2
} CUFileStringConfigParameter_t;

/**
 * @brief Selectors for array-valued configuration parameters
 */
typedef enum CUFileArrayConfigParameter_t {
	CUFILE_PARAM_POSIX_POOL_SLAB_SIZE_KB = 0,
	CUFILE_PARAM_POSIX_POOL_SLAB_COUNT   = 1
} CUFileArrayConfigParameter_t;


// GET Parameter APIs
/**
 * @brief Read the value of a cuFile configuration parameter.
 *
 * One getter is declared per value type: size_t, bool and string.
 *
 * @param param    The parameter to get.
 * @param value    The location where the value will be stored
 *                 (cuFileGetParameterSizeT() and cuFileGetParameterBool()).
 * @param desc_str The caller-provided buffer where the string value will be stored
 *                 (cuFileGetParameterString()).
 * @param len      Length passed alongside desc_str (cuFileGetParameterString());
 *                 presumably the capacity of that buffer in bytes - TODO confirm.
 *
 * @return  CU_FILE_SUCCESS on success
 * @return  CU_FILE_INVALID_VALUE if the input parameter is invalid
 *
 * @note If the driver is open, cuFileGetParameter*() will return the current runtime value for the given parameter.
 * @note If the driver is not opened yet, cuFileGetParameter*() will return the currently staged value for that parameter.
 *       Staged parameter values are cleared when the driver is opened.
 *
 */
CUfileError_t cuFileGetParameterSizeT(CUFileSizeTConfigParameter_t param, size_t *value);	
CUfileError_t cuFileGetParameterBool(CUFileBoolConfigParameter_t param, bool *value);	
CUfileError_t cuFileGetParameterString(CUFileStringConfigParameter_t param, char *desc_str, int len);	

/**
 * @brief Get both the minimum and maximum settable values for a given size_t parameter in a single call.
 *
 * @param[in]  param     cuFile size_t configuration parameter to query
 * @param[out] min_value Pointer to store the minimum value
 * @param[out] max_value Pointer to store the maximum value
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_INVALID_VALUE if min_value or max_value is NULL
 */
CUfileError_t cuFileGetParameterMinMaxValue(CUFileSizeTConfigParameter_t param, size_t *min_value, size_t *max_value);

// SET Parameter APIs
/**
 * @brief Stage a value for a cuFile configuration parameter.
 *
 * One setter is declared per value type: size_t, bool and string.
 *
 * @param param    The parameter to set.
 * @param value    The source of the parameter value
 *                 (cuFileSetParameterSizeT() and cuFileSetParameterBool()).
 * @param desc_str The string holding the parameter value (cuFileSetParameterString()).
 *
 * @return  CU_FILE_SUCCESS on success
 * @return  CU_FILE_INVALID_VALUE if the input parameter is invalid.
 * @return  CU_FILE_DRIVER_ALREADY_OPEN if the driver is already open.
 *
 * @note  Setting values is only permitted when the driver is not open - set parameter values are applied at driver load time.
 * @note  If the same parameter is set multiple times, only the last value is kept and used.
 * @note  Parameter precedence (highest to lowest) is: cuFileSetParameter*() (if set), then environment variable
 *        (if it exists and is set), then cufile.json.
 *
 */
CUfileError_t cuFileSetParameterSizeT(CUFileSizeTConfigParameter_t param, size_t value);
CUfileError_t cuFileSetParameterBool(CUFileBoolConfigParameter_t param, bool value);
CUfileError_t cuFileSetParameterString(CUFileStringConfigParameter_t param, const char* desc_str);

/**
 * @brief Set the level of statistics collection for cuFile operations.
 *
 * This overrides the cufile.json settings for stats.
 *
 * @param[in] level Statistics level (0 = disabled, 1 = basic, 2 = detailed, 3 = verbose)
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_INVALID_VALUE if level is invalid
 *
 * @note Higher stats levels may impact performance. Level 0 disables statistics.
 * @note Changes to stats level take effect for future operations.
 *
 * @see cuFileGetStatsLevel
 */
CUfileError_t cuFileSetStatsLevel(int level);

/**
 * @brief Get the current level of statistics collection for cuFile operations.
 *
 * @param[out] level Pointer to store the current statistics level
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_INVALID_VALUE if level is NULL
 *
 * @see cuFileSetStatsLevel
 */
CUfileError_t cuFileGetStatsLevel(int *level);

/** Number of char elements in the GPU UUID field of CUfilePerGpuStats_t. */
#define CUFILE_GPU_UUID_LEN 16
/**
 * @brief Counter pair tracking how many operations of one kind succeeded and failed.
 *
 * Used as the element type of the per-operation counters in CUfileStatsLevel1_t.
 */
typedef struct CUfileOpCounter {
    uint64_t ok;   // Number of successful operations
    uint64_t err;  // Number of failed operations
} CUfileOpCounter_t;

/**
 * @brief Level 1 Statistics: Basic IO and operation counters
 *
 * Filled in by cuFileGetStatsL1(). Each CUfileOpCounter_t member holds a
 * success count (ok) and a failure count (err) for one kind of operation.
 * Fields with a `_us` suffix are in microseconds.
 */
typedef struct CUfileStatsLevel1 {
    // Operation counters (ok / err per operation kind)
    CUfileOpCounter_t read_ops;              // Read operations
    CUfileOpCounter_t write_ops;             // Write operations
    CUfileOpCounter_t hdl_register_ops;      // Handle register operations
    CUfileOpCounter_t hdl_deregister_ops;    // Handle deregister operations
    CUfileOpCounter_t buf_register_ops;      // Buffer register operations
    CUfileOpCounter_t buf_deregister_ops;    // Buffer deregister operations
    
    // Basic IO statistics
    uint64_t read_bytes;               // Total bytes read
    uint64_t write_bytes;              // Total bytes written
    uint64_t read_bw_bytes_per_sec;            // Read bandwidth (bytes/sec)
    uint64_t write_bw_bytes_per_sec;           // Write bandwidth (bytes/sec)
    uint64_t read_lat_avg_us;          // Average read latency (microseconds)
    uint64_t write_lat_avg_us;         // Average write latency (microseconds)
    
    // Operations per second
    uint64_t read_ops_per_sec;         // Read operations per second
    uint64_t write_ops_per_sec;        // Write operations per second
    
    // Latency sums
    uint64_t read_lat_sum_us;          // Sum of read latencies (microseconds)
    uint64_t write_lat_sum_us;         // Sum of write latencies (microseconds)
    
    // Batch operations counters
    CUfileOpCounter_t batch_submit_ops;      // Batch submit operations
    CUfileOpCounter_t batch_complete_ops;    // Batch complete operations
    CUfileOpCounter_t batch_setup_ops;       // Batch setup operations
    CUfileOpCounter_t batch_cancel_ops;      // Batch cancel operations
    CUfileOpCounter_t batch_destroy_ops;     // Batch destroy operations
    
    // Batch queue counters
    CUfileOpCounter_t batch_enqueued_ops;    // Batch enqueue operations
    CUfileOpCounter_t batch_posix_enqueued_ops;  // POSIX batch enqueue operations
    CUfileOpCounter_t batch_processed_ops;    // Batch process operations
    CUfileOpCounter_t batch_posix_processed_ops;  // POSIX batch process operations
    
    // Batch submission type counters
    CUfileOpCounter_t batch_nvfs_submit_ops;   // NVFS batch submit operations
    CUfileOpCounter_t batch_p2p_submit_ops;    // P2P batch submit operations
    CUfileOpCounter_t batch_aio_submit_ops;    // AIO batch submit operations
    CUfileOpCounter_t batch_iouring_submit_ops; // IO_URING batch submit operations
    CUfileOpCounter_t batch_mixed_io_submit_ops; // Mixed IO batch submit operations
    CUfileOpCounter_t batch_total_submit_ops;   // Total batch submit operations
    
    // Batch operations statistics
    uint64_t batch_read_bytes;         // Total batch read bytes
    uint64_t batch_write_bytes;        // Total batch write bytes
    uint64_t batch_read_bw_bytes;      // Batch read bandwidth (presumably bytes/sec, like read_bw_bytes_per_sec - TODO confirm)
    uint64_t batch_write_bw_bytes;     // Batch write bandwidth (presumably bytes/sec, like write_bw_bytes_per_sec - TODO confirm)
    uint64_t batch_submit_lat_avg_us;  // Average batch submit latency (microseconds)
    uint64_t batch_completion_lat_avg_us; // Average batch completion latency (microseconds)
    uint64_t batch_submit_ops_per_sec;  // Batch submit operations per second
    uint64_t batch_complete_ops_per_sec; // Batch complete operations per second
    uint64_t batch_submit_lat_sum_us;    // Sum of batch submit latencies (microseconds)
    uint64_t batch_completion_lat_sum_us; // Sum of batch completion latencies (microseconds)
    uint64_t last_batch_read_bytes;      // Last batch read bytes
    uint64_t last_batch_write_bytes;     // Last batch write bytes
} CUfileStatsLevel1_t;

/**
 * @brief Level 2 Statistics: Includes Level 1 plus size histograms and detailed metrics
 *
 * Filled in by cuFileGetStatsL2().
 */
typedef struct CUfileStatsLevel2 {
    // Basic statistics (Level 1), embedded by value as the first member
    CUfileStatsLevel1_t basic;
    
    // IO size histograms: 32 buckets each.
    // NOTE(review): the bucket boundaries are not stated in this header - confirm against the implementation.
    uint64_t read_size_kb_hist[32];    // Histogram of read sizes
    uint64_t write_size_kb_hist[32];   // Histogram of write sizes
} CUfileStatsLevel2_t;

/**
 * @brief Per-GPU statistics structure used in Level 3
 *
 * One instance per GPU is stored in CUfileStatsLevel3_t::per_gpu_stats.
 */
typedef struct CUfilePerGpuStats {
    char uuid[CUFILE_GPU_UUID_LEN];       // GPU UUID (CUFILE_GPU_UUID_LEN chars)
    
    // Read operations
    uint64_t read_bytes;           // Total bytes read
    uint64_t read_bw_bytes_per_sec;// Read bandwidth in bytes per second
    uint64_t read_utilization;     // Read utilization percentage
    uint64_t read_duration_us;     // Read operation duration (microseconds)
    uint64_t n_total_reads;        // Total number of reads
    uint64_t n_p2p_reads;          // Number of PCIe P2PDMA reads
    uint64_t n_nvfs_reads;         // Number of nvidia-fs reads
    uint64_t n_posix_reads;        // Number of POSIX reads
    uint64_t n_unaligned_reads;    // Number of unaligned reads
    uint64_t n_dr_reads;           // Number of reads using dynamic routing
    uint64_t n_sparse_regions;     // Number of sparse regions
    uint64_t n_inline_regions;     // Number of inline regions
    uint64_t n_reads_err;          // Number of read errors
    
    // Write operations
    uint64_t writes_bytes;         // Total bytes written (note: named writes_bytes, unlike read_bytes above)
    uint64_t write_bw_bytes_per_sec;// Write bandwidth in bytes per second
    uint64_t write_utilization;    // Write utilization percentage  
    uint64_t write_duration_us;    // Write operation duration (microseconds)
    uint64_t n_total_writes;       // Total number of writes
    uint64_t n_p2p_writes;      // Number of PCIe P2PDMA writes
    uint64_t n_nvfs_writes;        // Number of nvidia-fs writes
    uint64_t n_posix_writes;       // Number of POSIX writes
    uint64_t n_unaligned_writes;   // Number of unaligned writes
    uint64_t n_dr_writes;          // Number of writes using dynamic routing
    uint64_t n_writes_err;         // Number of write errors
    
    // Buffer registration statistics
    uint64_t n_mmap;               // Number of buffer registrations
    uint64_t n_mmap_ok;            // Successful registrations
    uint64_t n_mmap_err;           // Failed registrations
    uint64_t n_mmap_free;          // Number of buffer deregistrations
    uint64_t reg_bytes;            // Total bytes registered
} CUfilePerGpuStats_t;

/**
 * @brief Level 3 Statistics: Includes Level 2 plus per-GPU and subsystem statistics
 *
 * Filled in by cuFileGetStatsL3().
 */
typedef struct CUfileStatsLevel3 {
    // Detailed statistics (Level 2), embedded by value as the first member
    CUfileStatsLevel2_t detailed;
    
    // Number of GPUs detected
    // NOTE(review): presumably only the first num_gpus entries of per_gpu_stats are meaningful - confirm.
    uint32_t num_gpus;
    
    // Per-GPU statistics (one array entry per GPU; fixed capacity of 16 entries)
    CUfilePerGpuStats_t per_gpu_stats[16]; // Using the maxGpus constant value from cufile_stats.h
} CUfileStatsLevel3_t;

/**
 * @brief Start collecting cuFile statistics.
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_DRIVER_NOT_INITIALIZED if the driver is not initialized
 *
 * @note Statistics level must be set using cuFileSetStatsLevel() before calling this function.
 *
 * @see cuFileSetStatsLevel
 */
CUfileError_t cuFileStatsStart(void);

/**
 * @brief Stop collecting cuFile statistics.
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_DRIVER_NOT_INITIALIZED if the driver is not initialized
 */
CUfileError_t cuFileStatsStop(void);

/**
 * @brief Reset all cuFile statistics counters.
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_DRIVER_NOT_INITIALIZED if the driver is not initialized
 */
CUfileError_t cuFileStatsReset(void);

/**
 * @brief Get Level 1 cuFile statistics.
 *
 * @param[out] stats Pointer to the CUfileStatsLevel1_t structure to be filled
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_INVALID_VALUE if stats is NULL or level 1 stats are not enabled
 * @return CU_FILE_DRIVER_NOT_INITIALIZED if the driver is not initialized
 */
CUfileError_t cuFileGetStatsL1(CUfileStatsLevel1_t *stats);

/**
 * @brief Get Level 2 cuFile statistics.
 *
 * @param[out] stats Pointer to the CUfileStatsLevel2_t structure to be filled
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_INVALID_VALUE if stats is NULL or level 2 stats are not enabled
 * @return CU_FILE_DRIVER_NOT_INITIALIZED if the driver is not initialized
 */
CUfileError_t cuFileGetStatsL2(CUfileStatsLevel2_t *stats);

/**
 * @brief Get Level 3 cuFile statistics.
 *
 * @param[out] stats Pointer to the CUfileStatsLevel3_t structure to be filled
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_INVALID_VALUE if stats is NULL or level 3 stats are not enabled
 * @return CU_FILE_DRIVER_NOT_INITIALIZED if the driver is not initialized
 */
CUfileError_t cuFileGetStatsL3(CUfileStatsLevel3_t *stats);

/**
 * @brief Get the BAR size for a specific GPU.
 *
 * @param[in]  gpuIndex GPU index to query
 * @param[out] barSize  Pointer to store the BAR size.
 *                      NOTE(review): the function name says KB, but this comment
 *                      previously stated MiB - confirm the unit against the implementation.
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_DRIVER_NOT_INITIALIZED or CU_FILE_INVALID_VALUE on error
 */
CUfileError_t cuFileGetBARSizeInKB(int gpuIndex, size_t *barSize);

/**
 * @brief Set both POSIX pool slab size and count parameters as a pair.
 *
 * @param[in] size_values  Array of slab sizes in KB
 * @param[in] count_values Array of slab counts
 * @param[in] len          Length of both arrays (must be the same)
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_INVALID_VALUE if inputs are invalid
 * @return CU_FILE_DRIVER_ALREADY_OPEN if driver is already open
 *
 * @see cuFileGetParameterPosixPoolSlabArray
 */
CUfileError_t cuFileSetParameterPosixPoolSlabArray(
                                     const size_t *size_values,
                                     const size_t *count_values,
                                     int len);

/**
 * @brief Get both POSIX pool slab size and count parameters as a pair.
 *
 * @param[out] size_values  Buffer to receive slab sizes in KB
 * @param[out] count_values Buffer to receive slab counts
 * @param[in]  len          Buffer size (must match the actual parameter length)
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_INVALID_VALUE if inputs are invalid or buffer size doesn't match
 *
 * @see cuFileSetParameterPosixPoolSlabArray
 */
CUfileError_t cuFileGetParameterPosixPoolSlabArray(
                                     size_t *size_values,
                                     size_t *count_values,
                                     int len);

/**
 * @brief Gets the P2P flags for a specific filesystem or block device.
 *
 * @param[in]  status_flag The filesystem/device status flag (e.g., CU_FILE_LUSTRE_SUPPORTED)
 * @param[out] p2p_flags   Pointer to store the P2P flags
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_DRIVER_NOT_INITIALIZED if the driver is not initialized
 * @return CU_FILE_INVALID_VALUE if p2p_flags is NULL
 *
 * @see cuFileDriverSetP2PFlags
 * @see cuFileDriverGetProperties
 */
CUfileError_t cuFileDriverGetP2PFlags(CUfileDriverStatusFlags_t status_flag, CUfileP2PFlags_t *p2p_flags);

/**
 * @brief Sets the P2P flags for a specific filesystem or block device.
 *
 * This function configures the P2P and NVFS capabilities for a given filesystem
 * or block device type. The flags parameter should be a bitmask of the desired
 * P2P capabilities.
 *
 * @param[in] status_flag The filesystem/device status flag (e.g., CU_FILE_LUSTRE_SUPPORTED,
 *                        CU_FILE_NVME_SUPPORTED, etc.)
 * @param[in] p2p_flags   The P2P flags bitmask to set
 *
 * @return CU_FILE_SUCCESS on success
 * @return CU_FILE_DRIVER_NOT_INITIALIZED if the cuFile driver is not initialized
 * @return CU_FILE_INVALID_VALUE if status_flag is invalid
 * @return CU_FILE_P2P_FLAG_NOT_SUPPORTED if p2p_flags contains flags not supported
 *                               on the current platform
 *
 * @par Platform-Specific Restrictions:
 * This API enforces strict platform-specific flag restrictions:
 * - **x86_64 platforms**: Only CU_FILE_P2P_FLAG_PCI_P2PDMA is allowed
 * - **AARCH64 platforms**: Only CU_FILE_P2P_FLAG_C2C is allowed
 * - **Cross-platform flags**: CU_FILE_P2P_FLAG_NVFS and CU_FILE_P2P_FLAG_DMABUF are NOT allowed
 *                            to be set via this API (they are managed internally by the driver)
 *
 * @par Usage Example:
 * @code
 * // x86_64 platform - only P2PDMA allowed
 * #ifndef AARCH64_PLATFORM
 *     CUfileP2PFlags_t flags = CU_FILE_P2P_FLAG_PCI_P2PDMA;
 *     CUfileError_t error = cuFileDriverSetP2PFlags(CU_FILE_NVME_SUPPORTED, flags);
 *     if (error.err != CU_FILE_SUCCESS) {
 *         fprintf(stderr, "Failed to set P2P flags: %d\n", error.err);
 *     }
 * #endif
 *
 * // AARCH64 platform - only C2C allowed
 * #ifdef AARCH64_PLATFORM
 *     CUfileP2PFlags_t flags = CU_FILE_P2P_FLAG_C2C;
 *     CUfileError_t error = cuFileDriverSetP2PFlags(CU_FILE_NVME_SUPPORTED, flags);
 *     if (error.err != CU_FILE_SUCCESS) {
 *         fprintf(stderr, "Failed to set P2P flags: %d\n", error.err);
 *     }
 * #endif
 *
 * // This will fail on any platform (NVFS not allowed via API).
 * // `error` is the variable declared in the platform branch above.
 * CUfileP2PFlags_t invalid_flags = CU_FILE_P2P_FLAG_NVFS;
 * error = cuFileDriverSetP2PFlags(CU_FILE_NVME_SUPPORTED, invalid_flags);
 * // error.err will be CU_FILE_P2P_FLAG_NOT_SUPPORTED
 * @endcode
 *
 * @warning This API only allows platform-specific P2P flags (P2PDMA on x86_64, C2C on AARCH64)
 * @warning NVFS and DMABUF flags are managed internally by the driver and cannot be set via this API
 * @warning Setting unsupported flags will result in CU_FILE_P2P_FLAG_NOT_SUPPORTED error
 *
 * @note The function validates that only platform-appropriate flags are provided
 * @note Use cuFileDriverGetP2PFlags() to query current flag status including NVFS and DMABUF
 *
 * @see cuFileDriverGetP2PFlags
 */
CUfileError_t cuFileDriverSetP2PFlags(CUfileDriverStatusFlags_t status_flag, CUfileP2PFlags_t p2p_flags);

#pragma GCC visibility pop

/// @cond DOXYGEN_SKIP_MACRO
#endif // CUFILE_H
/// @endcond
#ifdef __cplusplus
}
#endif
