Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions vortex-ffi/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ categories = { workspace = true }
all-features = true

[dependencies]
arrow-array = { workspace = true, features = ["ffi"] }
async-fs = { workspace = true }
futures = { workspace = true }
itertools = { workspace = true }
Expand Down
2 changes: 1 addition & 1 deletion vortex-ffi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,6 @@ cmake -Bbuild -DWITH_ASAN=1 -DTARGET_TRIPLE=<target triple>

3. Run the tests (ctest doesn't output failures in detail):

```
```sh
./build/test/vortex_ffi_test 2>& 1 | rustfilt -i-
```
41 changes: 41 additions & 0 deletions vortex-ffi/cbindgen.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,51 @@ style = "type"
header = """
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors
#include <stdint.h>

//
// THIS FILE IS AUTO-GENERATED, DO NOT MAKE EDITS DIRECTLY
//

// https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions
// We don't want to bundle nanoarrow or similar just for these two definitions.
// If you use your own Arrow library, define this macro and
// typedef FFI_ArrowSchema ArrowSchema;
// typedef FFI_ArrowArrayStream ArrowArrayStream;
#ifndef USE_OWN_ARROW
struct ArrowSchema {
const char* format;
const char* name;
const char* metadata;
int64_t flags;
int64_t n_children;
struct ArrowSchema** children;
struct ArrowSchema* dictionary;
void (*release)(struct ArrowSchema*);
void* private_data;
};
struct ArrowArray {
int64_t length;
int64_t null_count;
int64_t offset;
int64_t n_buffers;
int64_t n_children;
const void** buffers;
struct ArrowArray** children;
struct ArrowArray* dictionary;
void (*release)(struct ArrowArray*);
void* private_data;
};
struct ArrowArrayStream {
int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out);
int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out);
const char* (*get_last_error)(struct ArrowArrayStream*);
void (*release)(struct ArrowArrayStream*);
void* private_data;
};
typedef ArrowSchema FFI_ArrowSchema;
typedef ArrowArrayStream FFI_ArrowArrayStream;
#endif
"""

[export.rename]
Expand Down
232 changes: 232 additions & 0 deletions vortex-ffi/cinclude/vortex.h
Original file line number Diff line number Diff line change
@@ -1,10 +1,51 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors
#include <stdint.h>

//
// THIS FILE IS AUTO-GENERATED, DO NOT MAKE EDITS DIRECTLY
//

// https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions
// We don't want to bundle nanoarrow or similar just for these two definitions.
// If you use your own Arrow library, define this macro and
// typedef FFI_ArrowSchema ArrowSchema;
// typedef FFI_ArrowArrayStream ArrowArrayStream;
#ifndef USE_OWN_ARROW
struct ArrowSchema {
const char *format;
const char *name;
const char *metadata;
int64_t flags;
int64_t n_children;
struct ArrowSchema **children;
struct ArrowSchema *dictionary;
void (*release)(struct ArrowSchema *);
void *private_data;
};
struct ArrowArray {
int64_t length;
int64_t null_count;
int64_t offset;
int64_t n_buffers;
int64_t n_children;
const void **buffers;
struct ArrowArray **children;
struct ArrowArray *dictionary;
void (*release)(struct ArrowArray *);
void *private_data;
};
struct ArrowArrayStream {
int (*get_schema)(struct ArrowArrayStream *, struct ArrowSchema *out);
int (*get_next)(struct ArrowArrayStream *, struct ArrowArray *out);
const char *(*get_last_error)(struct ArrowArrayStream *);
void (*release)(struct ArrowArrayStream *);
void *private_data;
};
typedef ArrowSchema FFI_ArrowSchema;
typedef ArrowArrayStream FFI_ArrowArrayStream;
#endif

#pragma once

#include <stdarg.h>
Expand Down Expand Up @@ -134,6 +175,12 @@ typedef enum {
VX_VALIDITY_ARRAY = 3,
} vx_validity_type;

typedef enum {
VX_CARD_UNKNOWN = 0,
VX_CARD_ESTIMATE = 1,
VX_CARD_MAXIMUM = 2,
} vx_cardinality;

/**
* Equalities, inequalities, and boolean operations over possibly null values.
* For most operations, if either side is null, the result is null.
Expand Down Expand Up @@ -224,6 +271,18 @@ typedef enum {
LOG_LEVEL_TRACE = 5,
} vx_log_level;

typedef enum {
VX_S_INCLUDE_ALL = 0,
VX_S_INCLUDE_RANGE = 1,
VX_S_EXCLUDE_RANGE = 2,
} vx_scan_selection_include;

typedef enum {
VX_ESTIMATE_UNKNOWN = 0,
VX_ESTIMATE_EXACT = 1,
VX_ESTIMATE_INEXACT = 2,
} vx_estimate_boundary;

/**
* Physical type enum, represents the in-memory physical layout but might represent a different logical type.
*/
Expand Down Expand Up @@ -315,6 +374,8 @@ typedef struct Nullability Nullability;

typedef struct Primitive Primitive;

typedef struct VxFileHandle VxFileHandle;

/**
* Arrays are reference-counted handles to owned memory buffers that hold
* scalars. These buffers can be held in a number of physical encodings to
Expand Down Expand Up @@ -369,6 +430,14 @@ typedef struct vx_array_sink vx_array_sink;
*/
typedef struct vx_binary vx_binary;

/**
* A data source is a reference to multiple possibly remote files. When
* created, it opens first file to determine the schema from DType, all
* other operations are deferred till a scan is requested. You can request
* multiple file scans from a data source
*/
typedef struct vx_data_source vx_data_source;

/**
* A Vortex data type.
*
Expand Down Expand Up @@ -403,6 +472,14 @@ typedef struct vx_expression vx_expression;
*/
typedef struct vx_file vx_file;

/**
* A Partition is a unit of work for a worker thread from which you can
* get vx_arrays.
*/
typedef struct vx_partition vx_partition;

typedef struct vx_scan vx_scan;

/**
* A handle to a Vortex session.
*/
Expand Down Expand Up @@ -435,6 +512,67 @@ typedef struct {
const vx_array *array;
} vx_validity;

typedef int (*vx_fs_use_vortex)(const char *schema, const char *path);

typedef void (*vx_fs_set_userdata)(void *userdata);

typedef void (*vx_fs_open)(void *userdata, const char *path, vx_error **err);

typedef void (*vx_fs_create)(void *userdata, const char *path, vx_error **err);

typedef void (*vx_list_callback)(void *userdata, const char *name, int is_dir);

typedef void (*vx_fs_list)(const void *userdata,
const char *path,
vx_list_callback callback,
vx_error **error);

typedef const VxFileHandle *vx_file_handle;

typedef void (*vx_fs_close)(vx_file_handle handle);

typedef uint64_t (*vx_fs_size)(vx_file_handle handle, vx_error **err);

typedef uint64_t (
*vx_fs_read)(vx_file_handle handle, uint64_t offset, size_t len, uint8_t *buffer, vx_error **err);

typedef uint64_t (
*vx_fs_write)(vx_file_handle handle, uint64_t offset, size_t len, uint8_t *buffer, vx_error **err);

typedef void (*vx_fs_sync)(vx_file_handle handle, vx_error **err);

typedef void (*vx_glob_callback)(void *userdata, const char *file);

typedef void (*vx_glob)(const char *glob, vx_glob_callback callback, vx_error **err);

/**
* Host must either implement all or none of fs_* callbacks.
*/
typedef struct {
const char *files;
/**
* Whether to use Vortex filesystem or host's filesystem.
* Return 1 to use Vortex for a given schema ("file", "s3") and path.
* Return 0 to use host's filesystem.
*/
vx_fs_use_vortex fs_use_vortex;
vx_fs_set_userdata fs_set_userdata;
vx_fs_open fs_open;
vx_fs_create fs_create;
vx_fs_list fs_list;
vx_fs_close fs_close;
vx_fs_size fs_size;
vx_fs_read fs_read;
vx_fs_write fs_write;
vx_fs_sync fs_sync;
vx_glob glob;
} vx_data_source_options;

typedef struct {
vx_cardinality cardinality;
uint64_t rows;
} vx_data_source_row_count;

/**
* Options supplied for opening a file.
*/
Expand Down Expand Up @@ -497,6 +635,28 @@ typedef struct {
unsigned long row_offset;
} vx_file_scan_options;

typedef struct {
uint64_t *idx;
size_t idx_len;
vx_scan_selection_include include;
} vx_scan_selection;

typedef struct {
const vx_expression *projection;
const vx_expression *filter;
uint64_t row_range_begin;
uint64_t row_range_end;
vx_scan_selection selection;
uint64_t limit;
uint64_t max_threads;
int ordered;
} vx_scan_options;

typedef struct {
uint64_t estimate;
vx_estimate_boundary type;
} vx_estimate;

#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
Expand Down Expand Up @@ -711,6 +871,34 @@ size_t vx_binary_len(const vx_binary *ptr);
*/
const char *vx_binary_ptr(const vx_binary *ptr);

/**
* Clone a borrowed [`vx_data_source`], returning an owned [`vx_data_source`].
*
*
* Must be released with [`vx_data_source_free`].
*/
const vx_data_source *vx_data_source_clone(const vx_data_source *ptr);

/**
* Free an owned [`vx_data_source`] object.
*/
void vx_data_source_free(const vx_data_source *ptr);

/**
* Create a new owned datasource which must be freed by the caller
*/
const vx_data_source *
vx_data_source_new(const vx_session *session, const vx_data_source_options *opts, vx_error **err);

/**
* Create a non-owned dtype referencing dataframe.
* This dtype's lifetime is bound to underlying data source.
* Caller should not free this dtype.
*/
const vx_dtype *vx_data_source_dtype(const vx_data_source *ds);

void vx_data_source_get_row_count(const vx_data_source *ds, vx_data_source_row_count *rc);

/**
* Clone a borrowed [`vx_dtype`], returning an owned [`vx_dtype`].
*
Expand Down Expand Up @@ -854,6 +1042,8 @@ uint8_t vx_dtype_time_unit(const DType *dtype);
*/
const vx_string *vx_dtype_time_zone(const DType *dtype);

void vx_type_to_arrow_schema(const vx_dtype *_dtype, FFI_ArrowSchema *_schema, vx_error **_err);

/**
* Free an owned [`vx_error`] object.
*/
Expand Down Expand Up @@ -1034,6 +1224,47 @@ vx_array_iterator *vx_file_scan(const vx_session *session,
*/
void vx_set_log_level(vx_log_level level);

/**
* Free an owned [`vx_scan`] object.
*/
void vx_scan_free(vx_scan *ptr);

/**
* Free an owned [`vx_partition`] object.
*/
void vx_partition_free(vx_partition *ptr);

vx_scan *vx_data_source_scan(const vx_data_source *data_source,
const vx_scan_options *options,
vx_estimate *estimate,
vx_error **err);

/**
* Get next owned partition out of a scan request.
* Caller must free this partition using vx_partition_free.
* This method is thread-safe.
* If using in a sync multi-thread runtime, users are encouraged to create a
* worker thread per partition.
* Returns NULL and doesn't set err on exhaustion.
* Returns NULL and sets err on error.
*/
vx_partition *vx_scan_next(vx_scan *scan, vx_error **err);

void vx_partition_row_count(const vx_partition *partition, vx_estimate *count, vx_error **err);

void vx_partition_scan_arrow(const vx_partition *_partition, FFI_ArrowArrayStream *_stream, vx_error **err);

/**
* Get next vx_array out of this partition.
* Thread-unsafe.
*/
const vx_array *vx_partition_next(vx_partition *partition, vx_error **err);

/**
* Scan progress between 0.0 and 1.0
*/
double vx_scan_progress(const vx_scan *_scan);

/**
* Free an owned [`vx_session`] object.
*/
Expand Down Expand Up @@ -1115,6 +1346,7 @@ void vx_struct_column_builder_free(vx_struct_column_builder *ptr);
/**
* Create a new column-wise struct array builder with given validity and a
* capacity hint. validity can't be NULL.
* Capacity hint is for the number of columns.
* If you don't know capacity, pass 0.
* if validity is NULL, returns NULL.
*/
Expand Down
Loading
Loading