Skip to main content

Python API Reference

Complete API reference for Veloxx Python bindings.

Installation

pip install veloxx

Quick Start

import veloxx as vx

# Create a DataFrame
df = vx.PyDataFrame({
    "name": vx.PySeries("name", ["Alice", "Bob", "Charlie"]),
    "age": vx.PySeries("age", [25, 30, 35]),
    "salary": vx.PySeries("salary", [50000.0, 75000.0, 60000.0])
})

# Basic operations
# Filter rows where age > 25
age_series = df.get_column("age")
filtered_indices = [i for i, age in enumerate(age_series.to_list()) if age is not None and age > 25]
filtered = df.filter(filtered_indices)

# Group by age and calculate mean salary
grouped = df.group_by(["age"])
mean_salary = grouped.agg([("salary", "mean")])

Core Classes

PyDataFrame

The main data structure for working with tabular data in Python.

Constructors

PyDataFrame(columns: dict)

Creates a new DataFrame from a dictionary of column names to PySeries.

Parameters:

columns: dict - Dictionary mapping column names to PySeries objects

Example:

import veloxx as vx

df = vx.PyDataFrame({
    "name": vx.PySeries("name", ["Alice", "Bob", "Charlie"]),
    "age": vx.PySeries("age", [25, 30, 35]),
    "salary": vx.PySeries("salary", [50000.0, 75000.0, 60000.0])
})

Class Methods

@classmethod from_csv(path: str) -> PyDataFrame

Loads a DataFrame from a CSV file with automatic type inference.

Parameters:

path: str - Path to the CSV file

Example:

df = vx.PyDataFrame.from_csv("data/employees.csv")
print(f"Loaded {df.row_count()} rows")
@classmethod from_json(path: str) -> PyDataFrame

Loads a DataFrame from a JSON file.

Parameters:

path: str - Path to the JSON file

Example:

df = vx.PyDataFrame.from_json("data/users.json")

Properties

row_count() -> int

Returns the number of rows in the DataFrame.

Example:

print(f"DataFrame has {df.row_count()} rows")
column_count() -> int

Returns the number of columns in the DataFrame.

Example:

print(f"DataFrame has {df.column_count()} columns")
column_names() -> List[str]

Returns a list of column names.

Example:

names = df.column_names()
for name in names:
    print(f"Column: {name}")

Data Access

get_column(name: str) -> Optional[PySeries]

Gets a column by name.

Parameters:

name: str - Name of the column to retrieve

Example:

age_column = df.get_column("age")
if age_column:
    print(f"Age column has {age_column.len()} values")
getitem(key: str) -> PySeries

Gets a column using bracket notation (syntactic sugar).

Example:

# These are equivalent
age1 = df.get_column("age")
age2 = df["age"]

Data Manipulation

filter(row_indices: List[int]) -> PyDataFrame

Filters rows by index positions.

Parameters:

row_indices: List[int] - List of row indices to keep

Example:

# Filter rows where age > 25
age_series = df.get_column("age")
indices = [i for i, age in enumerate(age_series.to_list()) if age is not None and age > 25]
filtered_df = df.filter(indices)
select_columns(names: List[str]) -> PyDataFrame

Selects specific columns from the DataFrame.

Parameters:

names: List[str] - Names of columns to select

Example:

selected = df.select_columns(["name", "age"])
drop_columns(names: List[str]) -> PyDataFrame

Removes specified columns from the DataFrame.

Parameters:

names: List[str] - Names of columns to drop

Example:

without_id = df.drop_columns(["id"])
rename_column(old_name: str, new_name: str) -> PyDataFrame

Renames a column in the DataFrame.

Parameters:

old_name: str - Current name of the column

new_name: str - New name for the column

Example:

renamed = df.rename_column("age", "years")
with_column(name: str, expr: PyExpr) -> PyDataFrame

Adds a new column or replaces an existing one using an expression.

Parameters:

name: str - Name of the new column

expr: PyExpr - Expression to compute the column values

Example:

# Add a column with salary + 1000 bonus
expr = vx.PyExpr.add(
    vx.PyExpr.column("salary"),
    vx.PyExpr.literal(1000.0)
)
with_bonus = df.with_column("salary_with_bonus", expr)

Grouping and Aggregation

group_by(by_columns: List[str]) -> PyGroupedDataFrame

Groups the DataFrame by specified columns.

Parameters:

by_columns: List[str] - Columns to group by

Example:

grouped = df.group_by(["department"])
result = grouped.mean()
describe() -> PyDataFrame

Generates descriptive statistics for numeric columns.

Example:

stats = df.describe()
print(stats)

Statistical Methods

correlation(col1_name: str, col2_name: str) -> float

Calculates the Pearson correlation between two numeric columns.

Parameters:

col1_name: str - Name of the first column

col2_name: str - Name of the second column

Example:

corr = df.correlation("age", "salary")
print(f"Age-Salary correlation: {corr:.3f}")
covariance(col1_name: str, col2_name: str) -> float

Calculates the covariance between two numeric columns.

Parameters:

col1_name: str - Name of the first column

col2_name: str - Name of the second column

Example:

cov = df.covariance("age", "salary")
print(f"Age-Salary covariance: {cov:.2f}")

Joining

join(other: PyDataFrame, on_column: str, join_type: PyJoinType) -> PyDataFrame

Joins this DataFrame with another DataFrame.

Parameters:

other: PyDataFrame - DataFrame to join with

on_column: str - Column name to join on

join_type: PyJoinType - Type of join (Inner, Left, Right)

Example:

joined = df1.join(df2, "user_id", vx.PyJoinType.Inner)

Sorting and Ordering

sort(by_columns: List[str], ascending: bool = True) -> PyDataFrame

Sorts the DataFrame by specified columns.

Parameters:

by_columns: List[str] - Columns to sort by

ascending: bool - Sort order (default: True)

Example:

sorted_df = df.sort(["age", "name"], ascending=True)

Data Cleaning

drop_nulls(subset: Optional[List[str]] = None) -> PyDataFrame

Removes rows containing any null values. If subset is provided, only nulls in those columns are considered.

Parameters:

subset: Optional[List[str]] - List of column names to consider for dropping nulls. If None, all columns are considered.

Example:

clean_df = df.drop_nulls()
# Drop rows with nulls only in 'age' or 'salary'
clean_df_subset = df.drop_nulls(subset=['age', 'salary'])
fill_nulls(value: Any) -> PyDataFrame

Fills null values with a specified value. The filling only occurs if the value's type matches the DataType of the column being processed.

Parameters:

value: Any - Value to use for filling nulls

Example:

filled = df.fill_nulls(0)  # Fill with 0
filled_str = df.fill_nulls("Unknown") # Fill with string

I/O Operations

to_csv(path: str) -> None

Writes the DataFrame to a CSV file.

Parameters:

path: str - Output file path

Example:

df.to_csv("output/results.csv")

Concatenation

append(other: PyDataFrame) -> PyDataFrame

Appends another DataFrame vertically.

Parameters:

other: PyDataFrame - DataFrame to append

Example:

combined = df1.append(df2)

PyGroupedDataFrame

Represents a grouped DataFrame for aggregation operations.

Aggregation Methods

sum() -> PyDataFrame

Calculates the sum for each group.

Example:

grouped = df.group_by(["department"])
sums = grouped.sum()
mean() -> PyDataFrame

Calculates the mean for each group.

Example:

averages = grouped.mean()
count() -> PyDataFrame

Counts values for each group.

Example:

counts = grouped.count()
min() -> PyDataFrame

Finds the minimum value for each group.

Example:

minimums = grouped.min()
max() -> PyDataFrame

Finds the maximum value for each group.

Example:

maximums = grouped.max()
agg(aggregations: List[Tuple[str, str]]) -> PyDataFrame

Performs custom aggregations.

Parameters:

aggregations: List[Tuple[str, str]] - List of (column, aggregation_function) tuples

Example:

result = grouped.agg([
    ("salary", "mean"),
    ("age", "count"),
    ("experience", "max")
])

PySeries

Represents a single column of data.

Constructors

PySeries(name: str, data: List[Any])

Creates a new Series with automatic type inference.

Parameters:

name: str - Name of the series

data: List[Any] - List of values (supports None for nulls)

Example:

# Integer series
ages = vx.PySeries("age", [25, 30, None, 35])

# String series
names = vx.PySeries("name", ["Alice", "Bob", None, "Charlie"])

# Float series
salaries = vx.PySeries("salary", [50000.0, 75000.0, 60000.0])

# Boolean series
active = vx.PySeries("is_active", [True, False, True])

Properties

name() -> str

Returns the name of the Series.

Example:

print(f"Series name: {series.name()}")
len() -> int

Returns the length of the Series.

Example:

print(f"Series has {series.len()} values")
is_empty() -> bool

Checks if the Series is empty.

Example:

if series.is_empty():
    print("Series is empty")
data_type() -> PyDataType

Returns the data type of the Series.

Example:

dtype = series.data_type()
print(f"Series type: {dtype}")

Data Access

get_value(index: int) -> Any

Gets the value at a specific index.

Parameters:

index: int - Index of the value to retrieve

Example:

first_value = series.get_value(0)
print(f"First value: {first_value}")
to_list() -> List[Any]

Converts the Series to a Python list.

Example:

values = series.to_list()
for value in values:
    if value is not None:
        print(value)

Statistical Methods

sum() -> float

Calculates the sum of numeric values.

Example:

total = series.sum()
print(f"Sum: {total}")
mean() -> float

Calculates the mean of numeric values.

Example:

average = series.mean()
print(f"Average: {average}")
median() -> float

Calculates the median of numeric values.

Example:

median = series.median()
print(f"Median: {median}")
min() -> Any

Finds the minimum value.

Example:

minimum = series.min()
print(f"Minimum: {minimum}")
max() -> Any

Finds the maximum value.

Example:

maximum = series.max()
print(f"Maximum: {maximum}")
std_dev() -> float

Calculates the standard deviation.

Example:

std_dev = series.std_dev()
print(f"Standard deviation: {std_dev}")
count() -> int

Counts non-null values.

Example:

non_null_count = series.count()
print(f"Non-null values: {non_null_count}")
unique() -> PySeries

Returns a Series with unique values.

Example:

unique_values = series.unique()
print(f"Unique values: {unique_values.len()}")
correlation(other: PySeries) -> float

Calculates the Pearson correlation between two numeric Series.

Parameters:

other: PySeries - Other series to correlate with

Example:

corr = age_series.correlation(salary_series)
print(f"Correlation: {corr}")
covariance(other: PySeries) -> float

Calculates the covariance between two numeric Series.

Parameters:

other: PySeries - Other series to calculate covariance with

Example:

cov = age_series.covariance(salary_series)
print(f"Covariance: {cov}")
interpolate_nulls() -> PySeries

Interpolates null values using linear interpolation for numeric Series.

Example:

s = vx.PySeries("data", [1, None, 3, None, 5])
interpolated_s = s.interpolate_nulls()
print(f"Interpolated: {interpolated_s.to_list()}")
append(other: PySeries) -> PySeries

Appends another Series to this one.

Parameters:

other: PySeries - Series to append

Example:

s1 = vx.PySeries("data", [1, 2])
s2 = vx.PySeries("data", [3, 4])
combined = s1.append(s2)
print(f"Combined: {combined.to_list()}")

Data Manipulation

filter(row_indices: List[int]) -> PySeries

Filters the Series by index positions.

Parameters:

row_indices: List[int] - List of indices to keep

Example:

filtered = series.filter([0, 2, 4])  # Keep indices 0, 2, 4
fill_nulls(value: Any) -> PySeries

Fills null values with a specified value.

Parameters:

value: Any - Value to use for filling nulls

Example:

filled = series.fill_nulls(0)

PyExpr

Represents expressions for computed columns.

Static Methods

@staticmethod column(name: str) -> PyExpr

Creates a column reference expression.

Parameters:

name: str - Name of the column to reference

Example:

expr = vx.PyExpr.column("salary")
@staticmethod literal(value: Any) -> PyExpr

Creates a literal value expression.

Parameters:

value: Any - The literal value

Example:

expr = vx.PyExpr.literal(1000.0)

Comparison Operations

@staticmethod equals(left: PyExpr, right: PyExpr) -> PyExpr

Creates an equality comparison expression.

Example:

expr = vx.PyExpr.equals(
    vx.PyExpr.column("status"),
    vx.PyExpr.literal("active")
)
@staticmethod not_equals(left: PyExpr, right: PyExpr) -> PyExpr

Creates a not-equals comparison expression.

@staticmethod greater_than(left: PyExpr, right: PyExpr) -> PyExpr

Creates a greater-than comparison expression.

@staticmethod less_than(left: PyExpr, right: PyExpr) -> PyExpr

Creates a less-than comparison expression.

@staticmethod greater_than_or_equal(left: PyExpr, right: PyExpr) -> PyExpr

Creates a greater-than-or-equal comparison expression.

@staticmethod less_than_or_equal(left: PyExpr, right: PyExpr) -> PyExpr

Creates a less-than-or-equal comparison expression.

Logical Operations

@staticmethod and_(left: PyExpr, right: PyExpr) -> PyExpr

Creates a logical AND expression.

Example:

expr = vx.PyExpr.and_(
    vx.PyExpr.greater_than(vx.PyExpr.column("age"), vx.PyExpr.literal(18)),
    vx.PyExpr.equals(vx.PyExpr.column("status"), vx.PyExpr.literal("active"))
)
@staticmethod or_(left: PyExpr, right: PyExpr) -> PyExpr

Creates a logical OR expression.

@staticmethod not_(expr: PyExpr) -> PyExpr

Creates a logical NOT expression.

Parameters:

expr: PyExpr - The expression to negate

PyJoinType

Enumeration for join types.

class PyJoinType:
Inner = "Inner"
Left = "Left"
Right = "Right"

Example:

joined = df1.join(df2, "user_id", vx.PyJoinType.Left)

Usage Patterns

Basic Data Analysis

import veloxx as vx

# Create sample data
df = vx.PyDataFrame({
    "product": vx.PySeries("product", ["Laptop", "Mouse", "Keyboard", "Monitor"]),
    "region": vx.PySeries("region", ["North", "South", "North", "East"]),
    "sales": vx.PySeries("sales", [1200.0, 25.0, 75.0, 300.0]),
    "quantity": vx.PySeries("quantity", [2, 5, 3, 1]),
    "customer_id": vx.PySeries("customer_id", [101, 102, 101, 103]),
})

# Basic info
print(f"Dataset: {df.row_count()} rows, {df.column_count()} columns")
print(f"Columns: {df.column_names()}")

# Filter high-value sales (sales > 100)
high_value_indices = []
amount_series = df.get_column("sales")
for i, amount in enumerate(amount_series.to_list()):
    if amount is not None and amount > 100:
        high_value_indices.append(i)

high_value_sales = df.filter(high_value_indices)
print("\nHigh-Value Sales:")
print(high_value_sales)

# Group by region and aggregate
summary = high_value_sales.group_by(["region"]).agg([
    ("sales", "sum"),
    ("sales", "mean"),
    ("customer_id", "count")
])
print("\nRegional Sales Summary:")
print(summary)

Advanced Analytics

import veloxx as vx

def analyze_customer_data():
    # Create sample data for customers and orders
    customers_df = vx.PyDataFrame({
        "customer_id": vx.PySeries("customer_id", [1, 2, 3]),
        "name": vx.PySeries("name", ["Alice", "Bob", "Charlie"]),
        "segment": vx.PySeries("segment", ["Premium", "Standard", "Premium"]),
    })
    orders_df = vx.PyDataFrame({
        "order_id": vx.PySeries("order_id", [101, 102, 103, 104]),
        "customer_id": vx.PySeries("customer_id", [1, 2, 1, 3]),
        "order_value": vx.PySeries("order_value", [100.0, 50.0, 150.0, 75.0]),
        "order_frequency": vx.PySeries("order_frequency", [10, 5, 15, 8]),
    })

    # Join datasets
    customer_orders = customers_df.join(orders_df, "customer_id", vx.PyJoinType.Inner)

    # Calculate customer lifetime value
    clv_expr = vx.PyExpr.multiply(
        vx.PyExpr.column("order_value"),
        vx.PyExpr.column("order_frequency")
    )

    with_clv = customer_orders.with_column("lifetime_value", clv_expr)

    # Segment customers (lifetime_value > 1000)
    high_value_indices = []
    clv_series = with_clv.get_column("lifetime_value")
    for i, clv in enumerate(clv_series.to_list()):
        if clv is not None and clv > 1000:
            high_value_indices.append(i)

    high_value_customers = with_clv.filter(high_value_indices)

    # Analyze by segment
    segment_analysis = high_value_customers.group_by(["segment"]).agg([
        ("lifetime_value", "mean"),
        ("order_frequency", "mean"),
        ("customer_id", "count")
    ])

    return segment_analysis

# Run analysis
results = analyze_customer_data()
print("\nAdvanced Analytics Results:")
print(results)

Data Cleaning Pipeline

import veloxx as vx

def clean_dataset(df):
    """Clean and prepare dataset for analysis"""

    # Remove rows with missing critical data
    clean_df = df.drop_nulls()

    # Fill missing values in optional columns
    filled_df = clean_df.fill_nulls("Unknown")

    # Remove outliers (example: ages > 100)
    age_series = filled_df.get_column("age")
    valid_indices = []
    for i, age in enumerate(age_series.to_list()):
        if age is not None and 0 <= age <= 100:
            valid_indices.append(i)

    filtered_df = filled_df.filter(valid_indices)

    # Standardize column names
    standardized = filtered_df.rename_column("customer_name", "name")
    standardized = standardized.rename_column("customer_age", "age")

    return standardized

# Usage
raw_data = vx.PyDataFrame({
    "customer_name": vx.PySeries("customer_name", ["Alice", None, "Charlie"]),
    "customer_age": vx.PySeries("customer_age", [30, 150, 25]),
    "product": vx.PySeries("product", ["A", "B", "C"]),
})
clean_data = clean_dataset(raw_data)
print("\nCleaned Data:")
print(clean_data)

Performance Tips

  1. Use appropriate data types: Let Veloxx infer types automatically for best performance
  2. Filter early: Apply filters before expensive operations like joins
  3. Use expressions for vectorized operations: Leverage the PyExpr system for efficient column-wise computations instead of Python loops.
  4. Process in chunks: For very large datasets, process in smaller chunks to manage memory.
  5. Minimize data copying: Chain operations when possible to avoid unnecessary data duplication.

Error Handling

Veloxx operations can raise VeloxxError exceptions. It's recommended to catch specific error types for robust error management.

import veloxx as vx
from veloxx import VeloxxError

try:
    # Example: Attempt to load a non-existent file
    df = vx.PyDataFrame.from_csv("non_existent_file.csv")
    print(df)
except VeloxxError as e:
    print(f"Veloxx Error: {e}")
    # You can check the error type for more specific handling
    if "file not found" in str(e).lower():
        print("Please ensure the CSV file exists.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Integration with Pandas

Veloxx provides seamless integration with Pandas DataFrames, allowing you to convert data between the two libraries.

import veloxx as vx
import pandas as pd

# Convert Pandas DataFrame to Veloxx PyDataFrame
def pandas_to_veloxx(pandas_df: pd.DataFrame) -> vx.PyDataFrame:
    columns = {}
    for col_name in pandas_df.columns:
        # Convert the Pandas Series to a Python list, mapping NaN/NA values to None
        series = pandas_df[col_name]
        data = series.where(pd.notna(series), None).tolist()
        columns[col_name] = vx.PySeries(col_name, data)
    return vx.PyDataFrame(columns)

# Convert Veloxx PyDataFrame to Pandas DataFrame
def veloxx_to_pandas(veloxx_df: vx.PyDataFrame) -> pd.DataFrame:
    data = {}
    for col_name in veloxx_df.column_names():
        series = veloxx_df.get_column(col_name)
        if series:
            data[col_name] = series.to_list()
    return pd.DataFrame(data)

# Usage Example
# Create a sample Pandas DataFrame
pandas_df_original = pd.DataFrame({
    "id": [1, 2, 3],
    "value": [10.5, pd.NA, 30.0],
    "category": ["A", "B", "A"]
})
print("\nOriginal Pandas DataFrame:")
print(pandas_df_original)

# Convert Pandas to Veloxx
veloxx_df_converted = pandas_to_veloxx(pandas_df_original)
print("\nConverted Veloxx DataFrame:")
print(veloxx_df_converted)

# Perform some Veloxx operations (e.g., fill nulls)
veloxx_df_processed = veloxx_df_converted.fill_nulls(0.0)
print("\nProcessed Veloxx DataFrame (nulls filled):")
print(veloxx_df_processed)

# Convert Veloxx back to Pandas
pandas_df_final = veloxx_to_pandas(veloxx_df_processed)
print("\nFinal Pandas DataFrame:")
print(pandas_df_final)