Fixed-Length vs. Variable-Length Storage in HDF5
HDF5 gives you two ways to store “string-like” or array-like data: fixed-length and variable-length. Each comes with trade-offs, and we benchmarked them head-to-head.
The Setup
We compared writing large batches of simple records with string payloads, stored either as:
- Fixed-length fields: every record has the same size.
- Variable-length fields: each record may grow or shrink.
The benchmark (`hdf5-fixed-length-bench.cpp`) measures throughput for millions of writes, simulating common HPC/quant workloads.
```cpp
#include <iostream>
#include <vector>
#include <algorithm>
#include <h5bench>
#include <h5cpp/core>
#include "non-pod-struct.hpp"
#include <h5cpp/io>
#include <fmt/core.h>
#include <fstream>
#include <cstring>   // strncpy
namespace bh = h5::bench;
bh::arg_x record_size{10'000}; //, 100'000, 1'000'000};
bh::warmup warmup{3};
bh::sample sample{10};
h5::dcpl_t chunk_size = h5::chunk{4096};
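// get_transfer_size: cumulative byte count of the first record_size[j] strings,
// one entry per benchmarked record count; this is the payload size the
// throughput numbers are computed from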
std::vector<size_t> get_transfer_size(const std::vector<std::string>& strings ){
std::vector<size_t> transfer_size;
for (size_t i =0, j=0, N = 0; i < strings.size(); i++){
N += strings[i].length();
if( i == record_size[j] - 1) j++, transfer_size.push_back(N);
}
return transfer_size;
}
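// convert: copies each std::string into a fixed-length character buffer; the
// unspecialized template is a stub, only the char[] specialization below is used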
template<class T> std::vector<T> convert(const std::vector<std::string>& strings){
return std::vector<T>();
}
template <> std::vector<char[shim::pod_t::max_lenght::value]> convert(const std::vector<std::string>& strings){
std::vector<char[shim::pod_t::max_lenght::value]> out(strings.size());
for (size_t i = 0; i < out.size(); i++)
strncpy(out[i], strings[i].data(), shim::pod_t::max_lenght::value);
return out;
}
std::vector<const char*> get_data(const std::vector<std::string>& strings){
std::vector<const char*> data(strings.size());
// build an array of pointers to VL strings: one level of indirection
for (size_t i = 0; i < data.size(); i++)
data[i] = (char*) strings[i].data();
return data;
}
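// get_datasets: one variable-length string dataset per benchmarked record count,
// each sized to hold exactly that many rows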
std::vector<h5::ds_t> get_datasets(const h5::fd_t& fd, const std::string& name, h5::bench::arg_x& rs){
std::vector<h5::ds_t> ds;
for(size_t i=0; i< rs.rank; i++)
ds.push_back( h5::create<std::string>(fd, fmt::format(name + "-{:010d}", rs[i]), h5::current_dims{rs[i]}, chunk_size));
return ds;
}
int main(int argc, const char **argv){
size_t max_size = *std::max_element(record_size.begin(), record_size.end());
h5::fd_t fd = h5::create("h5cpp.h5", H5F_ACC_TRUNC);
auto strings = h5::utils::get_test_data<std::string>(max_size, 10, shim::pod_t::max_lenght::value);
// LET'S PRINT SOME STRINGS TO GIVE YOU THE PICTURE
fmt::print("[{:>5}] [{:^30}] [{:>6}]\n", "#", "value", "length");
for(size_t i=0; i<10; i++) fmt::print("{:>2d} {:>30} {:>8d}\n", i, strings[i], strings[i].length());
fmt::print("\n\n");
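// Each benchmark block below follows the same pattern: create the dataset(s),
// precompute the payload size, then hand bh::throughput a lambda that performs
// the writes and returns the number of bytes transferred.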
{ // POD: FIXED LENGTH STRING + ID
h5::pt_t ds = h5::create<shim::pod_t>(fd, "FLstring h5::append<pod_t>", h5::max_dims{H5S_UNLIMITED}, chunk_size);
std::vector<shim::pod_t> data(max_size);
// we have to copy each string into the pod struct
for (size_t i = 0; i < data.size(); i++)
data[i].id = i, strncpy(data[i].name, strings[i].data(), shim::pod_t::max_lenght::value);
// compute data transfer size, we will be using this to measure throughput:
std::vector<size_t> transfer_size;
for (auto i : record_size)
transfer_size.push_back(i * sizeof(shim::pod_t));
// actual measurement with burn in phase
bh::throughput(
bh::name{"FLstring h5::append<pod_t>"}, record_size, warmup, sample, ds,
[&](hsize_t idx, hsize_t size) -> double {
for (hsize_t k = 0; k < size; k++)
h5::append(ds, data[k]);
return transfer_size[idx];
});
}
{ // VL STRING, INDEXED BY HDF5 B+TREE, h5::append<std::string>
h5::pt_t ds = h5::create<std::string>(fd, "VLstring h5::append<std::vector<std::string>> ", h5::max_dims{H5S_UNLIMITED}, chunk_size);
std::vector<size_t> transfer_size = get_transfer_size(strings);
// actual measurement with burn in phase
bh::throughput(
bh::name{"VLstring h5::append<std::vector<std::string>>"}, record_size, warmup, sample,
[&](hsize_t idx, hsize_t size) -> double {
for (hsize_t i = 0; i < size; i++)
h5::append(ds, strings[i]);
return transfer_size[idx];
});
}
{ // VL STRING, INDEXED BY HDF5 B+TREE, h5::write<std::vector<const char*>>
auto ds = get_datasets(fd, "VLstring h5::write<std::vector<const char*>> ", record_size);
std::vector<const char*> data = get_data(strings);
std::vector<size_t> transfer_size = get_transfer_size(strings);
// actual measurement with burn in phase
bh::throughput(
bh::name{"VLstring h5::write<std::vector<const char*>>"}, record_size, warmup, sample,
[&](hsize_t idx, hsize_t size) -> double {
h5::write(ds[idx], data.data(), h5::count{size});
return transfer_size[idx];
});
}
{ // VL STRING, INDEXED BY HDF5 B+TREE std::vector<std::string>
auto ds = get_datasets(fd, "VLstring std::vector<std::string> ", record_size);
std::vector<size_t> transfer_size = get_transfer_size(strings);
// actual measurement with burn in phase
bh::throughput(
bh::name{"VLstring std::vector<std::string>"}, record_size, warmup, sample,
[&](hsize_t idx, hsize_t size) -> double {
h5::write(ds[idx], strings, h5::count{size});
return transfer_size[idx];
});
}
{ // FL STRING, HDF5 C API H5Dwrite with a std::vector<char[N]> buffer
using fixed_t = char[shim::pod_t::max_lenght::value]; // type alias
std::vector<size_t> transfer_size;
for (auto i : record_size)
transfer_size.push_back(i * sizeof(fixed_t));
std::vector<fixed_t> data = convert<fixed_t>(strings);
// modify VL type to fixed length
h5::dt_t<fixed_t> dt{H5Tcreate(H5T_STRING, sizeof(fixed_t))};
H5Tset_cset(dt, H5T_CSET_UTF8);
std::vector<h5::ds_t> ds;
for(auto size: record_size) ds.push_back(
h5::create<fixed_t>(fd, fmt::format("FLstring CAPI-{:010d}", size),
chunk_size, h5::current_dims{size}, dt));
// actual measurement
bh::throughput(
bh::name{"FLstring CAPI"}, record_size, warmup, sample,
[&](hsize_t idx, hsize_t size) -> double {
// memory space
h5::sp_t mem_space{H5Screate_simple(1, &size, nullptr )};
H5Sselect_all(mem_space);
// file space
h5::sp_t file_space{H5Dget_space(ds[idx])};
H5Sselect_all(file_space);
H5Dwrite( ds[idx], dt, mem_space, file_space, H5P_DEFAULT, data.data());
return transfer_size[idx];
});
}
{ // Variable Length STRING with CAPI IO calls
std::vector<size_t> transfer_size = get_transfer_size(strings);
std::vector<const char*> data = get_data(strings);
h5::dt_t<char*> dt;
std::vector<h5::ds_t> ds;
for(auto size: record_size) ds.push_back(
h5::create<char*>(fd, fmt::format("VLstring CAPI-{:010d}", size),
chunk_size, h5::current_dims{size}));
// actual measurement
bh::throughput(
bh::name{"VLstring CAPI"}, record_size, warmup, sample,
[&](hsize_t idx, hsize_t size) -> double {
// memory space
h5::sp_t mem_space{H5Screate_simple(1, &size, nullptr )};
H5Sselect_all(mem_space);
// file space
h5::sp_t file_space{H5Dget_space(ds[idx])};
H5Sselect_all(file_space);
H5Dwrite( ds[idx], dt, mem_space, file_space, H5P_DEFAULT, data.data());
return transfer_size[idx];
});
}
{ // C++ IO stream
std::vector<size_t> transfer_size = get_transfer_size(strings);
std::ofstream stream;
stream.open("somefile.txt", std::ios::out);
// actual measurement
bh::throughput(
bh::name{"C++ IOstream "}, record_size, warmup, sample,
[&](hsize_t idx, hsize_t size) -> double {
for (hsize_t k = 0; k < size; k++)
stream << strings[k] << std::endl;
return transfer_size[idx];
});
stream.close();
}
}
```
Results
- Fixed-length outperforms variable-length by a wide margin.
- Predictable size means HDF5 can lay out data contiguously and stream it efficiently.
- Variable-length introduces extra indirection and heap management, slowing things down.
In our runs, fixed-length writes achieved 70–95% of raw I/O speed, while variable-length lagged substantially behind.
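The indirection mentioned above shows up directly in the datatype declaration. Here is a minimal sketch of the two string datatypes being compared, using the plain HDF5 C API; the helper function names are my own for illustration:

```cpp
#include <hdf5.h>

// Fixed-length: every element reserves exactly n bytes, so the dataset is one
// contiguous block that HDF5 can stream straight to disk.
hid_t make_fixed_string_type(size_t n) {
    hid_t dt = H5Tcreate(H5T_STRING, n);
    H5Tset_cset(dt, H5T_CSET_UTF8);
    return dt;
}

// Variable-length: each element is stored as a pointer into a library-managed
// heap, which adds the indirection and allocation overhead measured above.
hid_t make_vlen_string_type() {
    hid_t dt = H5Tcopy(H5T_C_S1);
    H5Tset_size(dt, H5T_VARIABLE);
    H5Tset_cset(dt, H5T_CSET_UTF8);
    return dt;
}
```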
Why It Matters
- If your schema permits it, prefer fixed-length types.
- Use variable-length only when data sizes truly vary (e.g., ragged arrays, free-form strings); see the sketch after this list.
- For high-frequency trading, sensor arrays, or scientific simulations, fixed-length layouts maximize throughput.
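When the data genuinely is ragged, the flexible path is still only a couple of lines. A minimal sketch with h5cpp, assuming its `h5::write(fd, name, container)` convenience overload; the file and dataset names are made up for illustration:

```cpp
#include <h5cpp/all>
#include <string>
#include <vector>

int main() {
    // free-form notes of very different lengths: a legitimate VL use case
    std::vector<std::string> notes = {
        "short", "a considerably longer free-form annotation"};
    h5::fd_t fd = h5::create("notes.h5", H5F_ACC_TRUNC);
    // stored as variable-length UTF-8 strings in a single call
    h5::write(fd, "notes", notes);
}
```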
POD Check
We also verified which record types qualify as POD (Plain Old Data) via a small utility (`is-pod-test.cpp`). Only POD-eligible types map safely and efficiently into HDF5 compound layouts.
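What does such a record look like? Something along the lines of the benchmark's `shim::pod_t`; this is only a sketch, the real definition lives in `non-pod-struct.hpp` and the field types and the 60-byte capacity here are assumptions:

```cpp
#include <cstddef>
#include <type_traits>

namespace shim {
    struct pod_t {
        // assumed capacity; the real value comes from non-pod-struct.hpp
        using max_lenght = std::integral_constant<std::size_t, 60>;
        std::size_t id;                  // fixed-size numeric field
        char name[max_lenght::value];    // fixed-length string payload
    };
}
```

The check itself is a compile-time assertion: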
```cpp
static_assert(std::is_trivial_v<shim::pod_t> && std::is_standard_layout_v<shim::pod_t>,
              "shim::pod_t must stay trivial and standard-layout");
```
This ensures compatibility with direct binary writes: no surprises from constructors, vtables, or hidden padding.
Takeaway
- ✅ Fixed-length fields: fast, predictable, near raw I/O.
- ⚠️ Variable-length fields: flexible, but slower.
- 🔧 Use POD records to unlock HDF5’s full performance potential.
If performance is paramount, lock in fixed sizes and let your data pipeline fly.