Friday, May 17, 2013

My take on serialization (Part II: serialize)

After discussing the get_size(...) functor, which, given an object, returns its serialized size in bytes, we can go on and write the serialize function.

We can follow the same pattern as get_size, but this time we have to write the content into a stream.
namespace detail {
// Primary template: declared up front, specialized below for tuples,
// strings and vectors, and defined last as the catch-all for POD types.
template <class T>
class serialize_helper;
// Forward declaration of the dispatch function so the specializations can
// recurse into their members/elements.
template <class T>
void serializer(const T& obj, StreamType::iterator&);
// Tuple serialization, base case (int_<0>): writes the LAST tuple element.
// The recursion walks pos from sizeof...(T)-1 down to 0, so idx runs from
// the first element to the last.
// NOTE(review): an empty tuple would make sizeof...(T)-1 underflow at the
// call site; tuples are assumed non-empty, as in get_size (Part I).
template <class tuple_type>
inline void serialize_tuple(const tuple_type& obj, StreamType::iterator& res, int_<0>) {
constexpr size_t idx = std::tuple_size<tuple_type>::value-1;
serializer(std::get<idx>(obj), res);
}
// Tuple serialization, recursive case: writes element (size-pos-1), then
// recurses with pos-1, terminating at the int_<0> overload above.
template <class tuple_type, size_t pos>
inline void serialize_tuple(const tuple_type& obj, StreamType::iterator& res, int_<pos>) {
constexpr size_t idx = std::tuple_size<tuple_type>::value-pos-1;
serializer(std::get<idx>(obj), res);
// recur
serialize_tuple(obj, res, int_<pos-1>());
}
// Tuples: serialize every element in declaration order via the
// compile-time recursion above.
template <class... T>
struct serialize_helper<std::tuple<T...>> {
static void apply(const std::tuple<T...>& obj, StreamType::iterator& res) {
detail::serialize_tuple(obj, res, detail::int_<sizeof...(T)-1>());
}
};
// Strings: a length prefix (written as size_t) followed by the raw chars.
template <>
struct serialize_helper<std::string> {
static void apply(const std::string& obj, StreamType::iterator& res) {
// store the number of elements of this string at the beginning
serializer(obj.length(), res);
for(const auto& cur : obj) { serializer(cur, res); }
}
};
// Vectors: an element-count prefix (written as size_t) followed by each
// element, which may itself be a tuple/string/vector (recursion through
// serializer picks the right helper again).
template <class T>
struct serialize_helper<std::vector<T>> {
static void apply(const std::vector<T>& obj, StreamType::iterator& res) {
// store the number of elements of this vector at the beginning
serializer(obj.size(), res);
for(const auto& cur : obj) { serializer(cur, res); }
}
};
// Catch-all: byte-copy the object's in-memory representation into the
// stream and advance the output iterator past it.
// NOTE(review): only meaningful for trivially-copyable types, and the
// output is endian-dependent (the unit tests assume little-endian) — the
// receiver must share the same ABI to deserialize correctly.
template <class T>
struct serialize_helper {
static void apply(const T& obj, StreamType::iterator& res) {
const uint8_t* ptr = reinterpret_cast<const uint8_t*>(&obj);
std::copy(ptr,ptr+sizeof(T),res);
res+=sizeof(T);
}
};
// Dispatch: selects the most specialized serialize_helper for T.
template <class T>
inline void serializer(const T& obj, StreamType::iterator& res) {
serialize_helper<T>::apply(obj,res);
}
} // end detail namespace
// Serialize obj by appending its byte representation to res.
// The stream is grown once, by exactly get_size(obj) bytes, so the
// writing phase never reallocates (and iterators stay valid).
template <class T>
inline void serialize(const T& obj, StreamType& res) {
  const size_t start = res.size();
  const size_t bytes = get_size(obj);
  res.resize(start + bytes);
  auto out = res.begin() + start;
  detail::serializer(obj, out);
  // detail::serializer must have advanced the iterator by exactly `bytes`.
  assert(res.begin() + start + bytes == out);
}
view raw gistfile1.cpp hosted with ❤ by GitHub

As before, we have specializations of the serialize_helper template for tuples, vectors, strings and POD datatypes. To be efficient, we presize the output vector in lines 66-67 (since get_size tells us exactly how many bytes are needed to store the object), so serialization never has to reallocate.

Let's write some unit tests (because they are very important):
// Fixed-width integers serialize as their raw in-memory bytes.
// NOTE(review): the expected byte vectors assume a little-endian target
// (and, for v3, a 4-byte int) — these tests are ABI-specific by design.
TEST(Serialize, Ints) {
uint32_t v1 = 10;
StreamType res;
serialize(v1,res);
EXPECT_EQ(sizeof(uint32_t), res.size());
EXPECT_EQ(res, std::vector<uint8_t>({0xA, 0, 0, 0}));
res.clear();
uint64_t v2 = 64;
serialize(v2,res);
EXPECT_EQ(sizeof(uint64_t), res.size());
EXPECT_EQ(res, std::vector<uint8_t>({0x40, 0, 0, 0, 0, 0, 0, 0}));
res.clear();
int v3 = -1;
serialize(v3,res);
EXPECT_EQ(sizeof(int), res.size());
// -1 is all-ones in two's complement.
EXPECT_EQ(res, std::vector<uint8_t>({0xFF, 0xFF, 0xFF, 0xFF}));
}
// Vectors serialize as an element-count prefix (size_t) followed by each
// element; nested vectors recurse, each with its own count prefix.
// NOTE(review): expectations assume sizeof(size_t) == 8 and little-endian.
TEST(Serialize, Vector) {
auto t1 = std::vector<int>{1,2};
StreamType res;
serialize(t1,res);
// 8-byte count + 2 * 4-byte int = 16 bytes.
EXPECT_EQ(sizeof(decltype(t1)::value_type)*t1.size()+sizeof(size_t), res.size());
EXPECT_EQ(res, std::vector<uint8_t>({/*size(*/2, 0, 0, 0, 0, 0, 0, 0,/*)*/ 1, 0, 0, 0, 2, 0, 0, 0}));
res.clear();
auto t2 = std::vector<std::vector<uint8_t>>{{1,2}, {3,4}};
serialize(t2,res);
EXPECT_EQ(get_size(t2), res.size());
// outer count (8) + 2 * (inner count (8) + 2 bytes) = 28 bytes.
EXPECT_EQ(28u, res.size());
EXPECT_EQ(res, std::vector<uint8_t>(
{/*size(*/2, 0, 0, 0, 0, 0, 0, 0, /*) size(*/ 2, 0, 0, 0, 0, 0, 0, 0, /*)*/ 1, 2,
/*size(*/2, 0, 0, 0, 0, 0, 0, 0, /*)*/ 3, 4 }));
}
// Tuples serialize as the packed concatenation of their members.
// NOTE(review): the expected byte vectors assume a little-endian target.
TEST(Serialize, IntTuple) {
  auto t1 = std::make_tuple(1,2);
  StreamType res;
  serialize(t1,res);
  // FIX: compare against get_size(t1), not sizeof(decltype(t1)) — the
  // serialized form is the packed member bytes, while sizeof a std::tuple
  // may include padding and its layout is implementation-defined. The t3
  // case below already used get_size; now all three are consistent.
  EXPECT_EQ(get_size(t1), res.size());
  EXPECT_EQ(res, std::vector<uint8_t>({1, 0, 0, 0, 2, 0, 0, 0}));
  res.clear();
  auto t2 = std::make_tuple(256,256*2,256*3);
  serialize(t2,res);
  EXPECT_EQ(get_size(t2), res.size());
  EXPECT_EQ(res, std::vector<uint8_t>({0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0}));
  res.clear();
  // Mixed-width members: 4-byte + 8-byte.
  auto t3 = std::tuple<boost::uint32_t, boost::uint64_t>(0,1);
  serialize(t3,res);
  EXPECT_EQ(get_size(t3), res.size());
  EXPECT_EQ(res, std::vector<uint8_t>({0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}));
}
// A tuple containing a vector: the two ints are written packed, then the
// vector with its 8-byte count prefix.
// NOTE(review): assumes little-endian and sizeof(size_t) == 8.
TEST(Serialize, TupleVec) {
auto t1 = std::tuple<int,int,std::vector<uint8_t>>(10, 20, std::vector<uint8_t>{1,2});
StreamType res;
serialize(t1,res);
// 4 + 4 + (8-byte count + 2 elements) = 18 bytes.
EXPECT_EQ(18u, res.size());
EXPECT_EQ(res, std::vector<uint8_t>({
/*get<0>*/ 10, 0, 0, 0,
/*get<1>*/ 20, 0, 0, 0,
/*size(*/ 2, 0, 0, 0, 0, 0, 0, 0,/*)*/ 1, 2}));
}
// A std::string serializes as an 8-byte length prefix followed by its raw
// characters, so "string" occupies 8 + 6 = 14 bytes.
TEST(Serialize, String) {
  const std::string word = "string";
  StreamType stream;
  serialize(word, stream);
  EXPECT_EQ(14u, get_size(word));
  EXPECT_EQ(14u, stream.size());
  EXPECT_EQ(stream, std::vector<uint8_t>({/*size(*/6, 0, 0, 0, 0, 0, 0, 0,/*)*/ 's', 't', 'r', 'i', 'n', 'g'}));
}
view raw gistfile1.cpp hosted with ❤ by GitHub


So it's time to run some benchmarks. YAY! Let's compare this serializer with boost and check whether spending time re-implementing this was worth it. Since I don't have much time, we only run one experiment... if you are interested you can run more :)

// Shared fixture for both performance tests: two ints and a 10-byte vector.
std::tuple<int,int,std::vector<uint8_t>> t(10, 20, {0,1,2,3,4,5,6,7,8,9});

// Benchmark: 500k serializations of `t` with our serializer.
TEST(Performance, Water) {
  // FIX: use the same clock for both endpoints. The original took `start`
  // from high_resolution_clock but subtracted it from system_clock::now(),
  // which only compiles where the two are aliases; steady_clock is also
  // monotonic, so the interval cannot be skewed by wall-clock adjustments.
  auto start = std::chrono::steady_clock::now();
  for (size_t i=0; i<500000; ++i) {
    StreamType res;
    serialize(t,res);
  }
  auto tot_time = std::chrono::duration_cast<std::chrono::microseconds>(
      std::chrono::steady_clock::now()-start).count();
  std::cout << "time: " << tot_time << std::endl;
}
/* boost serializer */
template <class T>
inline std::string to_bytes(const T &v) {
std::ostringstream ss;
boost::archive::text_oarchive oa(ss);
oa << v;
return ss.str();
}
// Benchmark: 500k serializations of the same fixture via boost::serialization.
TEST(Performance, Boost) {
  auto start = std::chrono::steady_clock::now();
  // FIX: the global fixture is named `t`, not `t1` — the original call
  // to_bytes(t1) referenced an undeclared name. Also dropped the unused
  // local `res` and use steady_clock on both endpoints (the original mixed
  // high_resolution_clock with system_clock).
  for (size_t i=0; i<500000; ++i) { to_bytes(t); }
  auto tot_time = std::chrono::duration_cast<std::chrono::microseconds>(
      std::chrono::steady_clock::now()-start).count();
  std::cout << "time: " << tot_time << std::endl;
}
view raw gistfile1.cpp hosted with ❤ by GitHub

So, as far as the size of the serialized stream is concerned, boost uses 63 bytes while ours uses only 26 bytes (less than half). Performance-wise, boost::serialization needs 4.410 seconds to run the test, while our serialization solution takes 0.975 seconds — more than 4 times faster. Of course we are not saying that boost::serialization sucks... as a matter of fact it solves a different problem, since it also stores the type information, which allows anyone (even a Java program) to reload the stream. Our serialization strategy (as I explained in the first post) is based on the fact that the type is known at the receiver side (therefore we can avoid storing type information).

Let's show some graphs, comparison of our serialization strategy relative to boost::serialization:

No comments:

Post a Comment