klarna/erlavro

Schema evolution without the writer's schema

LostKobrakai opened this issue · 2 comments

We're using erlavro in a system with lots of mobile devices, and we need to be able to decode Avro data that was encoded using a newer or older version of a schema, without knowing the exact writer's schema.

For forward compatibility, we changed erlavro so that it no longer requires the whole input to be consumed, which lets us append fields. For backwards compatibility, we made it fall back to a record field's default when an explicit value for that field is missing. This follows the schema resolution rules suggested in the spec, except that we do not know the writer's schema.

The following are our "brute force" changes, but we're wondering if there's the possibility to bring that functionality upstream, likely behind some flag to toggle the behavior.

diff --git a/src/avro_binary_decoder.erl b/src/avro_binary_decoder.erl
index 93d480f..a0b6129 100644
--- a/src/avro_binary_decoder.erl
+++ b/src/avro_binary_decoder.erl
@@ -63,7 +63,7 @@ decode(IoData, Type, StoreOrLkupFun) ->
 decode(IoData, Type, StoreOrLkupFun, Options) ->
   %% return decoded value as raw erlang term directly
   Lkup = avro_util:ensure_lkup_fun(StoreOrLkupFun),
-  {Value, <<>>} = do_decode(IoData, Type, Lkup, Options),
+  {Value, _} = do_decode(IoData, Type, Lkup, Options),
   Value.

 %% @doc decode_stream/4 equivalent with default hook fun.
@@ -149,12 +149,18 @@ dec(Bin, T, _Lkup, #{hook := Hook}) when ?IS_FIXED_TYPE(T) ->
 -spec dec_record(binary(), record_type(), lkup_fun(),
                  decoder_options()) -> {avro:out(), binary()}.
 dec_record(Bin, T, Lkup, #{record_type := RecordType} = Options) ->
-  FieldTypes = avro_record:get_all_field_types(T),
+  FieldTypes = avro_record:get_all_field_data(T),
   {FieldValuesReversed, Tail} =
     lists:foldl(
-      fun({FieldName, FieldType}, {Values, BinIn}) ->
-        {Value, BinOut} = dec_item(T, FieldName, FieldType,
-                                   BinIn, Lkup, Options),
+      fun({FieldName, FieldType, ?NO_VALUE}, {Values, BinIn}) ->
+        {Value, BinOut} = dec_item(T, FieldName, FieldType, BinIn, Lkup, Options),
+        {[{FieldName, Value} | Values], BinOut};
+      ({FieldName, FieldType, Default}, {Values, BinIn}) ->
+        {Value, BinOut} = try dec_item(T, FieldName, FieldType, BinIn, Lkup, Options) of
+          {DecodedValue, BinRest} -> {DecodedValue, BinRest}
+        catch
+          _:_ -> {Default, BinIn}
+        end,
         {[{FieldName, Value} | Values], BinOut}
       end, {[], Bin}, FieldTypes),
   FieldValues1 = case RecordType of

This only works for fields appended to the very end of the stream, right?
What happens if new fields are appended to the records in an array of records?

Avro record fields and array elements are encoded one after another as a stream of bytes,
i.e. fields are not tagged with an ID, and array elements are not tagged with a length.

Good point. We're currently only working with top-level records, so this naive change did it for us. Could there be a more appropriate place to handle this without affecting other types? I'm not sure we currently have the bandwidth on our side to develop a solution that handles all possible top-level types.