From 1250453a4411811157dec30bc676e716644b0d51 Mon Sep 17 00:00:00 2001
From: Abhishek8108 <87538407+Abhishek8108@users.noreply.github.com>
Date: Sat, 9 May 2026 10:07:43 +0100
Subject: [PATCH] fix: Handle numpy.ndarray from Arrow/Athena in Array(String) materialisation

Athena (and other Arrow-backed offline stores) deserialises Array(String)
feature columns as numpy.ndarray with object dtype rather than plain
Python lists. Two code paths in type_map.py did not handle this:

1. _validate_collection_item_types iterated over the ndarray directly.
   Nullable Arrow columns can embed None elements, and type(None) is not
   in the valid_types set for STRING_LIST ([np.str_, str]), causing
   TypeError.

2. The generic list conversion path in _convert_list_values_to_proto
   passed the raw ndarray to StringList(val=...). Protobuf rejects
   non-list inputs with TypeError: bad argument type for built-in
   operation.

Fix:
- Coerce ndarray to a plain Python list via .tolist() before type
  validation, and skip None elements (nullable elements cannot be held
  in protobuf fixed-type lists and are stripped).
- In the generic conversion path, apply the same coercion so protobuf
  always receives a plain list.

Adds four unit tests covering: plain ndarray, ndarray with None elements,
empty ndarray, and mixed None/ndarray rows.

Fixes #6325 Signed-off-by: Abhishek8108 <87538407+Abhishek8108@users.noreply.github.com> --- sdk/python/feast/type_map.py | 31 +++++++++++++++--- sdk/python/tests/unit/test_type_map.py | 45 ++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 5 deletions(-) diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index e9ccee08f25..1c59b6ac3d4 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -739,7 +739,11 @@ def _validate_collection_item_types( """ if sample is None: return - if all(type(item) in valid_types for item in sample): + # Arrow/Athena deserialises Array columns as numpy.ndarray with object dtype; + # coerce to a plain list so element-level checks work uniformly. + items = sample.tolist() if isinstance(sample, np.ndarray) else sample + # None elements are valid in nullable Arrow columns — skip them when checking types. + if all(type(item) in valid_types for item in items if item is not None): return # to_numpy() upcasts INT32/INT64 with NULL to Float64 automatically @@ -749,11 +753,13 @@ def _validate_collection_item_types( ValueType.INT32_SET, ValueType.INT64_SET, ] - for item in sample: + for item in items: + if item is None: + continue if type(item) not in valid_types: if feast_value_type in int_collection_types: # Check if the float values are due to NULL upcast - if not any(np.isnan(i) for i in sample if isinstance(i, float)): + if not any(np.isnan(i) for i in items if isinstance(i, float)): logger.error( f"{feast_value_type.name} has NULL values. to_numpy() upcasts to Float64 automatically." ) @@ -945,9 +951,24 @@ def _convert_list_values_to_proto( for value in values ] - # Generic list conversion + # Generic list conversion. + # Arrow/Athena may return each row as a numpy.ndarray (object dtype) rather + # than a plain Python list. Protobuf rejects ndarray directly, so coerce to + # list and strip None elements (which protobuf fixed-type lists cannot hold). 
return [ - ProtoValue(**{field_name: proto_type(val=value)}) # type: ignore[arg-type] + ProtoValue( + **{ + field_name: proto_type( # type: ignore[arg-type] + val=[ + v + for v in ( + value.tolist() if isinstance(value, np.ndarray) else value + ) + if v is not None + ] + ) + } + ) if value is not None else ProtoValue() for value in values diff --git a/sdk/python/tests/unit/test_type_map.py b/sdk/python/tests/unit/test_type_map.py index bdaea63a607..7b6dce799ac 100644 --- a/sdk/python/tests/unit/test_type_map.py +++ b/sdk/python/tests/unit/test_type_map.py @@ -115,6 +115,51 @@ def test_python_values_to_proto_values_int_list_with_null_not_supported(): _ = python_values_to_proto_values(arr, ValueType.INT32_LIST) +class TestAthenaArrayStringConversion: + """Regression tests for Array(String) materialisation via Athena offline store. + + Arrow/Athena deserialises Array(String) columns as numpy.ndarray with object + dtype rather than plain Python lists. Two bugs were present: + - _validate_collection_item_types raised TypeError on None elements inside ndarrays. + - The generic list conversion path passed the ndarray directly to protobuf, which + rejects non-list inputs with TypeError. 
+ """ + + def test_string_list_from_ndarray(self): + """Plain ndarray of strings converts without error.""" + values = [np.array(["a", "b", "c"], dtype=object)] + protos = python_values_to_proto_values(values, ValueType.STRING_LIST) + result = feast_value_type_to_python_type(protos[0]) + assert result == ["a", "b", "c"] + + def test_string_list_from_ndarray_with_none_elements(self): + """ndarray containing None elements (nullable Arrow column) converts without TypeError.""" + values = [np.array(["a", None, "c"], dtype=object)] + protos = python_values_to_proto_values(values, ValueType.STRING_LIST) + result = feast_value_type_to_python_type(protos[0]) + # None elements are stripped (protobuf StringList cannot hold nulls) + assert result == ["a", "c"] + + def test_string_list_from_empty_ndarray(self): + """Empty ndarray (entity row with no array values) converts to empty list.""" + values = [np.array([], dtype=object)] + protos = python_values_to_proto_values(values, ValueType.STRING_LIST) + result = feast_value_type_to_python_type(protos[0]) + assert result == [] + + def test_string_list_mixed_null_and_ndarray_rows(self): + """Mix of None rows (null feature) and ndarray rows converts correctly.""" + values = [ + np.array(["x", "y"], dtype=object), + None, + np.array(["z"], dtype=object), + ] + protos = python_values_to_proto_values(values, ValueType.STRING_LIST) + assert feast_value_type_to_python_type(protos[0]) == ["x", "y"] + assert feast_value_type_to_python_type(protos[1]) is None + assert feast_value_type_to_python_type(protos[2]) == ["z"] + + class TestMapTypes: """Test cases for MAP and MAP_LIST value types."""