Decoding vector fails
Closed this issue · 3 comments
Hello, I can't figure out why decoding a vector doesn't give me back the same values and shape.
This is my schema:
# Index definition for the panorama encodings: three tag fields plus one
# flat / cosine / float32 vector field whose length is 512 * 768.
schema = {
    "index": dict(
        name="layout_panorama_encodings",
        prefix="layout_panorama_encodings_docs",
    ),
    "fields": [
        # All metadata fields share the same "tag" definition.
        *(
            {"name": tag_name, "type": "tag"}
            for tag_name in ("panorama_name", "dist_to_floor_m", "used_def_dtf")
        ),
        {
            "name": "panorama_embedding",
            "type": "vector",
            "attrs": dict(
                dims=512 * 768,
                distance_metric="cosine",
                algorithm="flat",
                datatype="float32",
            ),
        },
    ],
}
This is how I save the vector:
# Serialize the encoder output for storage: cast to float32, flatten to 1-D,
# and dump the raw buffer as bytes.
enc_out.astype(np.float32).flatten().tobytes()
However retrieving it fails:
# Create a query to search for items with the tag "panorama_name"
# (tag filter only, limited to a single result).
t = Tag("panorama_name") == panorama_name
query = FilterQuery(filter_expression=t, num_results=1)
# Execute the search
results = index.search(query)
if results.docs:
    item = results.docs[0]
    # NOTE: this decode is the step reported as failing. Per the maintainer
    # reply in this thread, redis-py breaks the encoding of the vector field
    # when it is retrieved with FT.SEARCH, so re-encoding the returned value
    # does not reproduce the bytes that were stored.
    enc_out = np.frombuffer(
        item["panorama_embedding"].encode(), dtype=np.float32
    ).reshape(512, 768)
What am I doing wrong?
I have triple-checked the shapes and dtypes - they are always the same, and the inputs are correct.
Here is fully reproducible code for this issue:
from redis import Redis
from redisvl.index import SearchIndex
import numpy as np
from redisvl.query.filter import Tag
from redisvl.query import FilterQuery
# Minimal index: one tag field plus one flat / cosine / float32 vector field
# whose length is 512 * 768.
schema = {
    "index": {
        "name": "my_index",
        "prefix": "my_index_docs",
    },
    "fields": [
        {"name": "panorama_name", "type": "tag"},
        {
            "name": "panorama_embedding",
            "type": "vector",
            "attrs": {
                "dims": 512 * 768,
                "distance_metric": "cosine",
                "algorithm": "flat",
                "datatype": "float32",
            },
        },
    ],
}

index = SearchIndex.from_dict(schema)
client = Redis.from_url("redis://redis:6379")
index.set_client(client)
index.create(overwrite=True)

# Cast once to float32 so the stored bytes match the schema's datatype.
embeddings = np.random.rand(512, 768).astype(np.float32)
# Named `embedding_bytes` so the builtin `bytes` is not shadowed.
embedding_bytes = embeddings.flatten().tobytes()
panorama_name = "PAN1"

# Save
index.load(
    [{"panorama_name": panorama_name, "panorama_embedding": embedding_bytes}]
)

# Retrieve by tag only: create a query to search for items with the tag
# "panorama_name", then execute the search.
t = Tag("panorama_name") == panorama_name
query = FilterQuery(filter_expression=t, num_results=1)
results = index.search(query)
item = results.docs[0]

# NOTE: this decode is the failing step that this script reproduces. Per the
# maintainer reply in this thread, redis-py breaks the encoding of the vector
# field when it is retrieved with FT.SEARCH, so the original bytes are not
# recovered here. It is intentionally left as-is.
arr = np.frombuffer(item["panorama_embedding"].encode(), dtype=np.float32).reshape(
    512, 768
)
print(arr)
redis==5.0.8
redisvl==0.3.3
I'm using Docker Compose with the redis/redis-stack:latest image.
Update:
Well, after reading the docs (https://www.redisvl.com/user_guide/hash_vs_json_05.html),
I went with the JSON approach, which is unfortunate because the stored size is now 5x larger.
@marisancans it's not you, there is an issue in the underlying driver (redis-py) that breaks the encoding of the Vector field when retrieved with FT.SEARCH, see redis/redis-py#2275
A workaround requires extra Redis calls at the moment. One thing you can do is query for only the keys that you need and then retrieve the Hashes for those keys using HGETs in a pipeline.
A fix has been merged, but a redis-py release that includes it hasn't been published yet.
Thanks @bsbodden, I did manage to get it working with HGET; it is okay for now.
For anyone having similar issues, this is how I handled it:
from redis import Redis
from redisvl.index import SearchIndex
import numpy as np
from redisvl.query.filter import Tag
from redisvl.query import FilterQuery
import json
# Minimal index: one tag field plus one flat / cosine / float32 vector field
# whose length is 512 * 768.
schema = {
    "index": {"name": "my_index", "prefix": "my_index_docs"},
    "fields": [
        {"name": "panorama_name", "type": "tag"},
        {
            "name": "panorama_embedding",
            "type": "vector",
            "attrs": {
                "dims": 512 * 768,
                "distance_metric": "cosine",
                "algorithm": "flat",
                "datatype": "float32",
            },
        },
    ],
}

index = SearchIndex.from_dict(schema)
client = Redis.from_url("redis://redis:6379")
index.set_client(client)
index.create(overwrite=True)

# Cast once to float32 so the stored bytes match the schema's datatype.
embeddings = np.random.rand(512, 768).astype(np.float32)
random_arr = embeddings.flatten()
panorama_name = "PAN1"

# Save. The value returned by index.load is kept because its first element is
# used below as the name of the hash to read back.
keys = index.load(
    [{"panorama_name": panorama_name, "panorama_embedding": random_arr.tobytes()}]
)

# Workaround: read the vector field directly with HGET instead of taking it
# from the search results, as suggested by the maintainer in this thread.
# NOTE(review): this presumably relies on the client returning raw bytes
# (i.e. not created with decode_responses=True) - confirm.
arr_bytes = client.hget(name=keys[0], key="panorama_embedding")
if arr_bytes:
    arr = np.frombuffer(arr_bytes, dtype=np.float32).reshape(512, 768)
    print(arr)