Decoding vector fails

Question

Decoding vector fails

Closed this issue a year ago · 3 comments

Hello, I can't figure out why decoding a stored vector doesn't give me back the same values and shape.
This is my schema:

# Index definition: three tag fields plus one flat float32 vector field whose
# length is 512 * 768.
schema = {
    "index": dict(
        name="layout_panorama_encodings",
        prefix="layout_panorama_encodings_docs",
    ),
    "fields": [
        # Every tag field has the same shape, so generate them from the names.
        *(
            {"name": tag_name, "type": "tag"}
            for tag_name in ("panorama_name", "dist_to_floor_m", "used_def_dtf")
        ),
        {
            "name": "panorama_embedding",
            "type": "vector",
            "attrs": dict(
                dims=512 * 768,
                distance_metric="cosine",
                algorithm="flat",
                datatype="float32",
            ),
        },
    ],
}

This is how I save the vector:
# Cast to float32 (the schema's "datatype"), flatten to 1-D, and serialize the
# values to a raw bytes object for storage in the vector field.
enc_out.astype(np.float32).flatten().tobytes()

However retrieving it fails:

# Create a query to search for items with the tag "panorama_name"
t = Tag("panorama_name") == panorama_name
query = FilterQuery(filter_expression=t, num_results=1)

# Execute the search
results = index.search(query)

if results.docs:
    item = results.docs[0]

    # Read the stored field from the returned document and try to rebuild the
    # (512, 768) float32 array from it.
    # NOTE(review): .encode() implies the field comes back as a str rather than
    # the raw bytes that were saved -- presumably this is where the round trip
    # breaks; confirm against the driver's behaviour.
    enc_out = np.frombuffer(
        item["panorama_embedding"].encode(), dtype=np.float32
    ).reshape(512, 768)

What am I doing wrong?
I have triple-checked the shapes and dtypes - they are always the same, and the inputs are correct.

Here is fully reproducible code for this issue:

from redis import Redis
from redisvl.index import SearchIndex
import numpy as np
from redisvl.query.filter import Tag
from redisvl.query import FilterQuery

# Minimal reproduction: store one float32 embedding in a Redis hash through
# redisvl, fetch it back with a tag-filtered search, and try to decode it.

# One tag field for lookups plus a flat float32 vector field of length 512 * 768.
schema = {
    "index": {
        "name": "my_index",
        "prefix": "my_index_docs",
    },
    "fields": [
        {"name": "panorama_name", "type": "tag"},
        {
            "name": "panorama_embedding",
            "type": "vector",
            "attrs": {
                "dims": 512 * 768,
                "distance_metric": "cosine",
                "algorithm": "flat",
                "datatype": "float32",
            },
        },
    ],
}

index = SearchIndex.from_dict(schema)
client = Redis.from_url("redis://redis:6379")

index.set_client(client)
index.create(overwrite=True)

embeddings = np.random.rand(512, 768).astype(np.float32)

# The array is already float32, and tobytes() serializes in row-major (C)
# order, so no extra cast or flatten is needed. The name avoids shadowing the
# builtin `bytes`.
embedding_bytes = embeddings.tobytes()

panorama_name = "PAN1"

# Save
index.load([{"panorama_name": panorama_name, "panorama_embedding": embedding_bytes}])


# Retrieve by tag only

# Create a query to search for items with the tag "panorama_name"
t = Tag("panorama_name") == panorama_name
query = FilterQuery(filter_expression=t, num_results=1)

# Execute the search
results = index.search(query)
item = results.docs[0]

# This is the step the report is about: rebuilding the (512, 768) float32 array
# from the field returned by the search.
# NOTE(review): .encode() implies the field is returned as a str rather than
# the raw bytes that were stored -- presumably the original bytes are already
# lost at this point; confirm against the driver's behaviour.
arr = np.frombuffer(item["panorama_embedding"].encode(), dtype=np.float32).reshape(
    512, 768
)
print(arr)

redis==5.0.8
redisvl==0.3.3

I'm using Docker Compose with the redis/redis-stack:latest image.

Answer 1 · 2024-09-13T11:58:05.000Z

Update:
After reading the docs at https://www.redisvl.com/user_guide/hash_vs_json_05.html
I went with the JSON approach, which is unfortunate because the stored size is now 5x larger.

Answer 2 · 2024-09-13T14:00:30.000Z

@marisancans it's not you: there is an issue in the underlying driver (redis-py) that breaks the encoding of the vector field when it is retrieved with FT.SEARCH; see redis/redis-py#2275.

A workaround requires extra Redis calls at the moment. One thing you can do is query for only the keys that you need and then retrieve the hashes for those keys using HGETs in a pipeline.

A fix has been merged, but a redis-py release that includes it hasn't been published yet.

Answer 3 · 2024-09-16T07:31:04.000Z

Thanks @bsbodden, I did manage to get it working with HGET; it is okay for now.
For anyone having similar issues, this is how I handled it:

from redis import Redis
from redisvl.index import SearchIndex
import numpy as np
from redisvl.query.filter import Tag
from redisvl.query import FilterQuery
import json

# Workaround script: store one float32 embedding through redisvl, then read the
# vector field back directly from the hash with HGET and rebuild the array.

# Shape of one embedding. The index stores it flattened, so the vector field's
# "dims" is the product of the two axes. Keeping the shape in one place stops
# the schema, the test data and the reshape from drifting apart.
EMBEDDING_SHAPE = (512, 768)
EMBEDDING_DIMS = EMBEDDING_SHAPE[0] * EMBEDDING_SHAPE[1]

schema = {
    "index": {"name": "my_index", "prefix": "my_index_docs"},
    "fields": [
        {"name": "panorama_name", "type": "tag"},
        {
            "name": "panorama_embedding",
            "type": "vector",
            "attrs": {
                "dims": EMBEDDING_DIMS,
                "distance_metric": "cosine",
                "algorithm": "flat",
                "datatype": "float32",
            },
        },
    ],
}

index = SearchIndex.from_dict(schema)
client = Redis.from_url("redis://redis:6379")

index.set_client(client)
index.create(overwrite=True)

embeddings = np.random.rand(*EMBEDDING_SHAPE).astype(np.float32)

panorama_name = "PAN1"

# Save. The array is already float32 and tobytes() serializes in row-major (C)
# order, so it can be written without a second cast or an explicit flatten.
# The result of load() is indexed below to get the key of the stored document.
keys = index.load(
    [{"panorama_name": panorama_name, "panorama_embedding": embeddings.tobytes()}]
)

# Fetch the raw field value straight from the hash instead of going through
# index.search().
arr_bytes = client.hget(name=keys[0], key="panorama_embedding")

# Only decode when a non-empty value came back.
if arr_bytes:
    arr = np.frombuffer(arr_bytes, dtype=np.float32).reshape(EMBEDDING_SHAPE)
    print(arr)