lancedb/vectordb-recipes

Invalid argument error: Dictionary replacement detected when writing IPC file format. Arrow IPC files only support a single dictionary for a given field across all batches.

DomEscobar opened this issue · 3 comments

Heyo me again 👯

I created a little helper file for my use case with LanceDB, but when I run my example I get an error that doesn't help:

[Error: Invalid argument error: Dictionary replacement detected when writing IPC file format. Arrow IPC files only support a single dictionary for a given field across all batches.]

Here is my code:

lanceDb-retriver.ts

import { OpenAIEmbeddingFunction, connect, } from 'vectordb';
// Database location handed to `connect()` by every helper in this file.
const dbPath = 'assets/db'
// Embedding function shared by the helpers below. It is unset until
// useLocalEmbedding() or useOpenAiEmbedding() assigns it.
let embedFunction;

/** Arguments accepted by `ingest`. */
export interface IngestOptions {
    /** Name of the table to write into (created when it does not exist yet). */
    table: string;
    /** Records handed to the table as-is. */
    data: Record<string, unknown>[];
}

/** Arguments accepted by `retrive`. */
export interface RetriveOptions {
    /** Text passed to the table's `search()` call. */
    query: string;
    /** Name of the table to search. */
    table: string;
    /** When set (and truthy), forwarded to the query builder's `limit()`. */
    limit?: number;
    /** When set (and non-empty), forwarded to the query builder's `filter()`. */
    filter?: string;
    /** When set, forwarded to the query builder's `select()`. */
    select?: string[];
}

/** Arguments accepted by `remove`. */
export interface DeleteOptions {
    /** Name of the table to delete rows from. */
    table: string;
    /** Predicate string forwarded unchanged to the table's `delete()` call. */
    filter: string;
}

/** Arguments accepted by `update`. */
export interface UpdateOptions {
    /** Name of the (already existing) table to rewrite. */
    table: string;
    /** Records forwarded to the table's `overwrite()` call. */
    data: Array<Record<string, unknown>>;
}

/**
 * Installs a locally executed embedding function as the module-wide
 * `embedFunction`.
 *
 * Dynamically imports `@xenova/transformers`, builds a `feature-extraction`
 * pipeline for `Xenova/all-MiniLM-L6-v2`, and wraps it in an object exposing
 * `sourceColumn` ('text') and an async `embed(batch)` method.
 */
export async function useLocalEmbedding(): Promise<void> {
    const { pipeline: buildPipeline } = await import('@xenova/transformers');
    const extractor = await buildPipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');

    const localEmbedding: any = {
        sourceColumn: 'text',
        // Runs the pipeline on one input at a time and collects the outputs
        // in input order.
        embed: async function (batch) {
            const vectors = [];
            for (const item of batch) {
                const output = await extractor(item, { pooling: 'mean', normalize: true });
                vectors.push(Array.from(output['data']));
            }
            return vectors;
        },
    };

    embedFunction = localEmbedding;
}

/**
 * Installs an `OpenAIEmbeddingFunction` as the module-wide `embedFunction`.
 *
 * @param apiKey - Key forwarded to the `OpenAIEmbeddingFunction` constructor.
 * @param sourceColumn - Column name forwarded to the constructor; defaults
 *   to 'pageContent'.
 */
export function useOpenAiEmbedding(apiKey: string, sourceColumn = 'pageContent'): void {
    const openAiEmbedding = new OpenAIEmbeddingFunction(sourceColumn, apiKey);
    embedFunction = openAiEmbedding;
}

/**
 * Replaces the contents of an existing table with `options.data`.
 *
 * Connects to the database at `dbPath`, opens `options.table` with the
 * currently installed `embedFunction`, and calls `overwrite()` on it.
 *
 * @param options - Target table name and the records to write.
 * @throws Error with message "Table does not exist" when the database has no
 *   table named `options.table`. Any error raised by the database calls is
 *   logged with `console.error` and rethrown unchanged.
 */
export async function update(options: UpdateOptions): Promise<void> {
    try {
        const db = await connect(dbPath);
        const existingTables = await db.tableNames();

        if (!existingTables.includes(options.table)) {
            // Thrown rather than returned: a returned Error would make
            // `await update(...)` resolve as though the write had succeeded.
            throw new Error("Table does not exist");
        }

        const tbl = await db.openTable(options.table, embedFunction);
        await tbl.overwrite(options.data);
    } catch (e) {
        console.error(e);
        throw e;
    }
}

/**
 * Deletes the rows matching `options.filter` from an existing table.
 *
 * Connects to the database at `dbPath`, opens `options.table` with the
 * currently installed `embedFunction`, and calls `delete()` with the filter.
 *
 * @param options - Target table name and the filter forwarded to `delete()`.
 * @throws Error with message "Table does not exist" when the database has no
 *   table named `options.table`. Any error raised by the database calls is
 *   logged with `console.error` and rethrown unchanged.
 */
export async function remove(options: DeleteOptions): Promise<void> {
    try {
        const db = await connect(dbPath);
        const existingTables = await db.tableNames();

        if (!existingTables.includes(options.table)) {
            // Thrown rather than returned: a returned Error would make
            // `await remove(...)` resolve as though the delete had succeeded.
            throw new Error("Table does not exist");
        }

        const tbl = await db.openTable(options.table, embedFunction);
        await tbl.delete(options.filter);
    } catch (e) {
        console.error(e);
        throw e;
    }
}

/**
 * Writes `options.data` into `options.table`.
 *
 * When the table already exists it is opened with the installed
 * `embedFunction` and its contents are replaced via `overwrite()`;
 * otherwise the table is created from the data. Errors are logged with
 * `console.error` and rethrown.
 *
 * NOTE(review): per the report this helper was posted with, records that
 * contain nested objects (e.g. a `metadata` field) failed with "Dictionary
 * replacement detected when writing IPC file format"; flattening the records
 * before calling this function resolved it.
 *
 * @param options - Target table name and the records to write.
 */
export async function ingest(options: IngestOptions) {
    try {
        const db = await connect(dbPath);
        const tableExists = (await db.tableNames()).includes(options.table);

        if (!tableExists) {
            await db.createTable(options.table, options.data, embedFunction);
            return;
        }

        const existing = await db.openTable(options.table, embedFunction);
        await existing.overwrite(options.data);
    } catch (err) {
        console.error(err);
        throw err;
    }
}

/**
 * Runs a search for `options.query` against an existing table.
 *
 * Connects to the database at `dbPath`, opens `options.table` with the
 * currently installed `embedFunction`, builds a query via `search()`, applies
 * the optional `filter`, `select` and `limit` settings, and executes it.
 *
 * @param options - Query text, table name, and optional filter/select/limit.
 * @returns The value produced by the query builder's `execute()`.
 * @throws Error with message "Table does not exist" when the database has no
 *   table named `options.table`. Any error raised by the database calls is
 *   logged with `console.error` and rethrown unchanged.
 */
export async function retrive(options: RetriveOptions) {
    try {
        const db = await connect(dbPath);
        const existingTables = await db.tableNames();

        if (!existingTables.includes(options.table)) {
            // Thrown rather than returned so callers never receive an Error
            // object in the place where they expect search results.
            throw new Error("Table does not exist");
        }

        const tbl = await db.openTable(options.table, embedFunction);
        const build = tbl.search(options.query);

        // Each optional setting is applied only when the caller supplied a
        // truthy value for it.
        if (options.filter) {
            build.filter(options.filter);
        }

        if (options.select) {
            build.select(options.select);
        }

        if (options.limit) {
            build.limit(options.limit);
        }

        return await build.execute();
    } catch (e) {
        console.error(e);
        throw e;
    }
}

And I call it from another file:

test.ts

import dotenv from 'dotenv';
// Let dotenv populate process.env before the key is read below.
dotenv.config();
// OpenAI API key taken from the environment; undefined when the variable is not set.
const apiKey = process.env.OPENAI_API_KEY;

/**
 * Smoke test for the retriever helpers: ingests two sample documents into
 * the `vectors` table using OpenAI embeddings, runs one query against it,
 * and logs the result. The `ingest` timer covers both the ingest and the
 * query.
 *
 * @throws Error when `OPENAI_API_KEY` is not set, plus anything rethrown by
 *   `ingest` or `retrive`.
 */
async function main() {
    if (!apiKey) {
        // Fail fast with a clear message instead of passing `undefined`
        // to `useOpenAiEmbedding`, which expects a string.
        throw new Error('OPENAI_API_KEY is not set');
    }

    console.time('ingest');
    // Records are kept flat: nesting title/author/date under a `metadata`
    // object is what triggered "Dictionary replacement detected when writing
    // IPC file format" during ingest.
    const data = [
        {
            id: 1,
            title: "Lorem Ipsum Document",
            author: "John Doe",
            date: "2023-09-20",
            pageContent: "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed ac ipsum nec justo consequat dignissim. Nulla facilisi. Integer gravida tincidunt turpis eget iaculis."
        },
        {
            id: 2,
            title: "Technical Report on AI Ethics",
            author: "Jane Smith",
            date: "2023-09-21",
            pageContent: "This document provides an overview of the ethical considerations surrounding artificial intelligence. It covers topics such as bias in machine learning, data privacy, and responsible AI development."
        },
    ];
    useOpenAiEmbedding(apiKey);
    await ingest({
        data,
        table: 'vectors'
    })

    const retriveData = await retrive({
        table: 'vectors',
        query: 'what is lorem?'
    });

    console.log(retriveData);
    console.timeEnd('ingest');
}

// Handle rejection explicitly instead of leaving a floating promise:
// report the failure and make the process exit with a non-zero status.
main().catch((err: unknown) => {
    console.error(err);
    process.exitCode = 1;
});

Thanks for reporting. I'll take a look and get back asap

Have you tried my sexy helper yet?

Oh, I forgot to mention that I resolved the issue: I had to flatten the data :)