Invalid argument error: Dictionary replacement detected when writing IPC file format. Arrow IPC files only support a single dictionary for a given field across all batches.
DomEscobar opened this issue · 3 comments
DomEscobar commented
Heyo me again 👯
i created a little helper file for my usecase with lanceDb but when i run my example i get an error which doesnt help
[Error: Invalid argument error: Dictionary replacement detected when writing IPC file format. Arrow IPC files only support a single dictionary for a given field across all batches.]
Here is my code:
lanceDb-retriver.ts
import { OpenAIEmbeddingFunction, connect, } from 'vectordb';
const dbPath = 'assets/db'
let embedFunction;
export interface IngestOptions {
table: string;
data: Array<Record<string, unknown>>;
}
export interface RetriveOptions {
query: string;
table: string;
limit?: number;
filter?: string;
select?: Array<string>;
}
export interface DeleteOptions {
table: string;
filter: string;
}
export interface UpdateOptions {
table: string;
data: Record<string, unknown>[]
}
export async function useLocalEmbedding() {
const { pipeline } = await import('@xenova/transformers');
const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
const embed_fun: any = {};
embed_fun.sourceColumn = 'text';
embed_fun.embed = async function (batch) {
let result = [];
for (let text of batch) {
const res = await pipe(text, { pooling: 'mean', normalize: true });
result.push(Array.from(res['data']));
}
return result;
}
embedFunction = embed_fun;
}
export function useOpenAiEmbedding(apiKey: string, sourceColumn = 'pageContent') {
embedFunction = new OpenAIEmbeddingFunction(sourceColumn, apiKey)
}
export async function update(options: UpdateOptions) {
try {
const db = await connect(dbPath)
if ((await db.tableNames()).includes(options.table)) {
const tbl = await db.openTable(options.table, embedFunction)
await tbl.overwrite(options.data)
} else {
return new Error("Table does not exist")
}
} catch (e) {
console.error(e);
throw e;
}
}
export async function remove(options: DeleteOptions) {
try {
const db = await connect(dbPath)
if ((await db.tableNames()).includes(options.table)) {
const tbl = await db.openTable(options.table, embedFunction)
await tbl.delete(options.filter)
} else {
return new Error("Table does not exist")
}
} catch (e) {
console.error(e);
throw e;
}
}
export async function ingest(options: IngestOptions) {
try {
const db = await connect(dbPath)
if ((await db.tableNames()).includes(options.table)) {
const tbl = await db.openTable(options.table, embedFunction)
await tbl.overwrite(options.data)
} else {
await db.createTable(options.table, options.data, embedFunction)
}
}
catch (e) {
console.error(e);
throw e;
}
}
export async function retrive(options: RetriveOptions) {
try {
const db = await connect(dbPath)
if ((await db.tableNames()).includes(options.table)) {
const tbl = await db.openTable(options.table, embedFunction)
const build = tbl.search(options.query);
if (options.filter) {
build.filter(options.filter)
}
if (options.select) {
build.select(options.select)
}
if (options.limit) {
build.limit(options.limit)
}
const results = await build.execute();
return results;
} else {
return new Error("Table does not exist")
}
}
catch (e) {
console.error(e);
throw e;
}
}
and i call it on an other file
test.ts
import dotenv from 'dotenv';
dotenv.config();
const apiKey = process.env.OPENAI_API_KEY;
async function main() {
console.time('ingest');
const data = [
{
id: 1,
metadata: {
title: "Lorem Ipsum Document",
author: "John Doe",
date: "2023-09-20"
},
pageContent: "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed ac ipsum nec justo consequat dignissim. Nulla facilisi. Integer gravida tincidunt turpis eget iaculis."
},
{
id: 2,
metadata: {
title: "Technical Report on AI Ethics",
author: "Jane Smith",
date: "2023-09-21"
},
pageContent: "This document provides an overview of the ethical considerations surrounding artificial intelligence. It covers topics such as bias in machine learning, data privacy, and responsible AI development."
},
];
useOpenAiEmbedding(apiKey);
await ingest({
data,
table: 'vectors'
})
const retriveData = await retrive({
table: 'vectors',
query: 'what is lorem?'
});
console.log(retriveData);
console.timeEnd('ingest');
}
main();
AyushExel commented
Thanks for reporting. I'll take a look and get back asap
DomEscobar commented
Have you tried my sexy helper yet?
DomEscobar commented
Oh i forgot that i resolved the issue, i had to flatten the data :)