Collections

Collections are the organizational building blocks of the SDK. They manage all documents and related chunks, embeddings, tsvectors, and pipelines.

Various collection methods have their own guides:

Creating Collections

By default, collections will read and write to the database specified by the KORVUS_DATABASE_URL environment variable.

Default KORVUS_DATABASE_URL

content_copy
const collection = korvus.newCollection("test_collection")

content_copy
collection = Collection("test_collection")

content_copy
let mut collection = Collection::new("test_collection", None)?;

content_copy
CollectionC * collection = korvus_collectionc_new("test_collection", NULL);

Custom KORVUS_DATABASE_URL

Create a Collection that reads from and writes to a different database than the one set by the KORVUS_DATABASE_URL environment variable.

content_copy
const collection = korvus.newCollection("test_collection", CUSTOM_DATABASE_URL)

content_copy
collection = Collection("test_collection", CUSTOM_DATABASE_URL)

content_copy
let mut collection = Collection::new("test_collection", Some(CUSTOM_DATABASE_URL))?;

content_copy
CollectionC * collection = korvus_collectionc_new("test_collection", CUSTOM_DATABASE_URL);

Upserting Documents

Documents are dictionaries with one required key: id. All other key/value pairs are stored and can be chunked, embedded, broken into tsvectors, and searched over as specified by a Pipeline.

See our guide on Constructing Pipelines for more information on building pipelines.

content_copy
const documents = [
{
id: "document_one",
title: "Document One",
text: "document one contents...",
random_key: "here is some random data",
},
{
id: "document_two",
title: "Document Two",
text: "document two contents...",
random_key: "here is some random data",
},
];
await collection.upsert_documents(documents);

content_copy
documents = [
{
"id": "document_one",
"title": "Document One",
"text": "Here are the contents of Document 1",
"random_key": "here is some random data",
},
{
"id": "document_two",
"title": "Document Two",
"text": "Here are the contents of Document 2",
"random_key": "here is some random data",
},
]
await collection.upsert_documents(documents)

content_copy
let documents: Vec<korvus::types::Json> = vec![
serde_json::json!({
"id": "document_one",
"title": "Document One",
"text": "Here are the contents of Document 1",
"random_key": "here is some random data",
})
.into(),
serde_json::json!({
"id": "document_two",
"title": "Document Two",
"text": "Here are the contents of Document 2",
"random_key": "here is some random data",
})
.into(),
];
collection.upsert_documents(documents, None).await?;

content_copy
char * documents[2] = {
"{\"id\": \"document_one\", \"title\": \"Document One\", \"text\": \"Here are the contents of Document 1\", \"random_key\": \"here is some random data\"}",
"{\"id\": \"document_two\", \"title\": \"Document Two\", \"text\": \"Here are the contents of Document 2\", \"random_key\": \"here is some random data\"}"
};
korvus_collectionc_upsert_documents(collection, documents, 2, NULL);

Documents can be replaced by upserting documents with the same id.

content_copy
const documents = [
{
id: "document_one",
title: "Document One New Title",
text: "Here is some new text for document one",
random_key: "here is some new random data",
},
{
id: "document_two",
title: "Document Two New Title",
text: "Here is some new text for document two",
random_key: "here is some new random data",
},
];
await collection.upsert_documents(documents);

content_copy
documents = [
{
"id": "document_one",
"title": "Document One",
"text": "Here is some new text for document one",
"random_key": "here is some random data",
},
{
"id": "document_two",
"title": "Document Two",
"text": "Here is some new text for document two",
"random_key": "here is some random data",
},
]
await collection.upsert_documents(documents)

content_copy
let documents: Vec<korvus::types::Json> = vec![
serde_json::json!({
"id": "document_one",
"title": "Document One",
"text": "Here is some new text for document one",
"random_key": "here is some random data",
})
.into(),
serde_json::json!({
"id": "document_two",
"title": "Document Two",
"text": "Here is some new text for document two",
"random_key": "here is some random data",
})
.into(),
];
collection.upsert_documents(documents, None).await?;

content_copy
char * documents[2] = {
"{\"id\": \"document_one\", \"title\": \"Document One\", \"text\": \"Here is some new text for document one\", \"random_key\": \"here is some random data\"}",
"{\"id\": \"document_two\", \"title\": \"Document Two\", \"text\": \"Here is some new text for document two\", \"random_key\": \"here is some random data\"}"
};
korvus_collectionc_upsert_documents(collection, documents, 2, NULL);

Documents can be merged by setting the merge option. On conflict, new document keys will override old document keys.

content_copy
const documents = [
{
id: "document_one",
new_key: "this will be a new key in document one",
random_key: "this will replace old random_key"
},
{
id: "document_two",
new_key: "this will be a new key in document two",
random_key: "this will replace old random_key"
},
];
await collection.upsert_documents(documents, {
merge: true
});

content_copy
documents = [
{
"id": "document_one",
"new_key": "this will be a new key in document one",
"random_key": "this will replace old random_key",
},
{
"id": "document_two",
"new_key": "this will be a new key in document two",
"random_key": "this will replace old random_key",
},
]
await collection.upsert_documents(documents, {"merge": True})

content_copy
let documents: Vec<korvus::types::Json> = vec![
serde_json::json!({
"id": "document_one",
"new_key": "this will be a new key in document one",
"random_key": "this will replace old random_key"
})
.into(),
serde_json::json!({
"id": "document_two",
"new_key": "this will be a new key in document two",
"random_key": "this will replace old random_key"
})
.into(),
];
collection
.upsert_documents(documents, Some(serde_json::json!({"merge": true}).into()))
.await?;

content_copy
char * documents[2] = {
"{\"id\": \"document_one\", \"new_key\": \"this will be a new key in document one\", \"random_key\": \"this will replace old random_key\"}",
"{\"id\": \"document_two\", \"new_key\": \"this will be a new key in document two\", \"random_key\": \"this will replace old random_key\"}"
};
korvus_collectionc_upsert_documents(collection, documents, 2, "{\"merge\": true}");

Getting Documents

Documents can be retrieved using the get_documents method on the collection object.

content_copy
const documents = await collection.get_documents({limit: 100 })

content_copy
documents = await collection.get_documents({ "limit": 100 })

content_copy
let documents = collection
.get_documents(Some(serde_json::json!({"limit": 100}).into()))
.await?;

content_copy
unsigned long r_size = 0;
char** documents = korvus_collectionc_get_documents(collection, "{\"limit\": 100}", &r_size);

Paginating Documents

The SDK supports limit-offset pagination and keyset pagination.

Limit-Offset Pagination

content_copy
const documents = await collection.get_documents({ limit: 100, offset: 10 })

content_copy
documents = await collection.get_documents({ "limit": 100, "offset": 10 })

content_copy
let documents = collection
.get_documents(Some(serde_json::json!({"limit": 100, "offset": 10}).into()))
.await?;

content_copy
unsigned long r_size = 0;
char** documents = korvus_collectionc_get_documents(collection, "{\"limit\": 100, \"offset\": 10}", &r_size);

Keyset Pagination

content_copy
const documents = await collection.get_documents({ limit: 100, last_row_id: 10 })

content_copy
documents = await collection.get_documents({ "limit": 100, "last_row_id": 10 })

content_copy
let documents = collection
.get_documents(Some(serde_json::json!({"limit": 100, "last_row_id": 10}).into()))
.await?;

content_copy
unsigned long r_size = 0;
char** documents = korvus_collectionc_get_documents(collection, "{\"limit\": 100, \"last_row_id\": 10}", &r_size);

The last_row_id can be taken from the row_id field in the returned document's dictionary. Keyset pagination does not currently work when specifying the order_by key.

Filtering Documents

Documents can be filtered by passing in the filter key.

content_copy
const documents = await collection.get_documents({
limit: 10,
filter: {
id: {
$eq: "document_one"
}
}
})

content_copy
documents = await collection.get_documents(
{
"limit": 100,
"filter": {
"id": {"$eq": "document_one"},
},
}
)

content_copy
let documents = collection
.get_documents(Some(
serde_json::json!({
"limit": 100,
"filter": {
"id": {"$eq": "document_one"},
}
})
.into(),
))
.await?;

content_copy
unsigned long r_size = 0;
char** documents = korvus_collectionc_get_documents(collection, "{\"limit\": 100, \"filter\": {\"id\": {\"$eq\": \"document_one\"}}}", &r_size);

Sorting Documents

Documents can be sorted on any key. Note that this does not currently work well with keyset-based pagination. If paginating and sorting, use limit-offset-based pagination.

content_copy
const documents = await collection.get_documents({
limit: 100,
offset: 10,
order_by: {
id: "desc"
}
})

content_copy
documents = await collection.get_documents({
"limit": 100,
"offset": 10,
"order_by": {
"id": "desc"
}
})

content_copy
let documents = collection
.get_documents(Some(
serde_json::json!({
"limit": 100,
"offset": 10,
"order_by": {
"id": "desc"
}
})
.into(),
))
.await?;

content_copy
unsigned long r_size = 0;
char** documents = korvus_collectionc_get_documents(collection, "{\"limit\": 100, \"offset\": 10, \"order_by\": {\"id\": \"desc\"}}", &r_size);

Deleting Documents

Documents can be deleted with the delete_documents method on the collection object.

content_copy
const documents = await collection.delete_documents({
id: {
$eq: 1
}
})

content_copy
documents = await collection.delete_documents(
{
"id": {"$eq": 1},
}
)

content_copy
let documents = collection
.delete_documents(
serde_json::json!({
"id": {
"$eq": 1
}
})
.into(),
)
.await?;

content_copy
korvus_collectionc_delete_documents(collection, "{\"id\": { \"$eq\": 1}}");

See: Vector search

See: Document search

RAG

See: RAG