const korvus = require("korvus");

// Initialize our Collection
const collection = korvus.newCollection("semantic-search-demo");

// Initialize our Pipeline
// Our Pipeline will split and embed the `text` key of documents we upsert
const pipeline = korvus.newPipeline("v1", {
  text: {
    splitter: { model: "recursive_character" },
    semantic_search: {
      model: "mixedbread-ai/mxbai-embed-large-v1",
    },
  },
});

const main = async () => {
  // Add our Pipeline to our Collection
  await collection.add_pipeline(pipeline);

  // Upsert our documents
  // The `text` key of our documents will be split and embedded per our Pipeline specification above
  let documents = [
    {
      id: "1",
      text: "Korvus is incredibly fast and easy to use.",
    },
    {
      id: "2",
      text: "Tomatoes are incredible on burgers.",
    },
  ];
  await collection.upsert_documents(documents);

  // Perform vector_search
  // We are querying for the string "Is Korvus fast?"
  // Notice that the `mixedbread-ai/mxbai-embed-large-v1` embedding model takes a prompt parameter when embedding for search
  // We specify that we only want the `id` key of each document returned. If the `document` key were omitted, every result would include the entire document
  // Limit the results to 5. In our case we only have two documents in our Collection, so we will only get two results
  const results = await collection.vector_search(
    {
      query: {
        fields: {
          text: {
            query: "Is Korvus fast?",
            parameters: {
              prompt:
                "Represent this sentence for searching relevant passages: ",
            },
          },
        },
      },
      document: {
        keys: [
          "id",
        ],
      },
      limit: 5,
    },
    pipeline,
  );
  console.log(results);
};

main().then(() => console.log("DONE!"))