Semantic Search

This tutorial demonstrates how to use the pgml SDK to create a collection, add a pipeline for vector search, upsert documents, run a sample query, and archive the collection when finished. Each step is shown first in JavaScript and then in Python.

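First, import the SDK and its supporting libraries. The JavaScript version loads environment variables right away with dotenv; the Python version imports load_dotenv here and calls it at the start of main().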


                content_copy
                link
                edit
            
const pgml = require("pgml");
require("dotenv").config();


                content_copy
                link
                edit
            
from pgml import Collection, Pipeline
from datasets import load_dataset
from time import time
from dotenv import load_dotenv
from rich.console import Console
import asyncio

Initialize Collection

A collection object is created to represent the search collection.


                content_copy
                link
                edit
            
// Entry point for the JavaScript example. The function body continues through
// the JavaScript snippets that follow and is closed after the collection is
// archived.
const main = async () => { // Open the main function; we close it at the bottom
  // Initialize the collection; every later step in main() goes through this object
  const collection = pgml.newCollection("semantic_search_collection");


                content_copy
                link
                edit
            
# Entry point for the Python example. The function body continues through the
# Python snippets that follow and ends after the collection is archived.
async def main(): # Start the main function; we end it after archiving
    # Load environment variables via python-dotenv before using the SDK
    load_dotenv()
    # Rich console used for the printed output in the query step
    console = Console()

    # Initialize collection; every later step in main() goes through this object
    collection = Collection("quora_collection")

Create Pipeline

A pipeline encapsulating a model and splitter is created and added to the collection.


                content_copy
                link
                edit
            
  // Describe how the "text" field of each document is processed: which
  // splitter to use and which model backs semantic search
  const pipelineSchema = {
    text: {
      splitter: { model: "recursive_character" },
      semantic_search: { model: "intfloat/e5-small" },
    },
  };

  // Create the pipeline from that schema and add it to the collection
  const pipeline = pgml.newPipeline("semantic_search_pipeline", pipelineSchema);
  await collection.add_pipeline(pipeline);


                content_copy
                link
                edit
            
    # Create and add pipeline
    pipeline = Pipeline(
        "quorav1",
        {
            "text": {
                "splitter": {"model": "recursive_character"},
                "semantic_search": {"model": "intfloat/e5-small"},
            }
        },
    )
    await collection.add_pipeline(pipeline)

Upsert Documents

Documents are upserted into the collection and indexed by the pipeline.


                content_copy
                link
                edit
            
  // Two sample documents, each with an id and a "text" field. Once upserted,
  // they are automatically split into chunks and embedded by our pipeline.
  const documents = [
    { id: "Document One", text: "document one contents..." },
    { id: "Document Two", text: "document two contents..." },
  ];
  await collection.upsert_documents(documents);


                content_copy
                link
                edit
            
    # Prep documents for upserting
    dataset = load_dataset("quora", split="train")
    questions = []
    for record in dataset["questions"]:
        questions.extend(record["text"])

    # Remove duplicates and add id
    documents = []
    for i, question in enumerate(list(set(questions))):
        if question:
            documents.append({"id": i, "text": question})

    # Upsert documents
    await collection.upsert_documents(documents[:2000])

Query

A vector similarity search query is made on the collection.


                content_copy
                link
                edit
            
  // Perform vector search: match the query text against the "text" field
  // through the pipeline, asking for at most two results
  const searchText = "Something that will match document one first";
  const searchRequest = {
    query: { fields: { text: { query: searchText } } },
    limit: 2,
  };
  const queryResults = await collection.vector_search(searchRequest, pipeline);

  // Print a heading followed by the raw results
  console.log("The results");
  console.log(queryResults);


                content_copy
                link
                edit
            
    # Query
    query = "What is a good mobile os?"
    console.print("Querying for %s..." % query)
    # Take timestamps immediately before and after the awaited search so the
    # difference is the wall-clock duration of that one call
    start = time()
    # Search the "text" field through the pipeline, asking for at most 5 results
    results = await collection.vector_search(
        {"query": {"fields": {"text": {"query": query}}}, "limit": 5}, pipeline
    )
    end = time()
    # Print a bold heading, the raw results, and the elapsed time in seconds
    # formatted to three decimal places
    console.print("\n Results for '%s' " % (query), style="bold")
    console.print(results)
    console.print("Query time = %0.3f" % (end - start))

Archive Collection

The collection is archived when finished.


                content_copy
                link
                edit
            
  // Archive the collection now that the example is finished with it
  await collection.archive();
} // Close the main function


                content_copy
                link
                edit
            
    # Archive the collection now that the example is finished with it
    await collection.archive()
# The end of the main function

Main

Boilerplate to call the async main() function.


                content_copy
                link
                edit
            
// Run the example. "Done!" is printed only when main() resolves. If any step
// rejects, log the error and mark the process as failed instead of leaving an
// unhandled promise rejection.
main()
  .then(() => console.log("Done!"))
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });


                content_copy
                link
                edit
            
# Run the async main() coroutine to completion, but only when this file is
# executed as a script rather than imported
if __name__ == "__main__":
    asyncio.run(main())