PGVector

-- in psql:
CREATE EXTENSION IF NOT EXISTS vector;

CREATE TABLE products (
  id        TEXT PRIMARY KEY,
  title     TEXT,
  price     REAL,
  category  TEXT,
  embedding VECTOR(384)
);
-- Build an IVF-Flat index for cosine search
-- (best created after the table is populated, so the list centroids reflect real data):
CREATE INDEX products_embedding_idx
  ON products USING ivfflat (embedding vector_cosine_ops)
  WITH (lists = 64);
-- Reclaim dead tuples and refresh planner statistics after bulk loads:
VACUUM ANALYZE products;
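
IVF-Flat search is approximate: at query time only a subset of the 64 lists is scanned. Recall can be traded against speed with the ivfflat.probes setting; a minimal sketch (the value 8 is an illustrative starting point, not a tuned recommendation):

-- More probes = better recall but slower queries; the default is 1.
SET ivfflat.probes = 8;
-- Setting probes equal to lists degenerates to exact (flat) search.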
Install the Python dependencies used to generate and load the embeddings:

uv add pandas psycopg2-binary sentence-transformers
The query examples below assume a programming_languages table whose 6-dimensional embedding encodes [year_norm, type_code, para_code, rank_norm, perf, name_norm].

-- 1. Inspect the stored language vectors
SELECT language, embedding
FROM programming_languages;

-- 2. Nearest-neighbor by Euclidean (L2) distance

-- probe vector: [year_norm, type_code, para_code, rank_norm, perf, name_norm]
WITH probe AS (
  SELECT ARRAY[0.5, 2, 0, 0.4, 0.8, 0.9]::vector(6) AS v
)
SELECT
  pl.language,
  (pl.embedding <-> probe.v) AS l2_dist
FROM programming_languages pl, probe
ORDER BY l2_dist
LIMIT 3;

-- 3. Nearest-neighbor by Cosine distance

WITH probe AS (
  SELECT ARRAY[0.5,2,0,0.4,0.8,0.9]::vector(6) AS v
)
SELECT
  pl.language,
  (pl.embedding <=> probe.v) AS cos_dist
FROM programming_languages pl, probe
ORDER BY cos_dist
LIMIT 3;

-- 4. Manhattan (L1) distance

WITH probe AS (
  SELECT ARRAY[0.5,2,0,0.4,0.8,0.9]::vector(6) AS v
)
SELECT
  pl.language,
  (pl.embedding <+> probe.v) AS l1_dist
FROM programming_languages pl, probe
ORDER BY l1_dist
LIMIT 3;

-- 5. Hybrid search: metadata filter + nearest-neighbor

WITH probe AS (
  SELECT ARRAY[0.5,1,1,0.4,0.8,0.9]::vector(6) AS v
)
SELECT
  pl.language,
  pl.typing_discipline,
  (pl.embedding <-> probe.v) AS score
FROM programming_languages pl, probe
WHERE pl.primary_paradigm = 'Object-Oriented'
ORDER BY score
LIMIT 5;
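
With an approximate index, the WHERE filter is applied to the rows the index scan yields, so a very selective filter can return fewer than LIMIT rows; checking the plan also tells you whether the index is used at all. A sketch (the vector literal repeats the probe above):

EXPLAIN
SELECT pl.language
FROM programming_languages pl
WHERE pl.primary_paradigm = 'Object-Oriented'
ORDER BY pl.embedding <-> '[0.5,1,1,0.4,0.8,0.9]'::vector(6)
LIMIT 5;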

-- 6. Viewing all distances for a specific language
WITH target AS (
  SELECT embedding AS v
  FROM programming_languages
  WHERE language = 'Python'
)
SELECT
  pl.language,
  (pl.embedding <-> target.v) AS l2_dist
FROM programming_languages pl, target
WHERE pl.language <> 'Python'
ORDER BY l2_dist;
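
Step 6 ranks by L2 only; the same join pattern can surface every metric at once, which makes the operator comparison below concrete (a sketch reusing the Python row as the target):

WITH target AS (
  SELECT embedding AS v
  FROM programming_languages
  WHERE language = 'Python'
)
SELECT
  pl.language,
  (pl.embedding <-> target.v) AS l2,
  (pl.embedding <+> target.v) AS l1,
  (pl.embedding <=> target.v) AS cosine,
  (pl.embedding <#> target.v) AS neg_dot
FROM programming_languages pl, target
WHERE pl.language <> 'Python'
ORDER BY l2;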

Programming languages

| Operator | What it measures | Example use-case on the languages table |
|----------|------------------|------------------------------------------|
| `<->` | L2 (Euclidean) distance: straight-line distance in 6-D feature space | "Find the 3 languages whose overall profiles (age, typing, paradigm, popularity, speed, name length) are closest to Go." |
| `<+>` | L1 (Manhattan) distance: sum of absolute feature differences | "Which languages differ the least in total from Python (e.g. similar maturity, typing discipline, performance)?" |
| `<#>` | Negative inner product −(v·w): preserves magnitude | "Which languages have the highest combined score on (year_norm + rank_norm + perf)?" (useful if larger norms mean "more overall") |
| `<=>` | Cosine distance: angular separation (direction only) | "Which languages share the same profile pattern as Rust, regardless of their absolute 'age' or 'popularity' values?" |
| `<~>`, `<%>` (binary) | Hamming / Jaccard distance | N/A here: these operate on bit vectors, and our vectors aren't binary. |
-- Most ‘overall-similar’ to Go
WITH probe AS (SELECT ARRAY[1.0,2,0,0.777,0.50,0.72]::vector(6) AS v)
SELECT language, embedding <-> v AS dist
FROM programming_languages, probe
ORDER BY dist
LIMIT 3;

-- Least total deviation from Python (excluding Python itself, which is at distance 0)
WITH probe AS (SELECT embedding FROM programming_languages WHERE language='Python')
SELECT pl.language, pl.embedding <+> probe.embedding AS manh
FROM programming_languages pl, probe
WHERE pl.language <> 'Python'
ORDER BY manh
LIMIT 3;

-- Highest combined magnitude similarity to a 'high-perf & mature' probe
WITH probe AS (
  -- probe vector: [year_norm=1.0, type=2, para=0, rank_norm=0.5, perf=1.10, name_norm=0.9]
  SELECT ARRAY[1.0,2,0,0.5,1.10,0.9]::vector(6) AS v
)
SELECT
  pl.language,
  -- <#> returns the negative dot product; negate it so larger scores mean higher affinity
  - (pl.embedding <#> probe.v) AS score
FROM programming_languages pl, probe
ORDER BY score DESC
LIMIT 3;
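
As an aside: if both sides are first normalized to unit length, ranking by inner product coincides with cosine ranking. A sketch using l2_normalize(), assuming pgvector 0.7 or newer:

-- On unit vectors, the negated inner product equals cosine similarity (1 - cosine distance):
WITH probe AS (
  SELECT l2_normalize(ARRAY[1.0,2,0,0.5,1.10,0.9]::vector(6)) AS v
)
SELECT
  pl.language,
  - (l2_normalize(pl.embedding) <#> probe.v) AS cos_sim
FROM programming_languages pl, probe
ORDER BY cos_sim DESC
LIMIT 3;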

WITH rust_vec AS (
  SELECT embedding AS v
  FROM programming_languages
  WHERE language = 'Rust'
)
SELECT
  pl.language,
  (pl.embedding <=> rust_vec.v) AS cosine_dist
FROM programming_languages pl, rust_vec
WHERE pl.language <> 'Rust'
ORDER BY cosine_dist
LIMIT 3;
| Operator | Metric | Best-suited use-cases |
|----------|--------|-----------------------|
| `<->` | L2 (Euclidean) distance | Geometric proximity: embedding spaces where absolute differences matter (e.g., image feature vectors). General purpose: often yields smooth neighborhoods in dense vector spaces. |
| `<+>` | L1 (Manhattan) distance | Sparsity-robust: more forgiving of outliers in individual dimensions (e.g., tabular feature vectors with some large differences). Grid-like spaces: when features represent orthogonal axes (e.g., city-block routing). |
| `<#>` | Negative inner product | Recommendation scores: when vectors aren't normalized and the raw dot product corresponds to "affinity" (e.g., user×item latent factors). Unnormalized embeddings: preserves magnitude information. |
| `<=>` | Cosine distance | Textual semantics: embeddings normalized for length (e.g., BERT/SBERT outputs). Magnitude-invariant similarity: finds items with similar direction regardless of vector norm. |
| `<~>` | Hamming distance (binary) | Binary fingerprints: comparing bit-vector representations (e.g., perceptual image hashes, locality-sensitive hashes). Error detection: counting differing bits in fixed-width codes. |
| `<%>` | Jaccard distance (binary) | Set membership: binary vectors encoding presence/absence (e.g., tag sets, characteristic shingles). Overlap measures: when you care about \|A∩B\| / \|A∪B\| rather than bit differences alone. |
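
Neither binary operator applies to the 6-D float vectors above, but they are easy to try on PostgreSQL's bit type directly; a sketch, assuming pgvector 0.7 or newer (where the bit-vector operators were added):

-- Hamming distance: count of differing bits (positions 2 and 4 differ -> 2)
SELECT '10110'::bit(5) <~> '11100'::bit(5);

-- Jaccard distance: 1 - |A∩B| / |A∪B| over the set bits
-- (2 bits set in both, 4 set in the union -> 1 - 2/4 = 0.5)
SELECT '10110'::bit(5) <%> '11100'::bit(5);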
