[CORE-4523] feat(Vector Search): Merge to 4.2.0; (#177)

MichaelZengTG · jue-yuan · web-flow · commit aaa1bc61e94a · 2025-01-31T07:20:41.000-08:00
* [GLE-8861] feat(vector): built-in TG function for pairwise vector embedding;

* [GLE-8861] change euclidean to l2;

* [GLE-8861] add missing range for foreach statements;

* [GLE-8861] address comments;

* [GLE-8861] add OR REPLACE for each GSQL function;

---------

Co-authored-by: jue-yuan <jue.yuan@tigergraph.com>
diff --git a/gds/vector/cosine_distance.gsql b/gds/vector/cosine_distance.gsql
@@ -0,0 +1,67 @@
+CREATE OR REPLACE FUNCTION gds.vector.cosine_distance(list<double> list1, list<double> list2) RETURNS(float) {
+
+  /*
+  First Author: Jue Yuan
+  First Commit Date:  Nov 27, 2024
+
+  Recent Author: Jue Yuan
+  Recent Commit Date: Nov 27, 2024
+
+  Maturity:
+      alpha
+
+  Description:
+      Calculates the cosine distance between two vectors represented as lists of doubles.
+      The cosine distance is 1 minus the cosine similarity, so it depends only on the angle
+      between two non-zero vectors. A distance of 0 means the vectors point in the same direction,
+      1 means they are orthogonal, and 2 means they point in opposite directions.
+
+  Parameters:
+      list<double> list1:
+          The first vector as a list of double values.
+      list<double> list2:
+          The second vector as a list of double values.
+
+  Returns:
+      float:
+          The cosine distance between the two input vectors.
+  Exceptions:
+      list_size_mismatch (90000):
+          Raised when the input lists are not of equal size.
+      zero_divisor (90001):
+          Raised when the magnitude of either list is below 0.0000001, to avoid a zero divisor.
+
+  Logic Overview:
+      Validates that both input vectors have the same length.
+      Computes the inner (dot) product of the two vectors.
+      Calculates the magnitudes (Euclidean norms) of both vectors and rejects near-zero ones.
+      Returns the cosine distance as 1 - (inner product) / (product of magnitudes).
+
+  Use Case:
+      This function is commonly used in machine learning, natural language processing,
+      and information retrieval tasks to quantify the similarity between vector representations,
+      such as word embeddings or document feature vectors.
+  */
+
+  EXCEPTION list_size_mismatch (90000);
+  EXCEPTION zero_divisor(90001);
+  ListAccum<double> @@myList1 = list1;
+  ListAccum<double> @@myList2 = list2;
+
+  IF (@@myList1.size() != @@myList2.size()) THEN
+    RAISE list_size_mismatch ("Two lists provided for gds.vector.cosine_distance have different sizes.");
+  END;
+
+  double inner_p = inner_product(@@myList1, @@myList2);
+  double v1_magn = sqrt(inner_product(@@myList1, @@myList1));
+  double v2_magn = sqrt(inner_product(@@myList2, @@myList2));
+  IF (abs(v1_magn) < 0.0000001) THEN
+    // use a small positive float to avoid numeric comparison error
+    RAISE zero_divisor ("The elements in the first list are all zero. It will introduce a zero divisor.");
+  END;
+  IF (abs(v2_magn) < 0.0000001) THEN
+    // guard the SECOND magnitude; it is the other factor of the divisor below
+    RAISE zero_divisor ("The elements in the second list are all zero. It will introduce a zero divisor.");
+  END;
+  RETURN (1 - inner_p / (v1_magn * v2_magn));
+}
diff --git a/gds/vector/dimension_count.gsql b/gds/vector/dimension_count.gsql
@@ -0,0 +1,37 @@
+CREATE OR REPLACE FUNCTION gds.vector.dimension_count(list<double> list1) RETURNS(int) {
+
+  /*
+  First Author: Jue Yuan
+  First Commit Date:  Nov 27, 2024
+
+  Recent Author: Jue Yuan
+  Recent Commit Date: Nov 27, 2024
+
+  Maturity:
+      alpha
+
+  Description:
+      Reports how many elements (dimensions) a vector holds.
+      The vector is passed in as a list of doubles, and the length of that
+      list is returned as the dimension count.
+
+  Parameters:
+      list<double> list1:
+          Vector whose length is requested.
+
+  Returns:
+      int:
+          Number of elements in list1.
+
+  Logic Overview:
+      Copies list1 into a ListAccum.
+      Returns the size() of that accumulator.
+
+  Use Case:
+      Checking that vectors have the expected dimensionality before
+      running distance or other vector computations on them.
+  */
+
+  ListAccum<double> @@dims = list1;
+  RETURN @@dims.size();
+}
diff --git a/gds/vector/distance.gsql b/gds/vector/distance.gsql
@@ -0,0 +1,99 @@
+CREATE OR REPLACE FUNCTION gds.vector.distance(list<double> list1, list<double> list2, string metric) RETURNS(float) {
+
+  /*
+  First Author: Jue Yuan
+  First Commit Date:  Nov 27, 2024
+
+  Recent Author: Jue Yuan
+  Recent Commit Date: Nov 27, 2024
+
+  Maturity:
+      alpha
+
+  Description:
+      Computes a distance between two equal-length vectors, each supplied as a
+      list of doubles. The caller picks the metric by name; the name is
+      lower-cased before matching, so "L2" and "l2" are treated alike.
+
+  Parameters:
+      list<double> list1:
+          First vector.
+      list<double> list2:
+          Second vector; must have the same number of elements as list1.
+      string metric:
+          Name of the metric (case-insensitive). One of:
+              "cosine": cosine distance
+              "l2": Euclidean distance
+              "ip": inner (dot) product
+
+  Returns:
+      float:
+          The value of the selected metric for list1 and list2.
+
+  Exceptions:
+      list_size_mismatch (90000):
+          Raised when list1 and list2 differ in size.
+      zero_divisor (90001):
+          Raised for the "cosine" metric when the magnitude of either vector
+          is below 0.0000001, since the formula would divide by (near) zero.
+      invalid_metric_type (90002):
+          Raised when metric is not one of the supported names.
+
+  Logic Overview:
+      Input Validation:
+          Both lists are copied into ListAccums and their sizes compared.
+      Metric Handling:
+          cosine:
+              1 - (inner product) / (product of the two magnitudes).
+          l2:
+              Square root of the summed squared element-wise differences.
+          ip:
+              The inner product itself.
+      Error Handling:
+          Any other metric name raises invalid_metric_type.
+
+  Use Case:
+      Comparing vector representations, such as embeddings or feature
+      vectors, in machine learning, data science, and information
+      retrieval workloads.
+  */
+
+  EXCEPTION list_size_mismatch (90000);
+  EXCEPTION zero_divisor(90001);
+  EXCEPTION invalid_metric_type (90002);
+  ListAccum<double> @@vecA = list1;
+  ListAccum<double> @@vecB = list2;
+
+  IF (@@vecA.size() != @@vecB.size()) THEN
+    RAISE list_size_mismatch ("Two lists provided for gds.vector.distance have different sizes.");
+  END;
+
+  SumAccum<float> @@dist;
+  SumAccum<float> @@sqDiffTotal;
+  string metric_key = lower(metric);
+
+  IF (metric_key == "cosine") THEN
+    double dot_ab = inner_product(@@vecA, @@vecB);
+    double norm_a = sqrt(inner_product(@@vecA, @@vecA));
+    double norm_b = sqrt(inner_product(@@vecB, @@vecB));
+    // compare against a small positive threshold rather than exactly 0
+    IF (abs(norm_a) < 0.0000001) THEN
+      RAISE zero_divisor ("The elements in the first list are all zero. It will introduce a zero divisor.");
+    END;
+    IF (abs(norm_b) < 0.0000001) THEN
+      RAISE zero_divisor ("The elements in the second list are all zero. It will introduce a zero divisor.");
+    END;
+    @@dist = 1 - dot_ab / (norm_a * norm_b);
+  ELSE IF (metric_key == "l2") THEN
+    FOREACH idx IN RANGE [0, @@vecA.size() - 1 ] DO
+      @@sqDiffTotal += (@@vecA.get(idx) - @@vecB.get(idx)) * (@@vecA.get(idx) - @@vecB.get(idx));
+    END;
+    @@dist = sqrt(@@sqDiffTotal);
+  ELSE IF (metric_key == "ip") THEN
+    @@dist = inner_product(@@vecA, @@vecB);
+  ELSE
+    RAISE invalid_metric_type ("Invalid metric algorithm provided, currently supported: cosine, l2 and ip.");
+  END;
+
+  RETURN @@dist;
+}
diff --git a/gds/vector/elements_sum.gsql b/gds/vector/elements_sum.gsql
@@ -0,0 +1,41 @@
+CREATE OR REPLACE FUNCTION gds.vector.elements_sum(list<double> list1) RETURNS(float) {
+
+  /*
+  First Author: Jue Yuan
+  First Commit Date:  Nov 27, 2024
+
+  Recent Author: Jue Yuan
+  Recent Commit Date: Nov 27, 2024
+
+  Maturity:
+      alpha
+
+  Description:
+      Adds up every element of a vector given as a list of doubles and
+      returns the total.
+
+  Parameters:
+      list<double> list1:
+          Vector whose elements are summed.
+
+  Returns:
+      float:
+          Sum of the elements of list1.
+
+  Logic Overview:
+      Walks list1 element by element.
+      Adds each element to a SumAccum<float>.
+      Returns the accumulated total.
+
+  Use Case:
+      Aggregating vector components, for example as a building block for
+      statistics or sanity checks over embedding values.
+  */
+
+  SumAccum<float> @@total;
+
+  FOREACH x IN list1 DO
+    @@total += x;
+  END;
+  RETURN @@total;
+}
diff --git a/gds/vector/ip_distance.gsql b/gds/vector/ip_distance.gsql
@@ -0,0 +1,58 @@
+CREATE OR REPLACE FUNCTION gds.vector.ip_distance(list<double> list1, list<double> list2) RETURNS(float) {
+
+  /*
+  First Author: Jue Yuan
+  First Commit Date:  Nov 27, 2024
+
+  Recent Author: Jue Yuan
+  Recent Commit Date: Nov 27, 2024
+
+  Maturity:
+      alpha
+
+  Description:
+      Returns the inner product (dot product) of two equal-length vectors, each
+      given as a list of doubles. Despite the "distance" in the name, the value
+      returned is the inner product itself, with no further transformation.
+
+  Parameters:
+      list<double> list1:
+          First vector.
+      list<double> list2:
+          Second vector; must be the same size as list1.
+
+  Returns:
+      float:
+          Inner product of list1 and list2.
+
+  Exceptions:
+      list_size_mismatch (90000):
+          Raised when list1 and list2 differ in size.
+
+  Logic Overview:
+      Input Validation:
+          Compares the sizes of the two lists and raises on a mismatch.
+      Inner Product Calculation:
+          Delegates to inner_product() on the two lists.
+
+  Formula:
+      Inner Product = (x1 * y1) + (x2 * y2) + ... + (xn * yn)
+      where xi and yi are the elements of list1 and list2, respectively.
+
+  Use Case:
+      Commonly used for:
+          Similarity scoring in machine learning models (e.g. recommendation systems).
+          Vector projections in linear algebra.
+          Comparing embeddings in natural language processing (NLP).
+  */
+
+  EXCEPTION list_size_mismatch (90000);
+  ListAccum<double> @@lhs = list1;
+  ListAccum<double> @@rhs = list2;
+
+  IF (@@lhs.size() != @@rhs.size()) THEN
+    RAISE list_size_mismatch ("Two lists provided for gds.vector.ip_distance have different sizes.");
+  END;
+
+  RETURN inner_product(@@lhs, @@rhs);
+}
diff --git a/gds/vector/kth_element.gsql b/gds/vector/kth_element.gsql
@@ -0,0 +1,54 @@
+CREATE OR REPLACE FUNCTION gds.vector.kth_element(list<double> list1, int kth_index) RETURNS(float) {
+
+  /*
+  First Author: Jue Yuan
+  First Commit Date:  Nov 27, 2024
+
+  Recent Author: Jue Yuan
+  Recent Commit Date: Nov 27, 2024
+
+  Maturity:
+      alpha
+
+  Description:
+      Returns the element stored at a given zero-based position of a vector
+      supplied as a list of doubles. The position is range-checked first, so
+      an invalid index produces a named exception.
+
+  Parameters:
+      list<double> list1:
+          Vector to read from.
+      int kth_index:
+          Zero-based position of the wanted element.
+
+  Returns:
+      float:
+          Element of list1 at position kth_index.
+
+  Exceptions:
+      out_of_range (90000):
+          Raised when kth_index is negative, or greater than or equal to the
+          size of list1.
+
+  Logic Overview:
+      Input Validation:
+          Rejects any index outside 0 .. (list size - 1).
+      Element Retrieval:
+          Reads the element with get(kth_index).
+
+  Use Case:
+      Picking out individual components of a vector, for example:
+          Extracting a single feature from a feature vector.
+          Feeding one coordinate into a custom computation.
+          Inspecting one dimension of an embedding.
+  */
+
+  EXCEPTION out_of_range (90000);
+
+  ListAccum<double> @@items = list1;
+  IF (kth_index < 0 OR kth_index >= @@items.size()) THEN
+    RAISE out_of_range("Kth index provided for gds.vector.kth_element is out of the range of this list.");
+  END;
+
+  RETURN @@items.get(kth_index);
+}
diff --git a/gds/vector/l2_distance.gsql b/gds/vector/l2_distance.gsql
diff --git a/gds/vector/norm.gsql b/gds/vector/norm.gsql