Correct confusion matrix calculation-function evaluate_detection_batch #1853


Open
wants to merge 11 commits into base: develop
48 changes: 27 additions & 21 deletions supervision/metrics/detection.py
@@ -299,32 +299,38 @@ def evaluate_detection_batch(
        iou_batch = box_iou_batch(
            boxes_true=true_boxes, boxes_detection=detection_boxes
        )
-       matched_idx = np.asarray(iou_batch > iou_threshold).nonzero()
-
-       if matched_idx[0].shape[0]:
-           matches = np.stack(
-               (matched_idx[0], matched_idx[1], iou_batch[matched_idx]), axis=1
-           )
-           matches = ConfusionMatrix._drop_extra_matches(matches=matches)
-       else:
-           matches = np.zeros((0, 3))
+       matched_gt_idx = set()
+       matched_det_idx = set()

-       matched_true_idx, matched_detection_idx, _ = matches.transpose().astype(
-           np.int16
-       )
+       # For each GT, find best matching detection (highest IoU > threshold)
+       for gt_idx, gt_class in enumerate(true_classes):
+           candidate_det_idxs = np.where(iou_batch[gt_idx] > iou_threshold)[0]
Contributor:

The selection of the best match is happening based solely on IoU, which means a wrong-class prediction can still be chosen over a right-class one if it has a higher IoU.
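For example (a standalone NumPy illustration with made-up IoU values, not the PR's code):

import numpy as np

# One GT of class 0; two predictions: class 1 with IoU 0.95, class 0 with IoU 0.80.
iou_row = np.array([0.95, 0.80])
pred_classes = np.array([1, 0])

best = int(np.argmax(iou_row))  # index 0: the wrong-class box wins on IoU alone
print(pred_classes[best])       # 1 → counted as a misclassification instead of a TP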


-       for i, true_class_value in enumerate(true_classes):
-           j = matched_true_idx == i
-           if matches.shape[0] > 0 and sum(j) == 1:
-               result_matrix[
-                   true_class_value, detection_classes[matched_detection_idx[j]]
-               ] += 1  # TP
+           if len(candidate_det_idxs) == 0:
+               # No matching detection → FN for this GT
+               result_matrix[gt_class, num_classes] += 1
+               continue
+
+           best_det_idx = candidate_det_idxs[
+               np.argmax(iou_batch[gt_idx, candidate_det_idxs])
+           ]
+           det_class = detection_classes[best_det_idx]
+
+           if best_det_idx not in matched_det_idx:
+               # Count as matched regardless of class:
+               # same class → TP, different class → misclassification
+               result_matrix[gt_class, det_class] += 1
+               matched_gt_idx.add(gt_idx)
+               matched_det_idx.add(best_det_idx)
            else:
-               result_matrix[true_class_value, num_classes] += 1  # FN
+               # Detection already matched, GT is FN
+               result_matrix[gt_class, num_classes] += 1
Contributor:

It seems that this logic iterates through the ground truth boxes and, for each one, finds the best-matching detection box, i.e., the one with the highest IoU above the threshold that hasn't been matched yet.

The issue with this logic is that the matching process depends on the order of the ground truth boxes in true_classes. If a single detection box has a high IoU with multiple ground truth boxes, it will be matched with whichever ground truth box is processed first, which can lead to inconsistent and incorrect confusion matrices: the result varies depending on the order of ground truths in the input data.
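A standalone sketch of that order dependence (hypothetical data and helper, not the PR's code):

import numpy as np

def greedy_by_gt_order(iou, order, iou_threshold=0.5):
    """Per-GT greedy matching: each GT claims its best still-unmatched detection."""
    taken, matches = set(), {}
    for gt in order:
        candidates = [d for d in np.where(iou[gt] > iou_threshold)[0] if d not in taken]
        if candidates:
            best = max(candidates, key=lambda d: iou[gt, d])
            matches[gt] = best
            taken.add(best)
    return matches

# Both GTs overlap detection 0; GT 1 overlaps it more strongly.
iou = np.array([[0.60, 0.00],
                [0.90, 0.00]])
print(greedy_by_gt_order(iou, order=[0, 1]))  # {0: 0} → GT 0 claims the detection
print(greedy_by_gt_order(iou, order=[1, 0]))  # {1: 0} → a different confusion matrix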

Author:

Thanks for pointing that out, @soumik12345 — you're absolutely right about the issue with order-dependent matching in the original logic.

To address this, the updated implementation builds a full IoU matrix between all ground truth and detection boxes, then collects all valid matches (IoU above threshold), and sorts them globally — prioritizing class-correct matches first, then by highest IoU. This removes any dependency on the order of the ground truth boxes.

We then greedily assign matches while ensuring each GT and detection is matched at most once, which avoids conflicts where multiple GTs compete for a single detection (see the sketch after the list below).

This approach ensures that:

  • Matching is consistent and order-independent.
  • The most meaningful matches (correct class, highest IoU) are selected first.
  • The confusion matrix is computed accurately, regardless of input order.
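A condensed standalone sketch of that scheme (the helper name match_order_independent is hypothetical; the PR's actual code may differ in details):

import numpy as np

def match_order_independent(iou, gt_classes, det_classes, iou_threshold=0.5):
    """Collect all above-threshold (GT, detection) pairs, sort class-correct
    matches first (then by descending IoU), and assign greedily so each GT
    and each detection is used at most once."""
    gt_idx, det_idx = np.where(iou > iou_threshold)
    pairs = [
        (g, d, gt_classes[g] == det_classes[d], iou[g, d])
        for g, d in zip(gt_idx, det_idx)
    ]
    pairs.sort(key=lambda p: (p[2], p[3]), reverse=True)  # class-correct, then IoU

    matched_gt, matched_det, matches = set(), set(), []
    for g, d, _, _ in pairs:
        if g not in matched_gt and d not in matched_det:
            matches.append((g, d))
            matched_gt.add(g)
            matched_det.add(d)
    return matches

Ignoring exact ties, the output depends only on the IoU matrix and the class labels, not on the row order of the ground truths.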

I wrote some test cases that helped me correct the logic; at first most of them failed, but now all of them pass:

import numpy as np
import sys
import os

# Add current directory or specific path to sys.path
sys.path.insert(0, os.path.abspath("./supervision/"))

import supervision as sv
# print(sv.__file__)

class_names = ['cat', 'dog', 'rabbit']

def cm_from(preds, gts, name, conf_th=0.5, iou_th=0.5):
    """Utility that builds a confusion‑matrix from two Detections objects and prints it."""
    cm = sv.ConfusionMatrix.from_detections(
        predictions=[preds],
        targets=[gts],
        classes=class_names,
        conf_threshold=conf_th,
        iou_threshold=iou_th
    )
    # print(f"{name} ➜ matrix\n{cm.matrix}\n")
    print(f"{name}")
    return cm

def assert_matrix_equal(actual_matrix, expected_matrix, test_name):
    """Assert that two confusion matrices are equal."""
    try:
        np.testing.assert_array_equal(actual_matrix, expected_matrix)
        print(f"✅ {test_name} - PASSED")
        print(f"Actual:\n{actual_matrix}\n")
        return True
    except AssertionError as e:
        print(f"❌ {test_name} - FAILED")
        print(f"Expected:\n{expected_matrix}")
        print(f"Actual:\n{actual_matrix}")
        # print(f"Difference:\n{actual_matrix - expected_matrix}")
        print()  # Add blank line for readability
        return False

# ------------------------------------------------------------
# Test 1: Class priority over IoU - correct class with lower IoU should win
# ------------------------------------------------------------
def test_class_priority():
    gt = sv.Detections(
        xyxy=np.array([[0,0,2,2]]),  # cat
        class_id=np.array([0])
    )
    pred = sv.Detections(
        xyxy=np.array([[0.1,0.1,2.1,2.1],    # cat with IoU ~0.82
                       [0.0,0.0,2.0,2.0]]),   # dog with IoU ~1.0
        class_id=np.array([0, 1]),
        confidence=np.array([0.9, 0.95])
    )
    
    cm = cm_from(pred, gt, "TEST-1 Class priority over IoU")
    
    # Expected confusion matrix shape: 4x4 (3 classes + FP/FN)
    # Rows: GT_cat, GT_dog, GT_rabbit, FP
    # Cols: pred_cat, pred_dog, pred_rabbit, FN
    # GT cat matches pred cat (TP), pred dog is unmatched (FP)
    expected = np.array([
        [1., 0., 0., 0.],  # GT cat: 1 TP (matched with cat pred)
        [0., 0., 0., 0.],  # GT dog: none
        [0., 0., 0., 0.],  # GT rabbit: none
        [0., 1., 0., 0.]   # FP: 1 dog prediction unmatched
    ])
    
    assert_matrix_equal(cm.matrix, expected, "Class priority over IoU")

# ------------------------------------------------------------
# Test 2: Multiple overlapping predictions with different classes
# ------------------------------------------------------------
def test_multiple_overlapping():
    gt = sv.Detections(
        xyxy=np.array([[0,0,2,2], [4,4,6,6]]),  # cat, dog
        class_id=np.array([0, 1])
    )
    pred = sv.Detections(
        xyxy=np.array([[0.1,0.1,2.1,2.1],    # cat (IoU ~0.82)
                       [0.2,0.2,2.2,2.2],    # dog (IoU ~0.68)
                       [0.3,0.3,2.3,2.3],    # rabbit (IoU ~0.57)
                       [4.1,4.1,6.1,6.1]]),  # dog (IoU ~0.82)
        class_id=np.array([0, 1, 2, 1]),
        confidence=np.array([0.9, 0.8, 0.7, 0.85])
    )
    
    cm = cm_from(pred, gt, "TEST-2 Multiple overlapping predictions", iou_th=0.5)
    
    # Expected matrix (4x4):
    # Rows: GT_cat, GT_dog, GT_rabbit, FP
    # Cols: pred_cat, pred_dog, pred_rabbit, FN
    # GT cat matches the class-correct cat pred (TP), GT dog matches its dog pred (TP)
    # The overlapping dog and rabbit preds on the cat box stay unmatched (both FP)
    expected = np.array([
        [1., 0., 0., 0.],  # GT cat: 1 TP (matched with cat pred)
        [0., 1., 0., 0.],  # GT dog: 1 TP (matched with dog pred)
        [0., 0., 0., 0.],  # GT rabbit: none
        [0., 1., 1., 0.]   # FP: 1 dog pred + 1 rabbit pred unmatched
    ])
    
    assert_matrix_equal(cm.matrix, expected, "Multiple overlapping predictions")

# ------------------------------------------------------------
# Test 3: Confidence threshold filtering with edge cases
# ------------------------------------------------------------
def test_confidence_filtering():
    gt = sv.Detections(
        xyxy=np.array([[0,0,2,2], [4,4,6,6]]),
        class_id=np.array([0, 1])
    )
    pred = sv.Detections(
        xyxy=np.array([[0,0,2,2], [4,4,6,6], [8,8,10,10]]),
        class_id=np.array([0, 1, 2]),
        confidence=np.array([0.6, 0.4, 0.8])  # middle one below threshold
    )
    
    cm = cm_from(pred, gt, "TEST-3 Confidence filtering", conf_th=0.5)
    
    # Expected matrix (4x4):
    # Rows: GT_cat, GT_dog, GT_rabbit, FP
    # Cols: pred_cat, pred_dog, pred_rabbit, FN
    # Dog pred filtered out (conf < 0.5), so GT dog becomes FN
    # Rabbit pred has no matching GT, so it's FP
    expected = np.array([
        [1., 0., 0., 0.],  # GT cat: 1 TP (matched with cat pred)
        [0., 0., 0., 1.],  # GT dog: 1 FN (no valid pred due to conf filter)
        [0., 0., 0., 0.],  # GT rabbit: none
        [0., 0., 1., 0.]   # FP: 1 rabbit pred unmatched
    ])
    
    assert_matrix_equal(cm.matrix, expected, "Confidence filtering")

# ------------------------------------------------------------
# Test 4: IoU threshold boundary cases
# ------------------------------------------------------------
def test_iou_threshold_boundary():
    gt = sv.Detections(
        xyxy=np.array([[0,0,2,2], [4,4,6,6]]),
        class_id=np.array([0, 1])
    )
    pred = sv.Detections(
        xyxy=np.array([[0,0,1.5,1.5],    # cat with IoU = 0.5625 (just above threshold)
                       [4,4,5.5,5.5]]),  # dog with IoU = 0.5625 (just above threshold)
        class_id=np.array([0, 1]),
        confidence=np.array([0.9, 0.8])
    )
    
    cm = cm_from(pred, gt, "TEST-4 IoU threshold boundary", iou_th=0.5)
    
    # Expected matrix (4x4):
    # Rows: GT_cat, GT_dog, GT_rabbit, FP
    # Cols: pred_cat, pred_dog, pred_rabbit, FN
    # IoU = 0.5625 clears the strict > 0.5 threshold, so both are TP
    expected = np.array([
        [1., 0., 0., 0.],  # GT cat: 1 TP (IoU above threshold)
        [0., 1., 0., 0.],  # GT dog: 1 TP (IoU above threshold)
        [0., 0., 0., 0.],  # GT rabbit: none
        [0., 0., 0., 0.]   # FP: none
    ])
    
    assert_matrix_equal(cm.matrix, expected, "IoU threshold boundary")

# ------------------------------------------------------------
# Test 5: Chain of overlapping detections
# ------------------------------------------------------------
def test_chain_overlapping():
    gt = sv.Detections(
        xyxy=np.array([[0,0,2,2], [1,1,3,3], [2,2,4,4]]),
        class_id=np.array([0, 1, 2])  # cat, dog, rabbit
    )
    pred = sv.Detections(
        xyxy=np.array([[0.1,0.1,2.1,2.1],    # overlaps with cat and dog
                       [1.9,1.9,3.9,3.9]]),  # overlaps with dog and rabbit
        class_id=np.array([0, 2]),  # cat, rabbit
        confidence=np.array([0.9, 0.8])
    )
    
    cm = cm_from(pred, gt, "TEST-5 Chain overlapping detections")
    
    # Expected matrix (4x4):
    # Rows: GT_cat, GT_dog, GT_rabbit, FP
    # Cols: pred_cat, pred_dog, pred_rabbit, FN
    # Cat pred matches GT cat (TP), rabbit pred matches GT rabbit (TP)
    # GT dog has no matching pred (FN)
    expected = np.array([
        [1., 0., 0., 0.],  # GT cat: 1 TP (matched with cat pred)
        [0., 0., 0., 1.],  # GT dog: 1 FN (no matching pred)
        [0., 0., 1., 0.],  # GT rabbit: 1 TP (matched with rabbit pred)
        [0., 0., 0., 0.]   # FP: none
    ])
    
    assert_matrix_equal(cm.matrix, expected, "Chain overlapping detections")

# ------------------------------------------------------------
# Test 6: All false positives (no ground truth)
# ------------------------------------------------------------
def test_no_ground_truth():
    gt = sv.Detections(
        xyxy=np.empty((0, 4)),
        class_id=np.array([], dtype=int)
    )
    pred = sv.Detections(
        xyxy=np.array([[0,0,2,2], [4,4,6,6], [8,8,10,10]]),
        class_id=np.array([0, 1, 2]),
        confidence=np.array([0.9, 0.8, 0.7])
    )
    
    cm = cm_from(pred, gt, "TEST-6 No ground truth")
    
    # Expected matrix (4x4):
    # Rows: GT_cat, GT_dog, GT_rabbit, FP
    # Cols: pred_cat, pred_dog, pred_rabbit, FN
    # All predictions are FP since no GT exists
    expected = np.array([
        [0., 0., 0., 0.],  # GT cat: none
        [0., 0., 0., 0.],  # GT dog: none
        [0., 0., 0., 0.],  # GT rabbit: none
        [1., 1., 1., 0.]   # FP: 1 each for cat, dog, rabbit preds
    ])
    
    assert_matrix_equal(cm.matrix, expected, "No ground truth")

# ------------------------------------------------------------
# Test 7: Empty predictions and empty ground truth
# ------------------------------------------------------------
def test_empty_detections():
    gt = sv.Detections(
        xyxy=np.empty((0, 4)),
        class_id=np.array([], dtype=int)
    )
    pred = sv.Detections(
        xyxy=np.empty((0, 4)),
        class_id=np.array([], dtype=int),
        confidence=np.array([], dtype=float)
    )
    
    cm = cm_from(pred, gt, "TEST-7 Empty detections")
    
    # Expected matrix (4x4):
    # Rows: GT_cat, GT_dog, GT_rabbit, FP
    # Cols: pred_cat, pred_dog, pred_rabbit, FN
    # All zeros since no GT and no predictions
    expected = np.zeros((len(class_names)+1, len(class_names)+1))
    assert_matrix_equal(cm.matrix, expected, "Empty detections")

# ------------------------------------------------------------
# Test 8: Multi-class, multiple matches and misses
# ------------------------------------------------------------
def test_multi_class_misses():
    gt = sv.Detections(
        xyxy=np.array([[0,0,2,2], [4,4,6,6], [8,8,10,10]]),
        class_id=np.array([0, 1, 2])
    )
    pred = sv.Detections(
        xyxy=np.array([[0,0,2,2], [4,4,6,6], [10,10,12,12]]),
        class_id=np.array([0, 2, 1]),
        confidence=np.array([0.9, 0.8, 0.7])
    )
    
    cm = cm_from(pred, gt, "TEST-8 Multi-class misses")
    
    # Expected matrix (4x4):
    # Rows: GT_cat, GT_dog, GT_rabbit, FP
    # Cols: pred_cat, pred_dog, pred_rabbit, FN
    # GT cat matches pred cat (TP)
    # GT dog matches pred rabbit (misclassification: GT dog → pred rabbit)
    # GT rabbit has no matching pred (FN)
    # Pred dog has no matching GT (FP)
    expected = np.array([
        [1., 0., 0., 0.],  # GT cat: 1 TP (matched with cat pred)
        [0., 0., 1., 0.],  # GT dog: misclassified as rabbit pred
        [0., 0., 0., 1.],  # GT rabbit: 1 FN (no matching pred)
        [0., 1., 0., 0.]   # FP: 1 dog pred unmatched
    ])
    
    assert_matrix_equal(cm.matrix, expected, "Multi-class misses")

# ------------------------------------------------------------
# Test 9: Complex multiple predictions with mixed results
# ------------------------------------------------------------
def test_complex_multiple():
    gt = sv.Detections(
        xyxy=np.array([[0,0,2,2], [4,4,6,6], [8,8,10,10], [12,12,14,14]]),
        class_id=np.array([0, 1, 2, 0])
    )
    pred = sv.Detections(
        xyxy=np.array([[0,0,2,2], [4,4,6,6], [8,8,10,10], [12,12,14,14], [16,16,18,18]]),
        class_id=np.array([0, 1, 1, 2, 2]),
        confidence=np.array([0.9, 0.8, 0.7, 0.6, 0.5])
    )
    
    cm = cm_from(pred, gt, "TEST-9 Complex multiple predictions")
    
    # Expected matrix (4x4):
    # Rows: GT_cat, GT_dog, GT_rabbit, FP
    # Cols: pred_cat, pred_dog, pred_rabbit, FN
    # Matches: GT cat[0,0,2,2] → pred cat[0,0,2,2] (TP)
    #          GT dog[4,4,6,6] → pred dog[4,4,6,6] (TP)
    #          GT rabbit[8,8,10,10] → pred dog[8,8,10,10] (misclassified as dog)
    #          GT cat[12,12,14,14] → pred rabbit[12,12,14,14] (misclassified as rabbit)
    #          pred rabbit[16,16,18,18] → no GT match (FP)
    expected = np.array([
        [1., 0., 1., 0.],  # GT cat: 1 TP + 1 misclassified as rabbit
        [0., 1., 0., 0.],  # GT dog: 1 TP
        [0., 1., 0., 0.],  # GT rabbit: 1 misclassified as dog
        [0., 0., 1., 0.]   # FP: 1 rabbit pred unmatched
    ])

    assert_matrix_equal(cm.matrix, expected, "Complex multiple predictions")

# ------------------------------------------------------------
# Test 10: Large complex example with multi-class and misses
# ------------------------------------------------------------
def test_large_complex():
    gt = sv.Detections(
        xyxy=np.array([[0,0,2,2], [4,4,6,6], [8,8,10,10], [12,12,14,14]]),
        class_id=np.array([0, 1, 2, 0])
    )
    pred = sv.Detections(
        xyxy=np.array([[0,0,2,2], [4,4,6,6], [8,8,10,10], [12,12,14,14], [16,16,18,18], [18,18,20,20]]),
        class_id=np.array([0, 0, 1, 2, 1, 2]),
        confidence=np.array([0.9, 0.8, 0.7, 0.6, 0.5, 0.4])
    )
    
    cm = cm_from(pred, gt, "TEST-10 Large complex example", conf_th=0.5)
    
    # Expected matrix (4x4):
    # Rows: GT_cat, GT_dog, GT_rabbit, FP
    # Cols: pred_cat, pred_dog, pred_rabbit, FN
    # After confidence filtering (≥0.5): last pred is filtered out
    # Matches: GT cat[0,0,2,2] → pred cat[0,0,2,2] (TP)
    #          GT dog[4,4,6,6] → pred cat[4,4,6,6] (misclassified as cat)
    #          GT rabbit[8,8,10,10] → pred dog[8,8,10,10] (misclassified as dog)
    #          GT cat[12,12,14,14] → pred rabbit[12,12,14,14] (misclassified as rabbit)
    #          pred dog[16,16,18,18] → no GT match (FP)
    expected = np.array([
        [1., 0., 1., 0.],  # GT cat: 1 TP + 1 misclassified as rabbit
        [1., 0., 0., 0.],  # GT dog: 1 misclassified as cat
        [0., 1., 0., 0.],  # GT rabbit: 1 misclassified as dog
        [0., 1., 0., 0.]   # FP: 1 dog pred unmatched
    ])
 
    assert_matrix_equal(cm.matrix, expected, "Large complex example")

# ------------------------------------------------------------
# Test 11: High counts with multiple TPs and misclassifications
# ------------------------------------------------------------
def test_high_counts_misclass():
    gt = sv.Detections(
        xyxy=np.array([
            [0,0,2,2], [0,3,2,5], [0,6,2,8],   # 3 cats
            [4,0,6,2], [4,3,6,5],              # 2 dogs
            [8,0,10,2], [8,3,10,5]             # 2 rabbits
        ]),
        class_id=np.array([0,0,0, 1,1, 2,2])
    )
    pred = sv.Detections(
        xyxy=np.array([
            [0,0,2,2], [0,3,2,5], [0,6,2,8],   # cats → cat  (3 TP)
            [4,0,6,2], [4,3,6,5],              # dogs → rabbit (2 confused)
            [8,0,10,2],                        # rabbit → rabbit (1 TP)
            [12,0,14,2]                        # stray cat (FP)
        ]),
        class_id=np.array([0,0,0, 2,2, 2, 0]),
        confidence=np.array([.95,.95,.95, .9,.9, .9, .8])
    )

    cm = cm_from(pred, gt, "TEST-11 High counts & misclassifications")

    expected = np.array([
        [3., 0., 0., 0.],  # GT cat: 3 TP
        [0., 0., 2., 0.],  # GT dog: 2 confused as rabbit
        [0., 0., 1., 1.],  # GT rabbit: 1 TP, 1 FN
        [1., 0., 0., 0.]   # FP: 1 stray cat prediction
    ])

    assert_matrix_equal(cm.matrix, expected, "High counts & misclassifications")

# ------------------------------------------------------------
# Test 12: Symmetric multi‑class confusions with higher counts
# ------------------------------------------------------------
def test_symmetric_multi_confusions():
    gt = sv.Detections(
        xyxy=np.array([
            [0,0,2,2], [0,4,2,6],             # 2 cats
            [4,0,6,2], [4,4,6,6],             # 2 dogs
            [8,0,10,2], [8,4,10,6]            # 2 rabbits
        ]),
        class_id=np.array([0,0, 1,1, 2,2])
    )
    pred = sv.Detections(
        xyxy=np.array([
            [0,0,2,2], [0,4,2,6],             # cats → cat  (2 TP)
            [4,0,6,2], [4,4,6,6],             # dogs → dog  (2 TP)
            [8,0,10,2], [8,4,10,6],           # rabbits → cat (2 confused)
            [12,0,14,2], [12,4,14,6]          # stray dogs (FP × 2)
        ]),
        class_id=np.array([0,0, 1,1, 0,0, 1,1]),
        confidence=np.array([.9,.9, .9,.9, .9,.9, .8,.8])
    )

    cm = cm_from(pred, gt, "TEST-12 Symmetric multi‑class confusions")

    expected = np.array([
        [2., 0., 0., 0.],  # GT cat: 2 TP
        [0., 2., 0., 0.],  # GT dog: 2 TP
        [2., 0., 0., 0.],  # GT rabbit: 2 confused as cat
        [0., 2., 0., 0.]   # FP: 2 stray dog predictions
    ])

    assert_matrix_equal(cm.matrix, expected, "Symmetric multi‑class confusions")


# Run all tests
if __name__ == "__main__":
    test_class_priority()
    test_multiple_overlapping()
    test_confidence_filtering()
    test_iou_threshold_boundary()
    test_chain_overlapping()
    test_no_ground_truth()
    test_empty_detections()
    test_multi_class_misses()
    test_complex_multiple()
    test_large_complex()
    test_high_counts_misclass()
    test_symmetric_multi_confusions()

If you find any other issues, I'll be happy to address them!


-       for i, detection_class_value in enumerate(detection_classes):
-           if not any(matched_detection_idx == i):
-               result_matrix[num_classes, detection_class_value] += 1  # FP
+       # unmatched detections are FP
+       for det_idx, det_class in enumerate(detection_classes):
+           if det_idx not in matched_det_idx:
+               result_matrix[num_classes, det_class] += 1

        return result_matrix
