
Commit 3b7c30b

Merge pull request sgkit-dev#15 from eric-czech/add_gp
Return genotype probabilities and dosages
2 parents f80601b + 93f9708

File tree

2 files changed: +124 −17 lines


sgkit_bgen/bgen_reader.py

Lines changed: 36 additions & 16 deletions
@@ -94,28 +94,38 @@ def max_str_len(arr: ArrayLike) -> Any:
             else:
                 self.sample_id = generate_samples(bgen.nsamples)
 
-        self.shape = (self.n_variants, len(self.sample_id))
+        self.shape = (self.n_variants, len(self.sample_id), 3)
         self.dtype = dtype
-        self.ndim = 2
+        self.ndim = 3
 
     def __getitem__(self, idx):
         if not isinstance(idx, tuple):
-            raise IndexError(  # pragma: no cover
-                f"Indexer must be tuple (received {type(idx)})"
-            )
+            raise IndexError(f"Indexer must be tuple (received {type(idx)})")
         if len(idx) != self.ndim:
-            raise IndexError(  # pragma: no cover
-                f"Indexer must be two-item tuple (received {len(idx)} slices)"
+            raise IndexError(
+                f"Indexer must have {self.ndim} items (received {len(idx)} slices)"
+            )
+        if not all(isinstance(i, slice) or isinstance(i, int) for i in idx):
+            raise IndexError(
+                f"Indexer must contain only slices or ints (received types {[type(i) for i in idx]})"
             )
+        # Determine which dims should have unit size in result
+        squeeze_dims = tuple(i for i in range(len(idx)) if isinstance(idx[i], int))
+        # Convert all indexers to slices
+        idx = tuple(slice(i, i + 1) if isinstance(i, int) else i for i in idx)
 
         if idx[0].start == idx[0].stop:
-            return np.empty((0, 0), dtype=self.dtype)
+            return np.empty((0,) * self.ndim, dtype=self.dtype)
 
+        # Determine start and end partitions that correspond to the
+        # given variant dimension indexer
         start_partition = idx[0].start // self.partition_size
         start_partition_offset = idx[0].start % self.partition_size
         end_partition = (idx[0].stop - 1) // self.partition_size
         end_partition_offset = (idx[0].stop - 1) % self.partition_size
 
+        # Create a list of all offsets into the underlying file at which
+        # data for each variant begins
         all_vaddr = []
         with bgen_metafile(self.metafile_filepath) as mf:
             for i in range(start_partition, end_partition + 1):
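
The indexer normalization added above is easy to replay in isolation. Below is a minimal sketch (not from this diff) that traces one request through the same logic; the `partition_size` of 100 is an arbitrary illustrative value, whereas the real reader derives it from the BGEN metafile:

# Sketch: how reader[5, 0:500, :] would be normalized
idx = (5, slice(0, 500), slice(None))

# Integer indexers mark dimensions to drop from the result ...
squeeze_dims = tuple(i for i in range(len(idx)) if isinstance(idx[i], int))
assert squeeze_dims == (0,)

# ... then every indexer is widened to a unit slice so the body only sees slices
idx = tuple(slice(i, i + 1) if isinstance(i, int) else i for i in idx)
assert idx[0] == slice(5, 6)

# Partition arithmetic for the variant dimension: slice [5, 6) with an
# assumed partition_size of 100 touches only partition 0, at offset 5
partition_size = 100
start_partition = idx[0].start // partition_size            # 0
start_partition_offset = idx[0].start % partition_size      # 5
end_partition = (idx[0].stop - 1) // partition_size         # 0
end_partition_offset = (idx[0].stop - 1) % partition_size   # 5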
@@ -129,21 +139,27 @@ def __getitem__(self, idx):
                 vaddr = partition["vaddr"].tolist()
                 all_vaddr.extend(vaddr[start_offset:end_offset])
 
+        # Read the probabilities for each variant, apply indexer for
+        # samples dimension to give probabilities for all genotypes,
+        # and then apply final genotype dimension indexer
         with bgen_file(self.path) as bgen:
             res = None
             for i, vaddr in enumerate(all_vaddr):
                 probs = bgen.read_genotype(vaddr)["probs"][idx[1]]
-                dosage = _to_dosage(probs)
+                assert len(probs.shape) == 2 and probs.shape[1] == 3
                 if res is None:
-                    res = np.zeros((len(all_vaddr), len(dosage)), dtype=self.dtype)
-                res[i] = dosage
-            return res
+                    res = np.zeros((len(all_vaddr), len(probs), 3), dtype=self.dtype)
+                res[i] = probs
+            res = res[..., idx[2]]
+            return np.squeeze(res, axis=squeeze_dims)
 
 
 def _to_dosage(probs: ArrayLike):
     """Calculate the dosage from genotype likelihoods (probabilities)"""
-    assert len(probs.shape) == 2 and probs.shape[1] == 3
-    return 2 * probs[:, -1] + probs[:, 1]
+    assert (
+        probs.shape[-1] == 3
+    ), f"Expecting genotype (trailing) dimension of size 3, got array of shape {probs.shape}"
+    return probs[..., 1] + 2 * probs[..., 2]
 
 
 def read_bgen(
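
As a sanity check on the reworked `_to_dosage` (a worked example, not part of the diff): with trailing probabilities ordered [P(hom ref), P(het), P(hom alt)], the dosage is the expected count of the alternate allele, so the new `probs[..., 1] + 2 * probs[..., 2]` agrees with the old 2-D expression while also handling extra leading dimensions:

import numpy as np

# One call with P(hom ref)=0.1, P(het)=0.2, P(hom alt)=0.7
probs = np.array([[0.1, 0.2, 0.7]])
# Expected ALT allele count: 0*0.1 + 1*0.2 + 2*0.7 = 1.6
dosage = probs[..., 1] + 2 * probs[..., 2]
np.testing.assert_allclose(dosage, [1.6])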
@@ -162,7 +178,8 @@ def read_bgen(
     path : PathType
         Path to BGEN file.
     chunks : Union[str, int, tuple], optional
-        Chunk size for genotype data, by default "auto"
+        Chunk size for genotype probability data (3 dimensions),
+        by default "auto".
     lock : bool, optional
         Whether or not to synchronize concurrent reads of
         file blocks, by default False. This is passed through to
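
Since the probabilities are now 3-dimensional (variants, samples, genotypes), tuple chunk sizes need three entries, mirroring the CHUNKS cases in the tests below. A hypothetical call with a placeholder path:

from sgkit_bgen import read_bgen

# Trailing chunk entry covers the genotype dimension of size 3
ds = read_bgen("example.bgen", chunks=(100, 200, 3))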
@@ -190,13 +207,15 @@ def read_bgen(
 
     sample_id = np.array(bgen_reader.sample_id, dtype=str)
 
-    call_dosage = da.from_array(
+    call_genotype_probability = da.from_array(
         bgen_reader,
         chunks=chunks,
         lock=lock,
+        fancy=False,
         asarray=False,
         name=f"{bgen_reader.name}:read_bgen:{path}",
     )
+    call_dosage = _to_dosage(call_genotype_probability)
 
     ds = create_genotype_dosage_dataset(
         variant_contig_names=variant_contig_names,
@@ -205,6 +224,7 @@
         variant_alleles=variant_alleles,
         sample_id=sample_id,
         call_dosage=call_dosage,
+        call_genotype_probability=call_genotype_probability,
         variant_id=variant_id,
     )

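One note on the new `fancy=False` flag: it tells dask that the wrapped object only supports slice-based indexing, so chunk reads are routed through plain `__getitem__` calls like the one implemented above. A minimal sketch under that assumption, with `DummyReader` as a hypothetical stand-in for `BgenReader`:

import dask.array as da
import numpy as np

class DummyReader:
    """Hypothetical stand-in exposing the same array protocol as BgenReader."""

    shape = (199, 500, 3)
    ndim = 3
    dtype = np.dtype("float32")

    def __getitem__(self, idx):
        # With fancy=False, dask passes tuples of slices here
        sizes = tuple(len(range(*s.indices(n))) for s, n in zip(idx, self.shape))
        return np.zeros(sizes, dtype=self.dtype)

arr = da.from_array(DummyReader(), chunks=(100, 200, 3), fancy=False, asarray=False)
assert arr.shape == (199, 500, 3)
arr[:5, :2].compute()  # each chunk read goes through DummyReader.__getitem__
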
sgkit_bgen/tests/test_bgen_reader.py

Lines changed: 88 additions & 1 deletion
@@ -1,9 +1,41 @@
+import numpy as np
 import numpy.testing as npt
 import pytest
 from sgkit_bgen import read_bgen
+from sgkit_bgen.bgen_reader import BgenReader
 
+CHUNKS = [
+    (100, 200, 3),
+    (100, 200, 1),
+    (100, 500, 3),
+    (199, 500, 3),
+    ((100, 99), 500, 2),
+    "auto",
+]
+INDEXES = [0, 10, 20, 100, -1]
 
-@pytest.mark.parametrize("chunks", [(100, 200), (100, 500), (199, 500), "auto"])
+# Expectations below generated using bgen-reader directly, ex:
+# > from bgen_reader import open_bgen
+# > bgen = open_bgen('sgkit_bgen/tests/data/example.bgen', verbose=False)
+# > bgen.read(-1)[0]  # Probabilities for last variant, first sample
+# array([[0.0133972 , 0.98135378, 0.00524902]]
+# > bgen.allele_expectation(-1)[0, 0, -1]  # Dosage for last variant, first sample
+# 0.9918518217727197
+EXPECTED_PROBABILITIES = np.array(
+    [  # Generated using bgen-reader directly
+        [np.nan, np.nan, np.nan],
+        [0.007, 0.966, 0.0259],
+        [0.993, 0.002, 0.003],
+        [0.916, 0.007, 0.0765],
+        [0.013, 0.981, 0.0052],
+    ]
+)
+EXPECTED_DOSAGES = np.array(
+    [np.nan, 1.018, 0.010, 0.160, 0.991]  # Generated using bgen-reader directly
+)
+
+
+@pytest.mark.parametrize("chunks", CHUNKS)
 def test_read_bgen(shared_datadir, chunks):
     path = shared_datadir / "example.bgen"
     ds = read_bgen(path, chunks=chunks)
@@ -12,6 +44,21 @@ def test_read_bgen(shared_datadir, chunks):
     assert ds["call_dosage"].shape == (199, 500)
     npt.assert_almost_equal(ds["call_dosage"].values[1][0], 1.987, decimal=3)
     npt.assert_almost_equal(ds["call_dosage"].values[100][0], 0.160, decimal=3)
+    npt.assert_array_equal(ds["call_dosage_mask"].values[0, 0], [True])
+    npt.assert_array_equal(ds["call_dosage_mask"].values[0, 1], [False])
+    assert ds["call_genotype_probability"].shape == (199, 500, 3)
+    npt.assert_almost_equal(
+        ds["call_genotype_probability"].values[1][0], [0.005, 0.002, 0.992], decimal=3
+    )
+    npt.assert_almost_equal(
+        ds["call_genotype_probability"].values[100][0], [0.916, 0.007, 0.076], decimal=3
+    )
+    npt.assert_array_equal(
+        ds["call_genotype_probability_mask"].values[0, 0], [True] * 3
+    )
+    npt.assert_array_equal(
+        ds["call_genotype_probability_mask"].values[0, 1], [False] * 3
+    )
 
 
 def test_read_bgen_with_sample_file(shared_datadir):
@@ -38,3 +85,43 @@ def test_read_bgen_with_no_samples(shared_datadir):
         "sample_3",
         "sample_4",
     ]
+
+
+@pytest.mark.parametrize("chunks", CHUNKS)
+def test_read_bgen_fancy_index(shared_datadir, chunks):
+    path = shared_datadir / "example.bgen"
+    ds = read_bgen(path, chunks=chunks)
+    npt.assert_almost_equal(
+        ds["call_genotype_probability"][INDEXES, 0], EXPECTED_PROBABILITIES, decimal=3
+    )
+    npt.assert_almost_equal(ds["call_dosage"][INDEXES, 0], EXPECTED_DOSAGES, decimal=3)
+
+
+@pytest.mark.parametrize("chunks", CHUNKS)
+def test_read_bgen_scalar_index(shared_datadir, chunks):
+    path = shared_datadir / "example.bgen"
+    ds = read_bgen(path, chunks=chunks)
+    for i, ix in enumerate(INDEXES):
+        npt.assert_almost_equal(
+            ds["call_genotype_probability"][ix, 0], EXPECTED_PROBABILITIES[i], decimal=3
+        )
+        npt.assert_almost_equal(
+            ds["call_dosage"][ix, 0], EXPECTED_DOSAGES[i], decimal=3
+        )
+        for j in range(3):
+            npt.assert_almost_equal(
+                ds["call_genotype_probability"][ix, 0, j],
+                EXPECTED_PROBABILITIES[i, j],
+                decimal=3,
+            )
+
+
+def test_read_bgen_raise_on_invalid_indexers(shared_datadir):
+    path = shared_datadir / "example.bgen"
+    reader = BgenReader(path)
+    with pytest.raises(IndexError, match="Indexer must be tuple"):
+        reader[[0]]
+    with pytest.raises(IndexError, match="Indexer must have 3 items"):
+        reader[(slice(None),)]
+    with pytest.raises(IndexError, match="Indexer must contain only slices or ints"):
+        reader[([0], [0], [0])]
