Skip to content

Commit 91e8912

Browse files
committed
Precompute string lengths sgkit-dev#12
1 parent 5ef6aac commit 91e8912

File tree

1 file changed

+22
-6
lines changed

1 file changed

+22
-6
lines changed

sgkit_plink/pysnptools.py

+22-6
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
"""PLINK 1.9 reader implementation"""
22
from pathlib import Path
3-
from typing import Optional, Union
3+
from typing import Optional, Union, Mapping, Any
44

55
import dask.array as da
66
import dask.dataframe as dd
77
import numpy as np
88
from dask.dataframe import DataFrame
9+
from dask.array import Array
910
from pysnptools.snpreader import Bed
1011
from xarray import Dataset
1112

@@ -92,11 +93,26 @@ def close(self):
9293
self.bed._close_bed() # pragma: no cover
9394

9495

95-
def _to_dict(df, dtype=None):
96-
return {
97-
c: df[c].to_dask_array(lengths=True).astype(dtype[c] if dtype else df[c].dtype)
98-
for c in df
99-
}
96+
def _max_str_len(arr: Array) -> Array:
    """Return the length of the longest string in ``arr`` as a lazy scalar.

    Each block is converted with ``astype(str)`` and measured with
    ``np.char.str_len``; the per-block lengths are then reduced with ``max``.

    Parameters
    ----------
    arr
        Dask array whose elements can be converted with ``astype(str)``.

    Returns
    -------
    Array
        Zero-dimensional dask array holding the maximum length. Nothing is
        evaluated until the caller computes it.
    """

    def _block_lengths(block):
        # Cast so every block really has the dtype declared to map_blocks;
        # the previous int8 declaration could not represent lengths > 127
        # and did not match what np.char.str_len actually returns.
        lengths = np.char.str_len(block.astype(str))
        return lengths.astype(np.int64, copy=False)

    return arr.map_blocks(_block_lengths, dtype=np.int64).max()
100+
101+
102+
def _to_dict(df: DataFrame, dtype: Optional[Mapping[str, Any]] = None) -> dict:
    """Convert each column of ``df`` into a dask array with a concrete dtype.

    Parameters
    ----------
    df
        Dask dataframe to convert. Every column is turned into a dask array
        with known chunk lengths (``to_dask_array(lengths=True)``).
    dtype
        Optional mapping from column name to the dtype that column should be
        cast to. When omitted (or empty), each column keeps its own dtype.
        When given, it is indexed with every column name in ``df``.

    Returns
    -------
    dict
        Mapping of column name to dask array, in column order.

    Notes
    -----
    For string targets (NumPy kind ``'U'`` or ``'S'``) the maximum string
    length of the column is computed eagerly, one ``compute()`` per such
    column, so that the resulting array has a fixed-width string dtype.
    """
    arrs = {}
    for c in df:
        a = df[c].to_dask_array(lengths=True)
        dt = dtype[c] if dtype else df[c].dtype
        kind = np.dtype(dt).kind
        if kind in ("U", "S"):
            # Compute fixed-length string dtype for array.
            max_len = int(_max_str_len(a).compute())
            # A width of 0 ("U0"/"S0") means "flexible width" to NumPy rather
            # than a fixed width, so a column of only empty strings would not
            # get a concrete dtype; one character is the smallest real width.
            dt = f"{kind}{max(max_len, 1)}"
        arrs[c] = a.astype(dt)
    return arrs
100116

101117

102118
def read_fam(path: PathType, sep: str = " ") -> DataFrame:

0 commit comments

Comments
 (0)